"can decide to use numbers as well since rent and dates etc get deleted" #remarks=[re.sub("[^A-Za-z0-9\s]","",x) for x in remarks] "converting sentences to list of words" fin=[] for i in remarks: f1=[] for j in i.split(): f1.append(j) fin.append(f1) from gensim.corpora.dictionary import Dictionary dictionary = Dictionary(fin) dict_fin=dict([(x[0],x[1]) for x in dictionary.iteritems()]) dict_fin[1]#also rev_dict_fin=dict([(x[1],x[0]) for x in dictionary.iteritems()]) rev_dict_fin["also"]#1 #lots fo spelling mistakes, we can use character level model!! from gensim.models import Word2Vec model=Word2Vec(fin,min_count=1) #model["also"] model.most_similar("nyasa") len(list(model.wv.vocab)) model.wv.get_vector("also") """