def init_dictionary(self, save=True):
    """Build the token dictionary from the gzipped corpus file.

    The corpus is read line by line: every non-empty line is split on single
    spaces into tokens, and an empty line marks the end of a document.
    Collection frequencies, document frequencies and corpus totals are
    accumulated and stored on a fresh ``GensimDictionary`` instance. Token
    ids are assigned in order of decreasing collection frequency.

    The corpus file name is taken from ``self.params['dictionary__corpus_file']``,
    then ``self.params['corpus_file']``, falling back to ``'sentences.txt.gz'``;
    it is appended to ``self.path``.

    Args:
        save: When true, the dictionary is written to
            ``self.path + 'dictionary.pkl'`` through its ``save`` method.

    Side effects:
        Sets ``self.dictionary`` to the newly built dictionary.
    """
    import gzip
    from collections import Counter

    corpus_file = (
        self.params.get('dictionary__corpus_file')
        or self.params.get('corpus_file')
        or 'sentences.txt.gz'
    )

    num_docs = 0
    num_pos = 0
    num_nnz = 0
    cfs = Counter()  # token -> number of occurrences in the whole corpus
    dfs = Counter()  # token -> number of documents containing the token
    doc_tokens = set()  # distinct tokens of the document being read

    # The gzip handle is managed by ``with`` so it is always closed; closing
    # only the tqdm wrapper (as the previous version did) leaves it open.
    with gzip.open(self.path + corpus_file, 'rt', encoding='utf8') as raw:
        progress = tqdm(raw, 'dictionary', self.sentences_cnt)
        try:
            for line in progress:
                line = line.strip()
                if not line:
                    # An empty line terminates the current document.
                    dfs.update(doc_tokens)
                    num_nnz += len(doc_tokens)
                    num_docs += 1
                    doc_tokens = set()
                    continue
                # NOTE(review): splitting on a single space yields empty
                # tokens if the corpus ever contains consecutive spaces.
                tokens = line.split(' ')
                cfs.update(tokens)
                num_pos += len(tokens)
                doc_tokens.update(tokens)
        finally:
            progress.close()

    # A corpus that does not end with an empty line still has a pending
    # document; account for it so the last document is not dropped.
    if doc_tokens:
        dfs.update(doc_tokens)
        num_nnz += len(doc_tokens)
        num_docs += 1

    # Most frequent token gets id 0, the next one id 1, and so on.
    token2id = {tok: idx for idx, (tok, _) in enumerate(cfs.most_common())}

    dictionary = GensimDictionary()
    dictionary.num_pos = num_pos
    dictionary.num_nnz = num_nnz
    dictionary.num_docs = num_docs
    dictionary.token2id = token2id
    # Frequencies are stored keyed by token id, not by token string.
    for tok, idx in token2id.items():
        dictionary.cfs[idx] = cfs[tok]
        dictionary.dfs[idx] = dfs[tok]

    if save:
        dictionary.save(self.path + 'dictionary.pkl')
    self.dictionary = dictionary
def pytopia2gensimDict(dict_):
    '''
    Creates gensim dictionary from a pytopia dictionary.
    This is necessary since building of gensim models requires gensim dictionary
    but pytopia model builders must be able to receive generic pytopia Dictionary as parameter.

    :param dict_: pytopia dictionary, passed through resolve() before use;
        iterating it must yield tokens and dict_[token] must give the token's index.
    :return: GensimDict whose token2id, id2token, dfs and corpus counters
        are filled in directly from dict_.
    '''
    dict_ = resolve(dict_)
    # sort dictionary tokens by index
    toki = sorted(((tok, dict_[tok]) for tok in dict_), key=lambda ti: ti[1])
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    # gensim keys document frequencies by token id, not by token string;
    # there is no real corpus here, so every token gets a frequency of 1
    gdict.dfs = {i: 1 for _, i in toki}
    gdict.num_docs = 1  # number of documents processed
    gdict.num_pos = len(toki)  # total number of corpus positions
    gdict.num_nnz = len(toki)  # total number of non-zeroes in the BOW matrix
    return gdict