def create_dictionary(self):
    """
    Utility method to generate gensim-style Dictionary directly from the corpus
    and vocabulary data.

    The id -> token mapping, `num_docs` and `num_nnz` are taken from this
    corpus object; `dfs` and `num_pos` are accumulated by iterating once over
    every document of the corpus.

    Returns the populated `Dictionary` instance.
    """
    dictionary = Dictionary()

    # replace dfs with defaultdict to avoid downstream KeyErrors
    # uci vocabularies may contain terms that are not used in the document data
    dictionary.dfs = defaultdict(int)

    # id2word is assigned by reference (no copy is made); token2id is its inverse
    dictionary.id2token = self.id2word
    dictionary.token2id = dict((v, k) for k, v in iteritems(self.id2word))

    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            # hand the arguments to the logger instead of pre-formatting, so the
            # message is only interpolated when the record is actually emitted
            logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

        for word, count in doc:
            # every (word, count) entry of a document adds one to that word's
            # document frequency and `count` to the total number of positions
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
def _load_gensim_dictionary(self):
    """
    Build a gensim dictionary object from the tokens in `self.dic_tokens`.

    As a side effect, `self._index2id` is reset and refilled with a mapping
    from each token's `index` to its `id`.

    The corpus-level counters (`num_docs`, `num_pos`, `num_nnz`) are copied
    from this object; `token2id` maps token text to token index, and `dfs`
    maps token index to the token's stored document frequency.

    Returns the populated `GensimDictionary` instance.
    """
    # start from an empty index -> id mapping on every call
    self._index2id = {}

    gensim_dict = GensimDictionary()
    gensim_dict.num_docs = self.num_docs
    gensim_dict.num_pos = self.num_pos
    gensim_dict.num_nnz = self.num_nnz

    for dic_token in self.dic_tokens.all():
        self._index2id[dic_token.index] = dic_token.id
        gensim_dict.token2id[dic_token.text] = dic_token.index
        gensim_dict.dfs[dic_token.index] = dic_token.document_frequency

    # hand the argument to the logger instead of pre-formatting, so the
    # message is only interpolated when the record is actually emitted
    logger.info("Dictionary contains %d tokens", len(gensim_dict.token2id))
    return gensim_dict
def create_dictionary(self):
    """Build a :class:`gensim.corpora.dictionary.Dictionary` from this corpus and its vocabulary.

    The id/token mappings and the document and non-zero counts are taken from the
    corpus attributes; document frequencies and the total position count are
    accumulated in a single pass over the documents.

    Returns
    -------
    :class:`gensim.corpora.dictionary.Dictionary`
        Dictionary, based on corpus.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.ucicorpus import UciCorpus
        >>> from gensim.test.utils import datapath
        >>> ucc = UciCorpus(datapath('testcorpus.uci'))
        >>> dictionary = ucc.create_dictionary()

    """
    result = Dictionary()

    # The vocabulary may list terms that never occur in the document data, so a
    # defaultdict is used for the document frequencies to avoid KeyErrors downstream.
    doc_freqs = defaultdict(int)
    result.dfs = doc_freqs

    result.num_nnz = self.num_nnz
    result.num_docs = self.num_docs
    result.id2token = self.id2word
    result.token2id = utils.revdict(self.id2word)

    for position, document in enumerate(self):
        if not position % 10000:
            logger.info('PROGRESS: processing document %i of %i', position, self.num_docs)
        for term_id, frequency in document:
            doc_freqs[term_id] += 1
            result.num_pos += frequency

    return result