def create_dictionary(self):
    """Utility method to generate a gensim-style Dictionary directly from the corpus and vocabulary data."""
    dictionary = Dictionary()

    # replace dfs with defaultdict to avoid downstream KeyErrors
    # uci vocabularies may contain terms that are not used in the document data
    dictionary.dfs = defaultdict(int)

    dictionary.id2token = self.id2word
    dictionary.token2id = utils.revdict(self.id2word)

    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

        for word, count in doc:
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
def load_word_topics(self):
    """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

    Returns
    -------
    numpy.ndarray
        Matrix words X topics.

    """
    logger.info("loading assigned topics from %s", self.fstate())
    word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
    if hasattr(self.id2word, 'token2id'):
        word2id = self.id2word.token2id
    else:
        word2id = revdict(self.id2word)

    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # noqa:F841 beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split(" ")
            if token not in word2id:
                continue
            tokenid = word2id[token]
            word_topics[int(topic), tokenid] += 1.0
    return word_topics
def load_word_topics(self):
    logger.info("loading assigned topics from %s", self.fstate())
    word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
    if hasattr(self.id2word, 'token2id'):
        word2id = self.id2word.token2id
    else:
        word2id = revdict(self.id2word)

    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # noqa:F841 beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split(" ")
            if token not in word2id:
                continue
            tokenid = word2id[token]
            word_topics[int(topic), tokenid] += 1.0
    return word_topics
def load_word_topics(self):
    """Load words X topics matrix from :meth:`gensim.models.wrappers.ldamallet.LdaMallet.fstate` file.

    Returns
    -------
    numpy.ndarray
        Matrix words X topics.

    """
    logger.info("loading assigned topics from %s", self.fstate())
    word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float64)
    if hasattr(self.id2word, 'token2id'):
        word2id = self.id2word.token2id
    else:
        word2id = revdict(self.id2word)

    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.fromiter(next(fin).split()[2:], dtype=float)
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # noqa:F841 beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split(" ")
            if token not in word2id:
                continue
            tokenid = word2id[token]
            word_topics[int(topic), tokenid] += 1.0
    return word_topics
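# The matrix returned by load_word_topics holds raw token-to-topic assignment counts.
# A minimal numpy-only sketch (toy numbers, not part of the original snippets) of
# turning such counts into per-topic word distributions:
import numpy

word_topics = numpy.array([[2.0, 1.0, 0.0],
                           [0.0, 3.0, 1.0]])  # 2 topics x 3 terms, toy counts

row_sums = word_topics.sum(axis=1, keepdims=True)
topic_word = word_topics / numpy.where(row_sums == 0.0, 1.0, row_sums)
print(topic_word)  # each row sums to 1.0 and reads as p(word | topic)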
def reverse(self, documents_idx, unknown_word="UNKNOWN"):
    if len(self.id2token) != len(self.token2id):
        self.id2token = utils.revdict(self.token2id)
    rst = []
    for doc_idx in documents_idx:
        rst_i = []
        for idx in doc_idx:
            rst_i.append(self.id2token.get(idx, unknown_word))
        rst.append(rst_i)
    return rst
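# A standalone sketch of the same id -> token round trip, assuming only a plain
# gensim Dictionary (the reverse() method above belongs to a custom subclass):
from gensim import utils
from gensim.corpora import Dictionary

docs = [["human", "interface", "computer"], ["survey", "user", "computer"]]
dictionary = Dictionary(docs)

id2token = utils.revdict(dictionary.token2id)     # invert token -> id into id -> token
ids = [dictionary.token2id[w] for w in docs[0]]
print([id2token.get(i, "UNKNOWN") for i in ids])  # ['human', 'interface', 'computer']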
def saveLDACorpus(train_data_path, test_data_path, model_file, dictionary_file, corpus_file):
    """Score a corpus with a trained LDA model and write train/test splits to CSV."""
    lda = LdaModel.load(model_file)
    dictionary = Dictionary.load_from_text(dictionary_file)
    dictionary.id2token = utils.revdict(dictionary.token2id)
    src_df = pd.read_csv(corpus_file)
    src_df = parallelize(src_df, data_fram_proc1, dictionary, lda)  # add LDA features
    train_data, test_data = train_test_split(src_df[['label', 'multiLabels', 'item']], test_size=0.2, random_state=42)
    train_data.to_csv(train_data_path, index=None)  # , header=None
    test_data.to_csv(test_data_path, index=None)  # , header=None
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'), no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize, clean and filter the given corpora, then build a dictionary and BOW corpus.

    :rtype : gensim.corpora.dictionary.Dictionary
    :param corpora: mapping of document id -> raw document text
    :param stopwords: words to filter out
    :param allowed_pos: regex of POS tags kept by the lemmatizer
    :param max_doc: maximum number of documents to process
    :return: dictionary with `corpus`, `id2token` and `corpus_id2orig_id` attached
    """
    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []

    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue

        print '\r', count, '/', corpus_num,
        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        corpus = []
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        for token in tokens:
            word, pos = token.split('/')
            corpus.append(word)

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)

    print '\n'

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus) for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
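# A small self-contained sketch of the dictionary/BOW steps used above
# (filter_extremes, compactify, doc2bow, revdict) on toy data; the texts are
# illustrative, not from the original corpus:
from gensim import utils
from gensim.corpora import Dictionary

texts = [["cat", "dog", "cat"], ["dog", "bird"], ["cat", "bird", "fish"]]
dictionary = Dictionary(texts)
dictionary.filter_extremes(no_below=1, no_above=0.9, keep_n=None)
dictionary.compactify()

bow_corpus = [dictionary.doc2bow(text) for text in texts]
id2token = utils.revdict(dictionary.token2id)
print(bow_corpus[0])  # list of (token id, count) pairs for the first document
print(id2token)       # id -> token mapping for readable output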
def create_dictionary(self):
    """Generate :class:`gensim.corpora.dictionary.Dictionary` directly from the corpus and vocabulary data.

    Returns
    -------
    :class:`gensim.corpora.dictionary.Dictionary`
        Dictionary, based on corpus.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.corpora.ucicorpus import UciCorpus
        >>> from gensim.test.utils import datapath
        >>>
        >>> ucc = UciCorpus(datapath('testcorpus.uci'))
        >>> dictionary = ucc.create_dictionary()

    """
    dictionary = Dictionary()

    # replace dfs with defaultdict to avoid downstream KeyErrors
    # uci vocabularies may contain terms that are not used in the document data
    dictionary.dfs = defaultdict(int)

    dictionary.id2token = self.id2word
    dictionary.token2id = utils.revdict(self.id2word)

    dictionary.num_docs = self.num_docs
    dictionary.num_nnz = self.num_nnz

    for docno, doc in enumerate(self):
        if docno % 10000 == 0:
            logger.info('PROGRESS: processing document %i of %i', docno, self.num_docs)

        for word, count in doc:
            dictionary.dfs[word] += 1
            dictionary.num_pos += count

    return dictionary
def load_word_topics(self):
    logger.info("loading assigned topics from %s", self.fstate())
    word_topics = numpy.zeros((self.num_topics, self.num_terms), dtype=numpy.float32)
    if hasattr(self.id2word, 'token2id'):
        word2id = self.id2word.token2id
    else:
        word2id = revdict(self.id2word)

    with utils.smart_open(self.fstate()) as fin:
        _ = next(fin)  # header
        self.alpha = numpy.array([float(val) for val in next(fin).split()[2:]])
        assert len(self.alpha) == self.num_topics, "mismatch between MALLET vs. requested topics"
        _ = next(fin)  # beta
        for lineno, line in enumerate(fin):
            line = utils.to_unicode(line)
            doc, source, pos, typeindex, token, topic = line.split(" ")
            if token not in word2id:
                continue
            tokenid = word2id[token]
            word_topics[int(topic), tokenid] += 1.0
    return word_topics
def __getitem__(self, tokenid):
    """Get the string token that corresponds to `tokenid`.

    Parameters
    ----------
    tokenid : int
        Id of token.

    Returns
    -------
    str
        Token corresponding to `tokenid`.

    Raises
    ------
    KeyError
        If this Dictionary doesn't contain such `tokenid`.

    """
    if len(self.id2token) != len(self.token2id):
        # the word->id mapping has changed (presumably via add_documents);
        # recompute id->word accordingly
        self.id2token = utils.revdict(self.token2id)
    return self.id2token[tokenid]  # will throw for non-existent ids
def __getitem__(self, tokenid):
    """Get token by provided `tokenid`.

    Parameters
    ----------
    tokenid : int
        Id of token.

    Returns
    -------
    str
        Token corresponding to `tokenid`.

    Raises
    ------
    KeyError
        If `tokenid` isn't contained in :class:`~gensim.corpora.dictionary.Dictionary`.

    """
    if len(self.id2token) != len(self.token2id):
        # the word->id mapping has changed (presumably via add_documents);
        # recompute id->word accordingly
        self.id2token = utils.revdict(self.token2id)
    return self.id2token[tokenid]  # will throw for non-existent ids
def id2word(self, val):
    self._id2word = val
    self.word2id = utils.revdict(val)
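# The setter above keeps word2id in sync by inverting the id -> word mapping.
# A minimal sketch of what gensim's utils.revdict does (it simply swaps keys and values):
from gensim import utils

id2word = {0: 'cat', 1: 'dog', 2: 'bird'}
word2id = utils.revdict(id2word)
print(word2id)  # {'cat': 0, 'dog': 1, 'bird': 2}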
if 'word2vec' in program:
    if os.path.exists(outf('w2v')):
        logger.info("word2vec model found, loading")
        model = utils.unpickle(outf('w2v'))
    else:
        logger.info("word2vec model not found, creating")
        if NEGATIVE:
            model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS, hs=0, negative=NEGATIVE)
        else:
            model = gensim.models.Word2Vec(size=DIM, min_count=0, window=WINDOW, workers=WORKERS)
        model.build_vocab(corpus())
        model.train(corpus())  # train with 1 epoch
        model.init_sims(replace=True)
        model.word2id = dict((w, v.index) for w, v in model.vocab.iteritems())
        model.id2word = utils.revdict(model.word2id)
        model.word_vectors = model.syn0norm
        utils.pickle(model, outf('w2v'))

if 'glove' in program:
    if os.path.exists(outf('glove')):
        logger.info("glove model found, loading")
        model = utils.unpickle(outf('glove'))
    else:
        if os.path.exists(outf('glove_corpus')):
            logger.info("glove corpus matrix found, loading")
            cooccur = utils.unpickle(outf('glove_corpus'))
        else:
            logger.info("glove corpus matrix not found, creating")
            cooccur = glove.Corpus(dictionary=word2id)
            cooccur.fit(corpus(), window=WINDOW)
def show_documents_bow(corpus):
    st.markdown("Bag-of-words representation of the documents:")
    tcid = utils.revdict(corpus.dictionary.token2id)
    st.dataframe([[(tcid[t], w) for (t, w) in doc] for doc in corpus.bow()])
def __getitem__(self, tokenid):
    if len(self.id2token) != len(self.token2id):
        # the word->id mapping has changed (presumably via add_documents);
        # recompute id->word accordingly
        self.id2token = utils.revdict(self.token2id)
    return self.id2token[tokenid]  # will throw for non-existent ids
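# A short usage sketch for the __getitem__ variants above, using a freshly built
# gensim Dictionary (id assignment order may differ between gensim versions):
from gensim.corpora import Dictionary

dct = Dictionary([["human", "interface", "computer"]])
print(dct[0])      # string token for id 0
# dct[100]         # would raise KeyError for an id not in the dictionary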
        word, pos = token.split('/')
        document.append(word)

    # convert compound word into one token
    document = convert_compound(document)

    # filter stop words, long words, and non-english words
    document = [w for w in document if w not in stop_words and 2 <= len(w) <= 15 and w.islower()]
    new_documents.append(document)
    titles.append(index)
    froms.append(from_name)
    dates.append(date)

print '\n'

logging.info('create dictionary and corpus...')
dictionary = corpora.Dictionary(new_documents)
dictionary.docid2title = titles
dictionary.docid2from = froms
dictionary.docid2date = dates

logging.info('filter unimportant words...')
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
dictionary.compactify()

logging.info('generate corpus...')
dictionary.corpus = [dictionary.doc2bow(document) for document in new_documents]
dictionary.id2token = revdict(dictionary.token2id)
dictionary.save('data/dictionary/report_' + allowed_pos.pattern + '.dict')
def contexts(document, window):
    assert window % 2 == 0
    half_window = window // 2  # integer division so the slicing indices stay ints
    for i in range(half_window, len(document) - window):
        yield document[i:i + half_window], document[i + half_window], document[i + half_window + 1: i + window + 1]


if __name__ == "__main__":
    frame = pd.read_csv("/Users/vitillo/Downloads/labeledTrainData.tsv", sep='\t')
    sentences = extract_sentences(frame["review"])

    dictionary = corpora.Dictionary(sentences)
    dictionary.filter_extremes(no_below=5, no_above=0.6, keep_n=1000)
    rev_dictionary = revdict(dictionary)

    with open('data.csv', 'w') as f:
        for sentence in sentences:
            document = [rev_dictionary[w] for w in sentence if w in rev_dictionary]
            for context in contexts(document, 4):
                prefix, word, postfix = context
                f.write(",".join(map(str, prefix + postfix + [word])))
                f.write("\n")

    dictionary.save_as_text("dictionary.tsv")
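# A toy invocation of the contexts() generator above (illustrative ids, not from the
# original data): with window=4 each yield is (prefix of 2 ids, center id, postfix of 2 ids).
doc_ids = [0, 1, 2, 3, 4, 5, 6, 7]
for prefix, word, postfix in contexts(doc_ids, 4):
    print(prefix, word, postfix)
# [2, 3] 4 [5, 6]
# [3, 4] 5 [6, 7]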
def __init__(self, corpus=None, num_topics=100, id2word=None,
             distributed=False, chunksize=2000, passes=1, update_every=1,
             alpha='symmetric', eta=None, decay=0.5, offset=1.0,
             eval_every=10, iterations=50, gamma_threshold=0.001,
             minimum_probability=0.01, defined_kws={}, tfMod=None):
    """
    If given, start training from the iterable `corpus` straight away. If not given,
    the model is left untrained (presumably because you want to call `update()` manually).

    `num_topics` is the number of requested latent topics to be extracted from
    the training corpus.

    `id2word` is a mapping from word ids (integers) to words (strings). It is
    used to determine the vocabulary size, as well as for debugging and topic printing.

    `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic
    (theta) and topic-word (lambda) distributions. Both default to a symmetric
    1.0/num_topics prior.

    `alpha` can be set to an explicit array = prior of your choice. It also supports
    special values of 'asymmetric' and 'auto': the former uses a fixed normalized
    asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly
    from your data.

    `eta` can be a scalar for a symmetric prior over topic/word distributions,
    or a matrix of shape num_topics x num_words, which can be used to impose
    asymmetric priors over the word distribution on a per-topic basis. This may
    be useful if you want to seed certain topics with particular words by boosting
    the priors for those words. It also supports the special value 'auto', which
    learns an asymmetric prior directly from your data.

    Turn on `distributed` to force distributed computing (see the
    `web tutorial <http://radimrehurek.com/gensim/distributed.html>`_ on how
    to set up a cluster of machines for gensim).

    Calculate and log perplexity estimate from the latest mini-batch every
    `eval_every` model updates (setting this to 1 slows down training ~2x;
    default is 10 for better performance). Set to None to disable perplexity estimation.

    `decay` and `offset` parameters are the same as Kappa and Tau_0 in
    Hoffman et al, respectively.

    `minimum_probability` controls filtering the topics returned for a document (bow).

    Example:

    >>> lda = LdaModel(corpus, num_topics=100)  # train model
    >>> print(lda[doc_bow])  # get topic probability distribution for a document
    >>> lda.update(corpus2)  # update the LDA model with additional documents
    >>> print(lda[doc_bow])

    >>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5)  # train asymmetric alpha from data

    """
    # store user-supplied parameters
    self.id2word = id2word
    if corpus is None and self.id2word is None:
        raise ValueError(
            'at least one of corpus/id2word must be specified, to establish input space dimensionality'
        )

    if self.id2word is None:
        logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
        self.id2word = utils.dict_from_corpus(corpus)
        self.num_terms = len(self.id2word)
    elif len(self.id2word) > 0:
        self.num_terms = 1 + max(self.id2word.keys())
    else:
        self.num_terms = 0

    if self.num_terms == 0:
        raise ValueError("cannot compute LDA over an empty collection (no terms)")

    self.distributed = bool(distributed)
    self.num_topics = int(num_topics)
    self.chunksize = chunksize
    self.decay = decay
    self.offset = offset
    self.minimum_probability = minimum_probability
    self.num_updates = 0

    self.passes = passes
    self.update_every = update_every
    self.eval_every = eval_every

    self.alpha, self.optimize_alpha = self.init_dir_prior(alpha, 'alpha')
    assert self.alpha.shape == (num_topics,), \
        "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics)

    self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')
    assert self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms), (
        "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
        (str(self.eta.shape), num_topics, num_topics, self.num_terms))

    # VB constants
    self.iterations = iterations
    self.gamma_threshold = gamma_threshold

    # set up distributed environment if necessary
    if not distributed:
        logger.info("using serial LDA version on this node")
        self.dispatcher = None
        self.numworkers = 1
    else:
        if self.optimize_alpha:
            raise NotImplementedError("auto-optimizing alpha not implemented in distributed LDA")
        # set up distributed version
        try:
            import Pyro4
            dispatcher = Pyro4.Proxy('PYRONAME:gensim.lda_dispatcher')
            logger.debug("looking for dispatcher at %s" % str(dispatcher._pyroUri))
            dispatcher.initialize(
                id2word=self.id2word, num_topics=num_topics, chunksize=chunksize,
                alpha=alpha, eta=eta, distributed=False)
            self.dispatcher = dispatcher
            self.numworkers = len(dispatcher.getworkers())
            logger.info("using distributed version with %i workers" % self.numworkers)
        except Exception as err:
            logger.error("failed to initialize distributed LDA (%s)", err)
            raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

    # Initialize the variational distribution q(beta|lambda)
    self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
    self.state.sstats = numpy.random.gamma(100., 1. / 100., (self.num_topics, self.num_terms))

    # reassign word/topic for specific words
    self.word2id = utils.revdict(self.id2word)
    sstats = self.state.sstats
    self.defined_kws = defined_kws
    self.defined_wordids = {}
    for w, t in defined_kws.iteritems():
        if w in self.word2id:
            wid = self.word2id[w]
            self.defined_wordids[wid] = numpy.array(list(t))

    for wid, t in self.defined_wordids.iteritems():
        sstats[:, wid] = numpy.random.gamma(0.1, 0.05, (self.num_topics,))

    for wid, topics in self.defined_wordids.iteritems():
        if tfMod is not None:
            score = self.num_topics * tfMod.idfs.get(wid, 1.0)
        else:
            score = self.num_topics
        if topics.shape[0] > 1:
            # score = self.num_topics / math.log(len(topics) + 1)
            score = 0.4 * self.num_topics
        # print 'score :{}'.format(score)
        # print self.num_topics
        # print topics
        # for t in topics:
        sstats[topics, wid] = score

    self.expElogbeta = numpy.exp(dirichlet_expectation(self.state.sstats))

    # if a training corpus was provided, start estimating the model right away
    if corpus is not None:
        use_numpy = self.dispatcher is not None
        self.update(corpus, chunks_as_numpy=use_numpy)
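# A numpy-only sketch of the seeding idea in the __init__ above (toy numbers, hypothetical
# word/topic ids, not the author's exact procedure): suppressing a seed word's
# sufficient-statistics column everywhere and then boosting it for chosen topics makes that
# word dominate those topics after normalization.
import numpy

num_topics, num_terms = 4, 6
sstats = numpy.random.gamma(100., 1. / 100., (num_topics, num_terms))

seed_word_id = 2
seed_topics = numpy.array([1])
sstats[:, seed_word_id] = numpy.random.gamma(0.1, 0.05, (num_topics,))  # suppress everywhere
sstats[seed_topics, seed_word_id] = num_topics                          # boost in the seeded topic

topic_word = sstats / sstats.sum(axis=1, keepdims=True)
print(topic_word[:, seed_word_id])  # the seeded topic carries most of this word's probability mass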
outf = lambda prefix: os.path.join(output_dir, prefix)
logger.info("output file template will be %s" % outf('PREFIX'))

sentences = MyCorpus(corpus_path)

if os.path.exists(outf('word2id')):
    logger.info("dictionary found, loading")
    word2id = utils.unpickle(outf('word2id'))
else:
    logger.info("dictionary not found, creating")
    id2word = corpora.Dictionary(sentences, prune_at=10000000)
    id2word.filter_extremes(keep_n=TOKEN_LIMIT)  # filter out too freq/infreq words
    word2id = dict((v, k) for k, v in id2word.iteritems())
    utils.pickle(word2id, outf('word2id'))
id2word = utils.revdict(word2id)

# Filter all wiki documents to contain only those words.
corpus = lambda: ([word for word in sentence if word in word2id] for sentence in sentences)

if os.path.exists(outf('kw2v_%s' % GAMMA)):
    logger.info("Kernel word2vec model found, loading")
    # model = utils.unpickle(outf('kw2v'))
    model = Word2Vec.load_word2vec_format(outf('kw2v_%s' % GAMMA), binary=True)
else:
    logger.info("Kernel word2vec model not found, creating")
    if NEGATIVE:
        model = Word2Vec(size=DIM, gamma=GAMMA,