def text_to_num(texts):
    """Encode documents as lists of 1-based integer token ids.

    Every document is first passed through ``clean`` (presumably a
    tokenizer returning a list of tokens -- confirm against its
    definition), then a ``Dictionary`` is built over the cleaned documents.

    Returns a 3-tuple ``(corpus, token2id, id2token)``:

    * ``corpus``   -- one list of ids per document; each id is the
      dictionary id **plus one**.
    * ``token2id`` -- the dictionary's own mapping, i.e. *without* the +1
      shift applied to ``corpus``.
    * ``id2token`` -- reverse mapping keyed by the shifted (id + 1) values,
      also stored on the dictionary as ``id2token``.
    """
    cleaned_docs = [clean(doc) for doc in texts]
    vocabulary = Dictionary(cleaned_docs)
    token_ids = vocabulary.token2id

    # Reverse lookup uses the same +1 offset as the encoded corpus below.
    vocabulary.id2token = {
        index + 1: token for token, index in token_ids.items()
    }

    encoded_docs = [
        [token_ids[token] + 1 for token in doc]
        for doc in cleaned_docs
    ]
    return encoded_docs, token_ids, vocabulary.id2token
def preprocess_corpora(corpora, stopwords, allowed_pos, max_doc=float('inf'),
                       no_above=0.5, no_below=1, keep_n=None):
    """Lemmatize, filter and index a collection of raw documents.

    Each non-None document is cleaned with ``clean_text``, lemmatized with
    ``lemmatize`` (restricted to ``allowed_pos``), passed through
    ``convert_compound`` and then filtered: stop words, words shorter than
    2 or longer than 15 characters, and words for which ``islower()`` is
    false are dropped. A ``Dictionary`` is built from the result and pruned
    with ``filter_extremes``.

    :param corpora: mapping whose ``items()`` yields ``(original_id, text)``
        pairs; entries whose text is ``None`` are skipped.
    :param stopwords: container of words to discard (membership-tested).
    :param allowed_pos: forwarded to ``lemmatize`` as ``allowed_tags``.
    :param max_doc: stop once more than this many entries have been
        visited. Skipped ``None`` entries count towards the limit.
    :param no_above: forwarded to ``Dictionary.filter_extremes``.
    :param no_below: forwarded to ``Dictionary.filter_extremes``.
    :param keep_n: forwarded to ``Dictionary.filter_extremes``.
    :rtype: gensim.corpora.dictionary.Dictionary
    :return: the dictionary, with three extra attributes attached:
        ``corpus_id2orig_id`` (original id of every kept document, in
        order), ``corpus`` (bag-of-words vector per kept document) and
        ``id2token`` (reverse of ``token2id``).
    """
    # Function-scope import: progress is written through sys.stdout so the
    # code parses on Python 3 as well as Python 2 (the former ``print``
    # statements were a SyntaxError on Python 3).
    import sys

    logging.info('Lemmatizing the corpora...')
    count = 0
    corpus_num = len(corpora)
    processed_corpora = []
    corpus_id2orig_id = []
    for index, corpus in corpora.items():
        count += 1
        if count > max_doc:
            break
        if corpus is None:  # skip if corpus is None
            continue
        # Carriage return keeps the progress counter on a single line.
        sys.stdout.write('\r%s / %s' % (count, corpus_num))
        sys.stdout.flush()

        cleaned_corpus = clean_text(corpus)  # delete irrelevant characters
        tokens = lemmatize(content=cleaned_corpus, allowed_tags=allowed_pos)
        # Tokens look like "<word>/<pos>". Split on the LAST slash only, so
        # a word that itself contains '/' no longer raises on unpacking.
        corpus = [token.rsplit('/', 1)[0] for token in tokens]

        # convert compound word into one token
        corpus = convert_compound(corpus)

        # filter stop words, long words, and non-english words
        corpus = [w for w in corpus
                  if w not in stopwords and 2 <= len(w) <= 15 and w.islower()]
        processed_corpora.append(corpus)
        corpus_id2orig_id.append(index)
    # Terminate the progress line and leave a blank line after it.
    sys.stdout.write('\n\n')

    logging.info('Creating dictionary and corpus...')
    dictionary = Dictionary(processed_corpora)
    dictionary.corpus_id2orig_id = corpus_id2orig_id

    logging.info('Filtering unimportant terms...')
    dictionary.filter_extremes(no_below=no_below, no_above=no_above,
                               keep_n=keep_n)
    dictionary.compactify()

    logging.info('Generating corpus...')
    dictionary.corpus = [dictionary.doc2bow(corpus)
                         for corpus in processed_corpora]
    dictionary.id2token = revdict(dictionary.token2id)

    return dictionary
def setUp(self):
    # Toy fixture used to exercise this module; the expected values in the
    # tests follow from the mathematical formulas in the modules under test.
    self.gamma = 1
    self.measure = 'nlr'
    self.topics = [np.array([1, 2])]

    # What s_one_set segmentation yields for the single topic above.
    self.segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]

    # Minimal dictionary carrying only the reverse (id -> token) mapping.
    toy_dictionary = Dictionary()
    toy_dictionary.id2token = {1: 'fake', 2: 'tokens'}

    # Build the accumulator, then overwrite its internal state directly so
    # no real corpus has to be processed.
    accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, toy_dictionary)
    accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
    accumulator._num_docs = 5
    self.accumulator = accumulator
def pytopia2gensimDict(dict_):
    '''
    Creates gensim dictionary from a pytopia dictionary.
    This is necessary since building of gensim models requires gensim
    dictionary but pytopia model builders must be able to receive generic
    pytopia Dictionary as parameter.

    :param dict_: pytopia dictionary (or anything ``resolve`` accepts);
        iterating it yields tokens and ``dict_[token]`` gives the index.
    :return: a ``GensimDict`` whose ``token2id`` / ``id2token`` mirror the
        pytopia dictionary. Document statistics are synthetic: every token
        gets document frequency 1 and the dictionary claims to have seen a
        single document containing each token once.
    '''
    dict_ = resolve(dict_)
    # (token, index) pairs ordered by index
    toki = sorted(((tok, dict_[tok]) for tok in dict_), key=lambda ti: ti[1])
    # directly set gensim dict data structures,
    # this works for gensim 0.12.4
    gdict = GensimDict()
    gdict.token2id = {tok: i for tok, i in toki}
    gdict.id2token = {i: tok for tok, i in toki}
    # gensim keys ``dfs`` by token *id* (id -> document frequency), not by
    # the token string; keying by string breaks any lookup by id.
    gdict.dfs = {i: 1 for _, i in toki}
    gdict.num_docs = 1  # number of documents processed
    gdict.num_pos = len(toki)  # total number of corpus positions
    gdict.num_nnz = len(toki)  # total number of non-zeroes in the BOW matrix
    return gdict
from gensim.models import CoherenceModel

# --- Mallet configuration -------------------------------------------------
path_to_mallet_binary = 'd:/mallet-2.0.8/bin/mallet'
output_path = 'd:/code/gc_text_analysis/mallet_output/'
num_topics = 140

# --- Train the Mallet LDA model and persist it ------------------------------
model = LdaMallet(
    path_to_mallet_binary,
    corpus=bow_docs,
    workers=4,
    iterations=2000,
    num_topics=num_topics,
    id2word=dictionary,
    prefix=output_path,
)
model.save('gc_lda_model.pkl')

# --- Word table built from the dictionary's ``dfs`` values -------------------
# Rebuild the reverse (id -> token) mapping from token2id before using it.
dictionary.id2token = {
    token_id: token for token, token_id in dictionary.token2id.items()
}
words_freq = [
    (dictionary.id2token[token_id], doc_count)
    for token_id, doc_count in dictionary.dfs.items()
]
# Highest count first; the sort is stable, so ties keep their original order.
words_freq.sort(key=lambda pair: pair[1], reverse=True)
words_freq = pd.DataFrame(words_freq, columns=['word', 'count'])

# --- Coherence score ('c_v') and the top words of every topic ---------------
coherence_model_lda = CoherenceModel(
    model=model,
    texts=ngram_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()

topics = model.show_topics(
    num_topics=num_topics,
    num_words=10,
    log=False,
    formatted=False,
)