import os
from operator import itemgetter

import dateutil.parser
import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel, Phrases
from gensim.models.wrappers import LdaMallet

# normalize_words, get_date and make_lda_model_name are assumed to be defined
# elsewhere in this module.


def load_lda_model(lda_model_name=None, mallet=False):
    """Load a saved LDA model (Mallet or gensim); return None if no model
    file exists at the given path."""
    if lda_model_name is not None and os.path.isfile(lda_model_name):
        if mallet:
            lda_model = LdaMallet.load(lda_model_name)
        else:
            lda_model = LdaModel.load(lda_model_name)
        return lda_model
    return None
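# Usage sketch (illustrative only; the model file name is hypothetical and
# must match whatever build_lda below saved):
#
#     lda_model = load_lda_model("lda_model_demo_10", mallet=False)
#     if lda_model is None:
#         pass  # no cached model on disk; build a fresh one with build_lda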
def build_lda(text_corpus=None, dictionary=None, n_topics=10, mallet=True,
              dataname="none"):
    """
    Builds an LDA model (Mallet or gensim) from a corpus and saves it.

    :param text_corpus: corpus in bag-of-words format (as produced by
        Dictionary.doc2bow), despite the parameter name
    :param dictionary: dictionary mapping tokens to ids
    :param n_topics: number of topics
    :param mallet: use the Mallet LDA wrapper instead of gensim's LdaModel
    :param dataname: basename of the LDA model
    :return: the name of the saved LDA model
    """
    if mallet:
        # The path to the Mallet binary is taken from the MALLETPATH
        # environment variable.
        mallet_path = os.environ.get("MALLETPATH")
        lda_model = LdaMallet(mallet_path, corpus=text_corpus,
                              num_topics=n_topics, id2word=dictionary,
                              workers=4, optimize_interval=10,
                              iterations=1000,
                              prefix=os.path.join(os.getcwd(), 'mallet/'))
    else:
        lda_model = LdaModel(text_corpus, id2word=dictionary,
                             num_topics=n_topics, distributed=False,
                             chunksize=2000, passes=5, update_every=10,
                             alpha='asymmetric', eta=0.1, decay=0.5,
                             eval_every=10, iterations=1000,
                             gamma_threshold=0.001)
    lda_model_name = make_lda_model_name(dataname, n_topics=n_topics,
                                         mallet=mallet)
    lda_model.save(lda_model_name)
    return lda_model_name
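# Minimal end-to-end sketch of build_lda (a toy example, not part of the
# original pipeline): the documents and dataname are invented, the gensim
# branch is used so no Mallet installation is required, and make_lda_model_name
# is assumed to be available as noted above.
def _demo_build_lda():
    docs = [["topic", "model", "corpus"],
            ["corpus", "dictionary", "token"],
            ["model", "token", "topic"]]
    dictionary = Dictionary(docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    lda_model_name = build_lda(text_corpus=bow_corpus, dictionary=dictionary,
                               n_topics=2, mallet=False, dataname="demo")
    return load_lda_model(lda_model_name, mallet=False)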
def calculate_lda(dataset_raw, n_topics=10, lda_model_name="", mallet=True,
                  mallet_path="/Users/verasazonova/no-backup/JARS/mallet-2.0.7/bin/mallet",
                  dataname="none"):
    with open(dataname + "_log.txt", 'a') as fout:
        # Separate dates from texts and normalize the tokens.
        if dataset_raw.include_date:
            dates = [text[1] for text in dataset_raw]
            dataset = [normalize_words(text[0].split(), dataset_raw.stoplist)
                       for text in dataset_raw]
        else:
            dates = ["" for _ in dataset_raw]
            dataset = dataset_raw

        # Merge frequent bigrams, then build the dictionary and BOW corpus.
        bi_grams = Phrases(dataset, threshold=3)
        dataset = bi_grams[dataset]
        dictionary = Dictionary(dataset)
        dictionary.filter_extremes(no_below=1, no_above=0.9)
        bow_corpus = [dictionary.doc2bow(text) for text in dataset]

        fout.write("# Topics: %s\n" % n_topics)

        # Build and save the LDA model, or load a previously saved one.
        if not os.path.isfile(lda_model_name):
            if mallet:
                lda_model = LdaMallet(mallet_path, corpus=bow_corpus,
                                      num_topics=n_topics, id2word=dictionary,
                                      workers=4, optimize_interval=10,
                                      iterations=1000)
                lda_model_name = "lda_model_mallet_%s_%i" % (dataname, n_topics)
            else:
                lda_model = LdaModel(bow_corpus, id2word=dictionary,
                                     num_topics=n_topics, distributed=False,
                                     chunksize=2000, passes=5, update_every=10,
                                     alpha='asymmetric', eta=0.1, decay=0.5,
                                     eval_every=10, iterations=1000,
                                     gamma_threshold=0.001)
                lda_model_name = "lda_model_%s_%i" % (dataname, n_topics)
            lda_model.save(lda_model_name)
        else:
            if mallet:
                lda_model = LdaMallet.load(lda_model_name)
            else:
                lda_model = LdaModel.load(lda_model_name)

        # Log each topic's words with their document frequencies.
        # (show_topics with formatted=False returns (weight, word) pairs in
        # the older gensim API this code was written against.)
        topic_definition = []
        for i, topic in enumerate(lda_model.show_topics(n_topics, num_words=20,
                                                        formatted=False)):
            fout.write("%i \n" % i)
            topic_list = [tup[1] for tup in topic]
            freq_list = [dictionary.dfs[dictionary.token2id[word]]
                         for word in topic_list]
            fout.write("%s\n\n" % repr(sorted(zip(topic_list, freq_list),
                                              key=itemgetter(1))))
            topic_definition.append("%i, %s" % (i, " ".join(sorted(topic_list))))
        fout.write("Total number of documents: %i\n" % dictionary.num_docs)

        # Keep only documents dated after the earliest date, sorted by date.
        earliest_date = dateutil.parser.parse("Sun Jun 08 00:00:00 +0000 2014")
        a = [tup for tup in sorted(zip(bow_corpus, dates), key=get_date)
             if dateutil.parser.parse(tup[1]) > earliest_date]
        print(len(a))
        print(a[-1])
        latest_date = dateutil.parser.parse(a[-1][1])

        # Divide the covered time span into equal-width bins.
        num_bins = 100
        time_span = latest_date - earliest_date
        print(time_span)
        time_bin = time_span / num_bins
        print(time_bin)
        bin_lows = [earliest_date]
        bin_high = earliest_date + time_bin

        # Accumulate per-bin topic weight: each document adds its topic
        # probabilities to the bin that covers its date.
        counts = [[0 for _ in range(n_topics)] for _ in range(num_bins + 1)]
        i = 0
        for text in a:
            topic_assignments = lda_model[text[0]]
            date_str = text[1]
            if date_str is not None:
                cur_date = dateutil.parser.parse(date_str)
                # Advance the bin boundary, stepping over empty bins, until
                # the document's date falls inside the current bin.
                while cur_date >= bin_high:
                    i += 1
                    bin_lows.append(bin_high)
                    bin_high = bin_lows[-1] + time_bin
            # Alternative: hard-assign each document to its dominant topic.
            # counts[i][max(topic_assignments, key=itemgetter(1))[0]] += 1
            for tup in topic_assignments:
                counts[i][tup[0]] += tup[1]

        fout.write("Topic weight accumulated per time bin: \n")
        fout.write("%s\n" % counts)

    a = 1. * np.array(counts)
    np.savetxt("mpeketoni_cnts.txt", a)
    with open("mpeketoni_bins.txt", 'w') as fbins:
        for date in bin_lows:
            fbins.write("%s\n" % date)
    with open("mpeketoni_labels.txt", 'w') as flabels:
        for label in topic_definition:
            flabels.write("%s\n" % label)
    return a, bin_lows, topic_definition
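# Invocation sketch (illustrative): dataset_raw is expected to be an iterable
# of (text, date_string) pairs exposing include_date and stoplist attributes,
# as constructed elsewhere in this project; the argument values shown here
# are hypothetical.
#
#     counts, bin_lows, topic_definition = calculate_lda(
#         dataset_raw, n_topics=10, mallet=False, dataname="mpeketoni")
#     # counts: (num_bins + 1) x n_topics array of topic weight per time bin
#     # bin_lows: datetime lower bound of each bin
#     # topic_definition: "topic_id, sorted topic words" strings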