def test_lda_params(self):
    n_topics = 10
    model1 = guidedlda.GuidedLDA(n_topics, alpha=0.3)
    self.assertIsNotNone(model1)
    model2 = guidedlda.GuidedLDA(n_topics=n_topics, alpha=0.3, eta=0.4)
    self.assertIsNotNone(model2)
    self.assertRaises(ValueError, guidedlda.GuidedLDA, n_topics, alpha=-3)
    self.assertRaises(ValueError, guidedlda.GuidedLDA, n_topics, eta=-3)
    self.assertRaises(ValueError, guidedlda.GuidedLDA, n_topics, alpha=-3, eta=-3)
def fit(self, dtm, seed_topics=None, seed_confidence=None):
    """Fit a topic model using guidedlda.

    Args:
        dtm: numpy array or pandas DataFrame, document-term matrix.
        seed_topics: dict, mapping of word id to topic id for seed words.
        seed_confidence: float, confidence in the seed_topics assignments.

    Returns:
        model: fitted guidedlda.GuidedLDA topic model.
    """
    # run guided LDA only when seed arguments are provided
    guided = bool(seed_topics) or bool(seed_confidence)

    # convert dtm to numpy array if input is a pandas DataFrame
    if isinstance(dtm, pd.DataFrame):
        dtm = np.array(dtm)
    if not isinstance(dtm, np.ndarray):
        raise ValueError(
            'Please input a valid pandas dataframe or numpy array for dtm!')

    # fit LDA model
    if guided:
        if not isinstance(seed_topics, dict):
            raise ValueError("Please enter a dictionary for seed_topics.")
        elif not isinstance(seed_confidence, float):
            raise ValueError("Please enter a float for seed_confidence.")
        elif self.n_topics < len(seed_topics):
            raise ValueError(
                "n_topics must be greater than number of seed topics!")
        print("Guided LDA")
        model = guidedlda.GuidedLDA(n_topics=self.n_topics,
                                    n_iter=self.n_iter,
                                    random_state=self.random_state,
                                    refresh=self.refresh)
        model._fit(dtm, seed_topics, seed_confidence)
    else:
        print("Regular LDA")
        model = guidedlda.GuidedLDA(n_topics=self.n_topics,
                                    n_iter=self.n_iter,
                                    random_state=self.random_state,
                                    refresh=self.refresh)
        model.fit(dtm)
    self.model = model
    return model
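# A minimal usage sketch for the fit() wrapper above. `TopicModeler` is a
# hypothetical host class, not part of the original code; it only needs the
# attributes the method reads (n_topics, n_iter, random_state, refresh).
# seed_topics maps word ids in the dtm's vocabulary to topic ids, matching
# guidedlda's convention.
import numpy as np

modeler = TopicModeler(n_topics=3, n_iter=100, random_state=7, refresh=20)
dtm = np.array([[2, 0, 1, 0],
                [0, 3, 0, 1],
                [1, 0, 2, 0]])
seed_topics = {0: 0, 1: 1}  # word id -> topic id
guided_model = modeler.fit(dtm, seed_topics=seed_topics, seed_confidence=0.15)
unguided_model = modeler.fit(dtm)  # falls back to regular LDA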
def test_lda_getting_started(self):
    X = np.array([[1, 1], [2, 1], [3, 1], [4, 1], [5, 8], [6, 1]])
    model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=1)
    doc_topic = model.fit_transform(X)
    self.assertIsNotNone(doc_topic)
    self.assertIsNotNone(model.doc_topic_)
    self.assertIsNotNone(model.components_)
def setUpClass(cls):
    test_dir = os.path.dirname(__file__)
    nyt_ldac_fn = os.path.join(test_dir, 'nyt.ldac')
    vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)
    cls.word2id = word2id = dict((v, idx) for idx, v in enumerate(vocab))
    cls.dtm = dtm = guidedlda.utils.ldac2dtm(open(nyt_ldac_fn), offset=0)
    cls.n_iter = n_iter = 1
    cls.n_topics = n_topics = 5
    cls.random_seed = random_seed = 1
    cls.model = model = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                                            random_state=random_seed)
    cls.seed_topic_list = [
        ['game', 'team', 'win', 'player', 'season', 'second', 'victory'],
        ['percent', 'company', 'market', 'price', 'sell', 'business',
         'stock', 'share'],
        ['music', 'write', 'art', 'book', 'world', 'film'],
        ['political', 'government', 'leader', 'official', 'state', 'country'],
    ]
    cls.seed_topics = seed_topics = {}
    for t_id, st in enumerate(cls.seed_topic_list):
        for word in st:
            seed_topics[word2id[word]] = t_id
    cls.doc_topic = model.fit_transform(dtm, seed_topics=seed_topics,
                                        seed_confidence=0.15)
def do_lda(seed_topics, data):
    model = guidedlda.GuidedLDA(n_topics=30, n_iter=5000, random_state=7,
                                refresh=10)
    model.fit(data, seed_topics=seed_topics, seed_confidence=0.25)
    pickle.dump(model, open("guidedlda_30.pickle", "wb"))
    return model
def non_guided_analysis(X, vocab, topic_num, n_top_words=TOP_K_WORDS):
    """Non-guided (regular) LDA analysis of the given dtm."""
    model = guidedlda.GuidedLDA(n_topics=topic_num, n_iter=ITERATION_NUMS,
                                random_state=7, refresh=20)
    model.fit(X)
    retrieve_words_from(model, vocab, topic_num, n_top_words)
def test_guidedlda_getting_started(self):
    X = np.array([[1, 0], [2, 0], [3, 0], [4, 0], [0, 8], [6, 0]])
    model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=1)
    seed_topics = {0: 0, 1: 1}
    doc_topic = model.fit_transform(X, seed_topics=seed_topics,
                                    seed_confidence=0.9)
    self.assertIsNotNone(doc_topic)
    self.assertIsNotNone(model.doc_topic_)
    self.assertIsNotNone(model.components_)
    self.assertEqual(model.word_topic_[0].argmax(), 0)
    self.assertEqual(model.word_topic_[1].argmax(), 1)
def main(folder):
    word2idx = pickle.load(open(os.path.join(folder, "word_idx.p"), "rb"))
    print(word2idx)

    # Load seed topics
    seed_topics_dic, topics = seed_topics(word2idx)
    idx_to_word = {v: k for k, v in word2idx.items()}

    # Load data
    print("Starting training...")
    lda = guidedlda.GuidedLDA(n_topics=len(topics), n_iter=100,
                              random_state=7, refresh=20)

    ## Concat data
    row, col, data = np.array(()), np.array(()), np.array(())
    matrix_data_list = glob.glob(os.path.join(folder, "matrix_data_*.p"))
    np.random.shuffle(matrix_data_list)
    for doc in tqdm.tqdm(matrix_data_list):
        print("Partial fitting", doc)
        res = pickle.load(open(doc, "rb"))
        row = np.append(row, np.int32(res["I"]))
        col = np.append(col, np.int32(res["J"]))
        data = np.append(data, np.int32(res["data"]))
    X = coo_matrix((np.int32(data), (np.int32(row), np.int32(col))))
    lda.fit(X, seed_topics=seed_topics_dic, seed_confidence=0)
    print("Training done")

    def print_top_words(model, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            message = "Topic #{} - {}: ".format(topic_idx, topics[topic_idx])
            message += " ".join(
                [idx_to_word[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)

    def print_sentence_and_topic(sentence, topic):
        print(colored("Sentence:", "blue"), colored(sentence, "green"))
        print(colored("Topic: ", "blue"), colored(topic, "red"))

    print_top_words(lda, 20)
    np.save(open(os.path.join(folder, "guided_components.npy"), "wb"),
            lda.components_)

    ## Test for input sentences
    stemmer = WordNetLemmatizer()
    while True:
        sentence = input()
        list_words = [w.lower() for w in sentence.split()]
        np_array = np.zeros([1, len(word2idx.keys())])
        for word in list_words:
            stemmed_word = stemmer.lemmatize(word)
            if stemmed_word in word2idx:
                print(stemmed_word)
                np_array[0, word2idx[stemmed_word]] += 1
        topic_dist = lda.transform(np.int32(np_array))
        print_sentence_and_topic(sentence, topics[np.argmax(topic_dist)])
def setUpClass(cls):
    test_dir = os.path.dirname(__file__)
    reuters_ldac_fn = os.path.join(test_dir, 'reuters.ldac')
    cls.dtm = scipy.sparse.csr_matrix(
        guidedlda.utils.ldac2dtm(open(reuters_ldac_fn),
                                 offset=0)).astype(np.int64)
    cls.n_iter = n_iter = 1
    cls.n_topics = n_topics = 10
    cls.random_seed = random_seed = 1
    cls.model = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                                    random_state=random_seed)
def setUpClass(cls):
    test_dir = os.path.dirname(__file__)
    reuters_ldac_fn = os.path.join(test_dir, 'reuters.ldac')
    cls.dtm = dtm = guidedlda.utils.ldac2dtm(open(reuters_ldac_fn), offset=0)
    cls.n_iter = n_iter = 1
    cls.n_topics = n_topics = 10
    cls.random_seed = random_seed = 1
    cls.model = model = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                                            random_state=random_seed)
    cls.doc_topic = model.fit_transform(dtm)
def test_lda_monotone(self):
    dtm = self.dtm
    model = self.model
    n_topics = self.n_topics
    random_seed = self.random_seed

    # fit model with additional iterations, verify improvement in log likelihood
    n_iter = self.n_iter * 2
    model_new = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                                    random_state=random_seed)
    model_new.fit(dtm)
    self.assertGreater(model_new.loglikelihood(), model.loglikelihood())
def train_model(dataset, vocab, seed_topic_list, model_output_path,
                n_topics=5, n_top_words=10):
    """Create and train a new GuidedLDA model on the provided dataset.

    Args:
        dataset (dtm): Dataset in dtm format as provided by guidedlda.datasets.*
        vocab (list(str)): Global vocab.
        seed_topic_list (array(list(str))): Seed topic keywords used for GuidedLDA.
        model_output_path (str): Path to the output of the trained model.
        n_topics (int): Number of topics, default=5.
        n_top_words (int): Number of top words to be extracted for each topic,
            default=10.

    Returns:
        Creates a model and writes it to model_output_path.
    """
    word2id = dict((v, idx) for idx, v in enumerate(vocab))
    model = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=100, random_state=7,
                                refresh=20)
    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            seed_topics[word2id[word]] = t_id
    model.fit(dataset, seed_topics=seed_topics, seed_confidence=0.25)

    # List the top words of each topic from the trained model.
    topic_word = model.topic_word_
    for i, topic_dist in enumerate(topic_word):
        top_index = np.argsort(topic_dist)[:-(n_top_words + 1):-1]
        topic_words = np.array(vocab)[top_index]
        print('Topic [{}]: {}'.format(TOPIC_INDEX[i], ' '.join(topic_words)))
        print('\n')

    # Test on some files
    #doc_topic = model.transform(dataset)
    #for i in range(9):
    #    print("Top topic: {} [Document Key words: '{}']".format(
    #        TOPIC_INDEX[doc_topic[i].argmax()],
    #        ', '.join(np.array(vocab)[list(reversed(dataset[i, :].argsort()))[0:5]])))

    # Dump the model for future production use.
    #model.purge_extra_matrices()
    with open('{}.pickle'.format(model_output_path), 'wb') as file_handle:
        pickle.dump(model, file_handle)
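# A hedged usage sketch for train_model() above, using the NYT sample data
# bundled with guidedlda. The seed word lists and the TOPIC_INDEX labels here
# are illustrative assumptions, and the call assumes train_model and the
# module-level TOPIC_INDEX live in the same module.
import guidedlda

X = guidedlda.datasets.load_data(guidedlda.datasets.NYT)
vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.NYT)
seeds = [['game', 'team', 'win', 'player', 'season'],
         ['percent', 'company', 'market', 'price', 'business']]
TOPIC_INDEX = ['sports', 'business', 'topic_2', 'topic_3', 'topic_4']
train_model(X, vocab, seeds, 'nyt_guided_model', n_topics=5, n_top_words=10)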
def optimize_lda(params, param_dict=param_dict, data_vect=data_vect,
                 seed_topics=seed_topics, seed_confidence=seed_confidence,
                 n_iter=n_iter_param, refresh=refresh_param,
                 random_seed=random_seed, corpus=corpus,
                 dictionary=dictionary, feature_names=feature_names,
                 metric=metric_to_optimize):
    '''
    Function to minimize in hyperparameter optimization.
    metric: 'coherence_consistent', 'loglikelihood'
    '''
    model_guidedlda = guidedlda.GuidedLDA(random_state=random_seed,
                                          n_iter=n_iter, refresh=refresh,
                                          **params, **param_dict)
    model_guidedlda.fit(X=data_vect, seed_topics=seed_topics,
                        seed_confidence=seed_confidence)
    # to check whether the model's params change every trial or not
    # print('alpha,', 'beta,', 'eta,', 'n_topics,', 'random_state')
    # print(model_guidedlda.alpha, model_guidedlda.beta, model_guidedlda.eta,
    #       model_guidedlda.n_topics, model_guidedlda.random_state)
    if metric == 'loglikelihood':
        metric_value = abs(model_guidedlda.loglikelihood())
    elif metric == 'coherence_consistent':
        n_top_words = 20
        topic_word = model_guidedlda.topic_word_
        topics_lists = []
        for i, topic_dist in enumerate(topic_word):
            topic_words = list(
                np.array(feature_names)[np.argsort(topic_dist)]
                [:-(n_top_words + 1):-1])
            topics_lists.append(topic_words)
        coherence_model_lda = gensim.models.CoherenceModel(
            topics=topics_lists, corpus=corpus, dictionary=dictionary,
            coherence='u_mass')
        coherence_lda = coherence_model_lda.get_coherence()
        metric_value = abs(coherence_lda)
    else:
        raise ValueError('Choose the metric_to_optimize of the function')
    return metric_value
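# A sketch of driving optimize_lda() above with hyperopt. The choice of
# optimizer is an assumption; the snippet does not show which one was used.
# It relies on the same module-level globals (param_dict, data_vect,
# seed_topics, ...) already bound as default arguments at definition time,
# and param_dict must not repeat the keys sampled in the search space.
from hyperopt import fmin, hp, tpe

search_space = {
    'n_topics': hp.choice('n_topics', [5, 10, 15, 20]),
    'alpha': hp.uniform('alpha', 0.01, 1.0),
    'eta': hp.uniform('eta', 0.01, 1.0),
}
# fmin minimizes the returned metric value over max_evals sampled settings
best = fmin(fn=optimize_lda, space=search_space, algo=tpe.suggest,
            max_evals=25)
print(best)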
def do_seeded_lda(self, X, vocab, word2id, n_topics, n_iter, alpha, eta,
                  random_state, seed_topic_list, seed_confidence):
    model = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter, alpha=alpha,
                                eta=eta, random_state=random_state, refresh=1)
    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            seed_topics[word2id[word]] = t_id
    model.fit(X, seed_topics=seed_topics, seed_confidence=seed_confidence)

    n_top_words = 10
    topic_word = model.topic_word_
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    return model.topic_word_
def test_lda_zero_iter(self):
    dtm = self.dtm
    model = self.model
    doc_topic = self.doc_topic
    n_topics = self.n_topics
    random_seed = self.random_seed

    # fit a new model with 0 iterations
    n_iter = 0
    model_new = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                                    random_state=random_seed)
    doc_topic_new = model_new.fit_transform(dtm)
    self.assertIsNotNone(model_new)
    self.assertIsNotNone(doc_topic_new)
    self.assertLess(model_new.loglikelihood(), model.loglikelihood())
    self.assertFalse((doc_topic_new == doc_topic).all())
def guided_analysis(X, word2id, vocab, topic_num, confidence,
                    n_top_words=TOP_K_WORDS):
    """Guided LDA analysis of the given dtm."""
    model = guidedlda.GuidedLDA(n_topics=topic_num, n_iter=ITERATION_NUMS,
                                random_state=7, refresh=20)
    model.fit(X, seed_topics=load_seed_topics(word2id),
              seed_confidence=confidence)
    return model
def runGuidedLDA(doc_collection, no_topics, stop_words, seed_topics):
    print('Start GuidedLDA...')
    tf_vec = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features,
                             stop_words=stop_words)
    termfreq = tf_vec.fit_transform(doc_collection)
    feature_names = tf_vec.get_feature_names()

    # Run guided LDA
    print('Constructing GUIDED LDA model...')
    startlda = time.time()
    ldamodel = guidedlda.GuidedLDA(n_topics=no_topics, n_iter=100,
                                   random_state=7, refresh=20)
    ldamodel.fit(termfreq, seed_topics=seed_topics, seed_confidence=0.15)
    print('LDA Model Construction Took: ' +
          str((time.time() - startlda) / 60) + ' minutes.')

    startldavecs = time.time()
    print('Constructing LDA vectors...')
    #ldavecs = LatentDirichletAllocation(n_components=no_topics, max_iter=5,
    #    learning_method='online', learning_offset=50.,
    #    random_state=0).fit_transform(termfreq, docidentifiers)
    ldavecs = ldamodel.transform(termfreq)
    print('LDA Vector Construction Took: ' +
          str((time.time() - startldavecs) / 60) + ' minutes.')
    print('Completed GuidedLDA!')
    return termfreq, ldamodel, ldavecs, feature_names
def test_lda_random_seed(self):
    dtm = self.dtm
    doc_topic = self.doc_topic
    n_iter = self.n_iter
    n_topics = self.n_topics
    random_seed = self.random_seed
    random_state = self.model.random_state

    # refit model with same random seed and verify results identical
    model_new = guidedlda.GuidedLDA(n_topics=n_topics, n_iter=n_iter,
                                    random_state=random_seed)
    rands_init = model_new._rands.copy()
    doc_topic_new = model_new.fit_transform(dtm)
    rands_fit = model_new._rands.copy()
    random_state_new = model_new.random_state
    np.testing.assert_array_equal(doc_topic_new, doc_topic)
    np.testing.assert_array_equal(random_state_new, random_state)

    # verify random variates are not changed
    np.testing.assert_array_equal(rands_init, rands_fit)
def build_model(data, num_topics, seed_topic_list, seed_conf, top_n=10,
                include_vis=True):
    # form bow matrix to feed as input into training the guidedlda model
    data = [' '.join(text) for text in data]
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data).toarray()
    vocab = vectorizer.get_feature_names()
    word2id = dict((v, idx) for idx, v in enumerate(vocab))

    # Creates a dictionary that assigns words to topics via their
    # topic id given by the id2word assignment
    seed_topics = {}
    for topic_id, subset in enumerate(seed_topic_list):
        for word in subset:
            if word in word2id:
                seed_topics[word2id[word]] = topic_id

    # Build GuidedLDA model
    guidedlda_model = guidedlda.GuidedLDA(n_topics=num_topics, n_iter=100,
                                          random_state=7, refresh=20)
    guidedlda_model.fit(X, seed_topics=seed_topics, seed_confidence=seed_conf)
    top_vocab(guidedlda_model, vocab, top_n)

    # Saves model for production later
    with open('results/guided_lda/guided_lda_{}'.format(num_topics), 'wb') as f:
        pickle.dump(guidedlda_model, f)
    return guidedlda_model
vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english',
                             lowercase=True, ngram_range=(1, 3))
data_vectorized = vectorizer.fit_transform(train_clean_sentences)

# a vocab built by tokenizing the raw text would not line up with the
# vectorizer's columns, so use the fitted vectorizer's vocabulary instead;
# otherwise the seed word ids and the printed top words would be wrong
# all_text = ' '.join(train_clean_sentences)
# tokens = word_tokenize(all_text)
# words = [w.lower() for w in tokens]
vocab = vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

seed_topic_list = [['carbon', 'pricing', 'greenhouse', 'backstop',
                    'infrastructure'],
                   ['mobility', 'transit', 'transportation'],
                   ['social', 'exclusion', 'alone', 'friend', 'family']]

model = guidedlda.GuidedLDA(n_topics=4, n_iter=100, refresh=20)
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id
model.fit(data_vectorized, seed_topics=seed_topics, seed_confidence=0.15)

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

## try Word2Vec
gensim_list = []
docnames = list(docs.keys())
docnames = np.array(docnames)
vocab = np.array(list(vocab))
vocab_sorter = np.argsort(vocab)
print("done get data.")

""" create en_gLDA pipeline """
print("Extracting tf features for gLDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

gLDA = guidedlda.GuidedLDA(n_topics=n_components, n_iter=100, random_state=7,
                           refresh=20)
gLDA.fit(tf)

filename_glda = 'glda_model.sav'
#gLDA = pickle.load(open(filename_glda, 'rb'))
#gLDA.fit(tf)
pickle.dump(gLDA, open(filename_glda, 'wb'))

## Use tf-idf features for SGD.
#print("Extracting tf-idf features for SGD...")
#tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
#                                   max_features=n_features,
#                                   stop_words='english')
#t0 = time()
#tfidf = tfidf_vectorizer.fit_transform(data_samples)
def run():
    # Read in data set
    df = pd.read_csv('Movie_Metadata_Sentiments.csv')

    # Subset only the emotions required to get the overall emotion detected
    # from the text content
    sub_df = df[['anger', 'joy', 'fear', 'sadness']]
    df['Max'] = sub_df.idxmax(axis=1)

    # Split into train and test data set
    train, test = train_test_split(df, test_size=0.2, random_state=1)

    # Save to csv file
    df.to_csv('Movie_Metadata_Sentiments_Modified.csv', encoding='utf-8',
              header=True)
    test.to_csv('Movie_Metadata_Sentiments_Test.csv', encoding='utf-8',
                header=True)

    # Pre-process data to be fed into the Guided LDA model
    processed_docs = extract_data(train)
    dictionary = gensim.corpora.Dictionary(processed_docs)
    dictionary.filter_extremes(no_below=5, no_above=0.4, keep_n=100000)
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    dict1 = dictionary.token2id
    X = get_term_matrix(processed_docs, dictionary)

    print("Guided LDA")
    emolex_df = pd.read_csv('NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
                            names=["word", "emotion", "association"],
                            sep='\t')

    # Create seed list for each category
    anger_df = emolex_df[(emolex_df.association == 1) &
                         (emolex_df.emotion == 'anger')].word
    anger_seed = [item for item in anger_df]
    joy_df = emolex_df[(emolex_df.association == 1) &
                       (emolex_df.emotion == 'joy')].word
    joy_seed = [item for item in joy_df]
    fear_df = emolex_df[(emolex_df.association == 1) &
                        (emolex_df.emotion == 'fear')].word
    fear_seed = [item for item in fear_df]
    sadness_df = emolex_df[(emolex_df.association == 1) &
                           (emolex_df.emotion == 'sadness')].word
    sadness_seed = [item for item in sadness_df]

    # Append all topic lists to be fed into the model
    seed_topic_list = [anger_seed, joy_seed, fear_seed, sadness_seed]
    seed_topics = {}
    for t_id, st in enumerate(seed_topic_list):
        for word in st:
            try:
                seed_topics[dict1[word]] = t_id
            except KeyError:
                # skip lexicon words that are not in the corpus dictionary
                pass

    # Train the GuidedLDA model
    model = guidedlda.GuidedLDA(alpha=0.1, n_topics=4, n_iter=1000,
                                random_state=7, refresh=20)
    model.fit(X, seed_topics=seed_topics, seed_confidence=0.20)

    # Check top n words in each topic (emotions)
    n_top_words = 15
    topic_word = model.topic_word_
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(list(dict1.keys()))[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # Save the model to be imported into the Telegram bot
    with open('Movie_Metadata_Sentiments_GLDA.model', 'wb') as file_handle:
        pickle.dump(model, file_handle)

    # Save the Dictionary and Corpus
    name = "Movie_Metadata_Sentiments"
    dictionary.save("./dictionaries/" + name + ".dict")
    corpora.MmCorpus.serialize("./corpus/" + name + ".mm", bow_corpus)

    # Test the model on the test set
    test_model()
def grid_search_lda_SED(texts, seed_topic_list, n_topics_range, priors_range,
                        out_dir, n_top_words=20, seed_confidence=0.15,
                        iterations=2000, save_doc_top=True, verbose=True):
    '''
    Fit many topic models to pick the most tuned hyperparameters.
    Guidedlda version.

    Each fitted model is saved, filename being in the following format:
    {number of topics}T_{alpha}A_{eta}E_seed.{file extension}

    Parameters
    ----------
    texts : iterable
        already preprocessed text data you want to build seeds on.

    seed_topic_list : list of lists
        list of words, where in seed_topic_list[x][y]
        x is a topic and y a word belonging in that topic.

    n_topics_range : iterable of int | int
        Number of topics to fit the model with.
        When fitting a single model, :int: is enough.
        Otherwise, input a list of ints, a range, or another iterable.

    priors_range : list of tuples
        where every 1st element is alpha, every 2nd is eta.

    out_dir : str
        path to a directory where results will be saved
        (in a child directory).

    n_top_words : int, optional (default: 20)
        when extracting top words associated with each topic, how many to pick?

    seed_confidence : float, optional (default: 0.15)
        When initializing the LDA, where are you on the spectrum of
        sampling from seeds (1), vs. sampling randomly (0)?

    iterations : int, optional (default: 2000)
        maximum number of iterations to fit a topic model with.

    save_doc_top : bool
        save document-topic matrices from models?

    verbose : bool, optional (default: True)
        print progress comments.

    Exports
    -------
    out_dir/report_lines/*
        pickled dict with model information
        (n topics, model coherence, per-topic coherence, hyperparameters)

    out_dir/models/*
        gensim objects, where the model is saved.

    out_dir/plots/*
        pyLDAvis visualizations of the model
    '''
    # INITIALIZATION
    # prepare folders
    make_folders(out_dir)

    # paths
    report_dir = os.path.join(out_dir, "report_lines", "")
    model_dir = os.path.join(out_dir, "models", "")
    plot_dir = os.path.join(out_dir, "plots", "")
    doctop_dir = os.path.join(out_dir, 'doctop_mats', '')

    # if a single model is to be fitted,
    # make sure it can be "iterated"
    if isinstance(n_topics_range, int):
        n_topics_range = [n_topics_range]

    # PREPARE DATA
    # for guidedlda fitting
    X, seed_priors, vectorizer = init_guidedlda(
        texts=texts,
        seed_topic_list=seed_topic_list,
    )

    # for coherence counting
    bows, dictionary = gensim_format(texts)

    # TRAIN MODELS
    i = 0
    for n_top in chain(n_topics_range):
        # iterate over priors
        for alpha_, eta_ in priors_range:
            # track time
            start_time = time()

            # filename fragments tracking this run's hyperparameters
            topic_fname = str(n_top) + "T_"
            alpha_fname = str(alpha_).replace('.', '') + 'A_'
            eta_fname = str(eta_).replace('.', '') + 'E_'

            # paths for saving
            filename = topic_fname + alpha_fname + eta_fname + 'seed'
            report_path = os.path.join(report_dir + filename + '.ndjson')
            model_path = os.path.join(model_dir + filename + '.joblib')
            pyldavis_path = os.path.join(plot_dir + filename + '_pyldavis.html')
            doctop_path = os.path.join(doctop_dir + filename + '_mat.ndjson')

            # train model
            model = guidedlda.GuidedLDA(n_topics=n_top,
                                        n_iter=iterations,
                                        alpha=alpha_,
                                        eta=eta_,
                                        random_state=7,
                                        refresh=10)
            # TODO: iterate seed_confidence?
            model.fit(X, seed_topics=seed_priors,
                      seed_confidence=seed_confidence)

            # track time usage
            training_time = time() - start_time
            if verbose:
                print(' Time: {}'.format(training_time))

            # save priors
            alpha = model.alpha
            eta = model.eta

            # extract topic words
            topics = []
            for i, topic_dist in enumerate(model.topic_word_):
                topic_words = (
                    # take vocab (list of tokens in order)
                    np.array(vectorizer.get_feature_names())
                    # take the term-topic distribution (topic_dist),
                    # where topic_dist[0] is the probability of vocab[0]
                    # in that topic, and sort vocab in descending order
                    [np.argsort(topic_dist)]
                    # select and reorder so that only the top n_top_words are kept
                    [:-(n_top_words + 1):-1])
                # array to list
                topic_words = [word for word in topic_words]
                topics.append(topic_words)

            # calculate topic coherence based on the extracted topics
            coh_score, coh_topics = coherence_guidedlda(topics=topics,
                                                        bows=bows,
                                                        dictionary=dictionary,
                                                        texts=texts)

            # save report
            report = (n_top, alpha, eta, training_time, coh_score, coh_topics)
            with open(report_path, 'w') as f:
                ndjson.dump(report, f)

            # save model
            dump(model, model_path)

            # produce a visualization
            nice = pyLDAvis.sklearn.prepare(model, X, vectorizer)
            pyLDAvis.save_html(nice, pyldavis_path)

            # save document-topic matrix
            if save_doc_top:
                doc_topic = model.transform(X).tolist()
                with open(doctop_path, 'w') as f:
                    ndjson.dump(doc_topic, f)

    return None
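# A hedged usage sketch for grid_search_lda_SED() above. The texts and seed
# lists are illustrative, and the call assumes the helper functions the grid
# search relies on (make_folders, init_guidedlda, gensim_format,
# coherence_guidedlda) plus its imports are available in the same module.
texts = ["game team win season player", "market price stock company share"]
seeds = [['game', 'team', 'win'], ['market', 'price', 'stock']]
grid_search_lda_SED(texts=texts,
                    seed_topic_list=seeds,
                    n_topics_range=range(2, 6),              # or a single int
                    priors_range=[(0.1, 0.01), (0.5, 0.1)],  # (alpha, eta)
                    out_dir='grid_out',
                    iterations=500)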
def __init__(self, data, n_iter=500, eta=0.2, alpha=0.2, seed_confidence=10):
    # Generate sparse matrix representation of documents with stopwords removed.
    self._stopwords = text.ENGLISH_STOP_WORDS.union(
        ['appeared', '8217', '8230', '000'])
    self._vectoriser = CountVectorizer(stop_words=self._stopwords,
                                       max_features=1500)
    self._data = list(data.values)
    self._docs = self._vectoriser.fit_transform(self._data)
    self._features = self._vectoriser.get_feature_names()

    # Specify the guided topics
    self._seed_topics = [
        ['btc', 'bitcoin', 'satoshi'],
        ['eth', 'ethereum', 'vitalik', 'foundation'],
        ['altcoin', 'altcoins', 'ltc', 'litecoin', 'xmr', 'monero', 'zec',
         'zcash', 'etc', 'classic', 'xrp', 'ripple', 'trx', 'tron', 'ada',
         'cardano', 'dash', 'digitalcash', 'xtz', 'tezos', 'usdt', 'tether'],
        ['mining', 'hashrate', 'hashing', 'pools', 'reward'],
        ['exchange', 'bitfinex', 'poloniex', 'binance'],
        ['market', 'markets', 'analysis', 'index', 'prices'],
        ['asia', 'china', 'korea', 'japan', 'hong', 'singapore', 'taiwan'],
        ['icos', 'ico', 'offering', 'token', 'tokens', 'raise', 'raised'],
        ['regulation', 'legal', 'law', 'tax', 'taxes'],
        ['blockchain', 'protocol', 'scaling'],
        ['bull', 'bear', 'bullish', 'rally', 'bearish', 'trading'],
        ['technology', 'tech'],
        ['ledger', 'trezor', 'keepkey', 'coinomi', 'jaxx', 'myetherwallet'],
        ['fiat', 'reserve', 'gold', 'bank', 'dollar', 'pound', 'euro', 'yen'],
        ['business', 'investor', 'investors', 'revenue', 'enterprise',
         'commerce'],
        ['commodity', 'oil', 'oil-backed'],
        ['sponsored', 'press', 'release'],
        ['theft', 'stolen', 'scam', 'criminal'],
    ]
    self.topic_names = [
        'btc', 'eth', 'altcoins', 'mining', 'exchange', 'market', 'asia',
        'ico', 'regulation', 'blockchain', 'trading', 'technology', 'wallet',
        'fiat', 'business', 'commodity', 'sponsored', 'criminal'
    ]
    self._n_topics = len(self.topic_names)

    # Define LDA model parameters
    self.seed_confidence = seed_confidence
    self._model = guidedlda.GuidedLDA(self._n_topics, n_iter=n_iter,
                                      alpha=alpha, eta=eta)
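# A minimal sketch, not part of the original class, of how a fit() method
# could wire the word-based seed lists above into guidedlda, which expects a
# {word_id: topic_id} mapping over the vectoriser's vocabulary. The method
# name and the skipping of out-of-vocabulary seed words are assumptions.
def fit(self):
    word2id = dict((w, idx) for idx, w in enumerate(self._features))
    seed_topics = {}
    for t_id, seed_words in enumerate(self._seed_topics):
        for word in seed_words:
            if word in word2id:  # ignore seeds pruned by max_features
                seed_topics[word2id[word]] = t_id
    self._model.fit(self._docs,
                    seed_topics=seed_topics,
                    seed_confidence=self.seed_confidence)
    return self._model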
def load_glda_topic_model(self):
    print('Loading topics...')
    # placeholder instance; immediately replaced by the persisted model
    self.glda = guidedlda.GuidedLDA(n_topics=1)
    self.glda = joblib.load('glda_topic_model.lda')
    self.glda_vectorizer = self.load_glda_term_vector_topic_model()
    self.glda_tf_feature_names = self.glda_vectorizer.get_feature_names()
def build_lda(self, corpus, n_topics=10):
    n_components = n_topics
    n_top_words = 30
    docs = corpus

    # print("Stemming...")
    # stemmer = CorpusStemmer()
    # docs = stemmer.transform(docs)

    # print("DOC tagging...")
    # tagger = CorpusPOSTagger()
    # tagged_docs = tagger.transform(docs)
    #
    # tag_constraints = []
    #
    # # build tags based on Singular Noun, Noun and Adjective, Noun
    # label_tags = ['NN,NN', 'JJ,NN', 'NNS,NN']
    # for tags in label_tags:
    #     tag_constraints.append(tuple(map(lambda t: t.strip(),
    #                                      tags.split(','))))
    #
    # cand_labels = self.find_labels(n_labels, label_min_df, tag_constraints,
    #                                tagged_docs, n_cand_labels, docs)
    #
    # print("Collected {} candidate labels".format(len(cand_labels)))
    #
    # print("Calculate the PMI scores...")
    #
    # pmi_cal = PMICalculator(
    #     doc2word_vectorizer=CountVectorizer(
    #         max_df=.95,
    #         min_df=5,
    #         lowercase=True,
    #         token_pattern=r'\b[a-zA-Z]{3,}\b',
    #         stop_words=self.load_stopwords()
    #     ),
    #     doc2label_vectorizer=LabelCountVectorizer())
    #
    # pmi_w2l = pmi_cal.from_texts(docs, cand_labels)

    print('Building BiGrams from the corpus...')
    bigram = gensim.models.Phrases(docs)
    texts = [bigram[line] for line in docs]

    stop = list(self.load_stopwords())
    stop.append('off')
    stop.append('http')
    stop.append('www')
    stop.append('edt')
    stop.append('est')
    stop.append('mdt')
    stop.append('pst')
    stop.append('pt')

    tf_vectorizer = LemmaCountVectorizer(max_df=.95,
                                         min_df=2,
                                         lowercase=True,
                                         token_pattern=r'\b[a-zA-Z]{3,}\b',
                                         stop_words=stop)
    print("Building the Vectorizer for Topic model...")
    tf = tf_vectorizer.fit_transform(
        map(lambda sent: ' '.join(sent), texts))
    self.tf_feature_names = tf_vectorizer.get_feature_names()

    # Save vectorizer to disk as it is needed for service topic-extraction
    self.save_term_vector_topic_model(tf_vectorizer)
    self.save_fitted_term_vector(tf)

    # test semi-supervised topics
    print("Building the Alternative LDA Topic model...")
    vocab = tuple(self.tf_feature_names)
    seed_topic_list = {}
    word2id = {}
    if seed_topic_list:
        word2id = dict(
            (v, idx) for idx, v in enumerate(tuple(self.tf_feature_names)))

    # array_tf = tf.toarray()
    # Guided LDA with seed topics.
    # seed_topic_list = {'percent': 7, 'year': 7, 'month': 7, 'quarter': 7}
    seed_topics = {}
    for term, topic in seed_topic_list.items():
        seed_topics[word2id[term]] = topic

    model = guidedlda.GuidedLDA(n_topics=n_components, n_iter=100,
                                random_state=7, refresh=50)
    logger.propagate = False
    model.fit(tf, seed_topics=seed_topics, seed_confidence=0.15)
    # model.fit(array_tf)
    self.lda = model

    print("Printing the Alternative LDA Topic model...")
    topic_word = model.topic_word_
    n_top_words = 30

    # print topics with words and score rank
    for i, topic_dist in enumerate(topic_word):
        # topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
        # print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        topic_ = topic_dist
        topic_ = topic_ / topic_.sum()  # normalize to probability distribution
        bestn = matutils.argsort(topic_, 30, reverse=True)
        topic_ = [(self.tf_feature_names[id], topic_[id]) for id in bestn]
        topic_ = ' + '.join(['%.3f*"%s"' % (v, k) for k, v in topic_])
        print("Topic#", i, ":", topic_)

    print("Saving the LDA Topic model to .lda ...")
    self.save_lda_topic_model()

    print("Building the LDA Visualization for Topic model...")
    py_glda_vis = MyDisplay()
    list_topic_names = [
        'T00', 'T01', 'T02', 'T03', 'T04', 'T05', 'T06', 'T07', 'T08', 'T09',
        'T10', 'T11', 'T12', 'T13', 'T14', 'T15', 'T16', 'T17', 'T18', 'T19',
        'T20', 'T21', 'T22', 'T23', 'T24', 'T25', 'T26', 'T27', 'T28', 'T29',
        'T30', 'T31', 'T32', 'T33', 'T34', 'T35', 'T36', 'T37', 'T38', 'T39'
    ]
    list_topic_labels = []
    visualization = py_glda_vis.prepare_glda(model, tf, tf_vectorizer,
                                             mds='tsne')

    # processing the labels in the same order of topic relevance
    print(list(visualization[6:])[0])
    for i, topic in enumerate(list(visualization[6:])[0]):
        list_topic_labels.append(list_topic_names[topic - 1])
    topic_name = {"topic.names": list_topic_labels}
    print(topic_name)

    # visualization_html = py_lda_vis.prepared_data_to_html(visualization,
    #                                                       json_names=topic_name)
    py_glda_vis.save_html(visualization, 'GLDA_Visualization_labels.html',
                          json_names=topic_name)
    py_glda_vis.save_html(visualization, 'GLDA_Visualization_nolabels.html')

    # this example is to be used for future Topic Model builder
    # try:
    #     self.load_lda_topic_model()
    # except FileNotFoundError:  # if no lda topic_model file, build a new one
    #     self.encoded_html = base64.b64encode(visualization_html.encode())
    self.encoded_html = ''
    self.topics = {}
def home():
    return "Hello, World!"  # return a string


data = pd.read_csv('text data.csv')
texts = data['Article']
labels = data['Class']
profession = data['Profession']

stop = stopwords.words('english')
stemmer = SnowballStemmer("english")
preprocess = data['Article'].apply(
    lambda x: [item for item in x if item not in stop])
preprocess = data["Article"].apply(lambda x: [stemmer.stem(y) for y in x])
# strip boilerplate "Context" prefixes, chaining so each replacement
# builds on the previous one
preprocess = data['Article'].str.replace("Context\n", " ")
preprocess = preprocess.str.replace("Context:", " ")
preprocess = preprocess.str.replace("CONTEXT:", " ")
preprocess = preprocess.str.replace("Context.", " ")

path = 'record.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()
print('corpus length:', len(text))

chars = sorted(list(set(text)))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i:i + maxlen])
    next_chars.append(text[i + maxlen])

print('Vectorization...')
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    y[i, char_indices[next_chars[i]]] = 1

vocab = guidedlda.datasets.load_vocab(guidedlda.datasets.REUTERS)

# Guided LDA Implementation
model = guidedlda.GuidedLDA(n_topics=52, n_iter=100, random_state=7,
                            refresh=20)
model.fit(y)
topic_word = model.topic_word_
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

dataSet = pd.read_csv("new_topics.csv")


def text_process(mess):
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [
        word for word in nopunc.split()
        if word.lower() not in stopwords.words('english')
    ]


pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

pipeline.fit(dataSet["Topics"], dataSet["Profession"])
a = dataSet["Topics"]
prediction = pipeline.predict(a)
print(classification_report(dataSet["Profession"], prediction))

b = ["Data"]
prediction = pipeline.predict(b)
print("Predicted Profession:", prediction)
def test_lda_loglikelihoods(self):
    X = np.array([[1, 1], [2, 1], [3, 1], [4, 1], [5, 8], [6, 1]])
    model = guidedlda.GuidedLDA(n_topics=2, n_iter=100, random_state=1)
    model.fit(X)
    self.assertGreater(len(model.loglikelihoods_), 1)
get_lda_summary(5, 10, climate['text_processed'], 'climate_lda')
get_lda_summary(5, 10, yv['text_processed'], 'yv_lda')
get_lda_summary(5, 10, hk['text_processed'], 'hk_lda')
get_lda_summary(5, 10, usa['text_processed'], 'usa_lda')

##########
## guided lda
##########
import guidedlda

vocab = count_vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))

# one emoji seed list per topic: sad, angry, happy, laughing
seed_topic_list = [['😢'], ['😡', '😠'], ['😊', '☺️'], ['🤣', '😂']]

model = guidedlda.GuidedLDA(n_topics=4, n_iter=100, random_state=7,
                            refresh=20)
seed_topics = {}
for t_id, st in enumerate(seed_topic_list):
    for word in st:
        seed_topics[word2id[word]] = t_id
model.fit(count_data, seed_topics=seed_topics, seed_confidence=0.15)

n_top_words = 10
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
def test_lda_constructor(self):
    n_topics = 10
    model1 = guidedlda.GuidedLDA(n_topics)
    self.assertIsNotNone(model1)
    model2 = guidedlda.GuidedLDA(n_topics=n_topics)
    self.assertIsNotNone(model2)