def train_model(self, X, words, anchors, anchor_strength=3):
    print("training model", end="\r")
    # Train the first layer
    model = ct.Corex(n_hidden=20, seed=8)
    model = model.fit(
        X,
        words=words,
        anchors=anchors,  # Pass the anchors in here
        anchor_strength=anchor_strength,  # Tell the model how much it should rely on the anchors
    )
    return model

    # TODO: Train successive layers (unreachable until the early return above is removed).
    # Higher layers are fit on the previous layer's binary topic labels, so the
    # word-level vocabulary and anchors do not apply there.
    tm_layer2 = ct.Corex(n_hidden=10, seed=16)
    tm_layer2.fit(model.labels)
    tm_layer3 = ct.Corex(n_hidden=9)
    tm_layer3.fit(tm_layer2.labels, verbose=1, max_iter=300)
    print("finished")
    return tm_layer3
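# A minimal sketch, assuming the early return above is removed, of reading the
# resulting topic hierarchy: when a layer is fit on the previous layer's labels
# (with no word list), get_topics() reports column indices, i.e. which layer-1
# topics are grouped under each layer-2 topic.
for n, topic in enumerate(tm_layer2.get_topics()):
    layer1_topics = [t[0] for t in topic]
    print('Layer-2 topic {} groups layer-1 topics {}'.format(n, layer1_topics))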
def get_topic(anchor_words):
    topic_model = ct.Corex(n_hidden=4, seed=0)
    topic_model.fit(doc_word, words=words, anchors=anchor_words, anchor_strength=1000)
    print(topic_model.get_topics(topic=0, n_words=10))
def __init__(self, text, anchors=anchor_words, n_topic=25, max_features=20000,
             max_iter=200, seed=100, anchor_strength=10):
    """
    Initialize and train the CorEx model

    :param text: A text series of customer service transcripts
    :param n_topic: number of topics
    :param max_features: maximum number of features in the bag-of-words vocabulary
    :param max_iter: maximum number of iterations
    :param seed: random state
    :param anchor_strength: weight given to anchor words
    """
    self.text = text
    self.n_topic = n_topic
    cv = CountVectorizer(stop_words='english', max_features=max_features, binary=True)
    # Document-term matrix
    corpus = cv.fit_transform(text)
    # Vocabulary
    words = list(np.asarray(cv.get_feature_names()))
    # Build and fit the model
    self.topic_model = ct.Corex(n_hidden=n_topic, max_iter=max_iter, seed=seed)
    self.topic_model.fit(corpus, words=words, anchors=anchors, anchor_strength=anchor_strength)
    self.topics = self.topic_model.get_topics()
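# A hypothetical instantiation of the class above (the enclosing class name is
# not shown, so `CorexTopics` is invented for illustration); `transcripts` is
# assumed to be a pandas Series of customer service transcripts.
model = CorexTopics(transcripts,
                    anchors=[['refund', 'billing'], ['cancel', 'subscription']],
                    n_topic=25,
                    anchor_strength=10)
# Each topic is a list of (word, mutual_information) tuples
for n, topic in enumerate(model.topics):
    print(n, [t[0] for t in topic])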
def sample_add(X, sc1, X_sub2, vocab, anchor, strength, enjeux_list, num_average=20):
    """
    Add samples based on the delta between scores.
    Returns True if the F1 score is improved.
    """
    deltamoy = [0, 0, 0, 0]
    for it in range(num_average):
        model2 = ct.Corex(n_hidden=len(enjeux_list))
        model2.fit(X_sub2, words=vocab, anchors=anchor, anchor_strength=strength)
        # Predict and evaluate on all the data
        test = model2.predict(X)
        sc2 = evaluate(test, returnscore=True)
        # Accumulate the delta between evaluations
        deltamoy = vadd(deltamoy, delta(sc1, sc2, returnmoy=True))
    deltamoy = np.array(deltamoy) / num_average
    if deltamoy[3] > 0:
        return (True, sc2)
    else:
        return (False, sc1)
def __init__(self, train_data, n_topics, model_choice='CorEx'):
    '''
    Description: Class constructor

    Parameters:
        - train_data (List[List[Float]]): the output probabilities for documents to
          belong to each topic. This will be the output of CorExModel.transform() for
          the whole corpus, or of CorExModel.predict_proba() for a single entry.
        - n_topics (Integer): the number of topics (or clusters) to produce.
        - model_choice (String): the model to use for the unsupervised classifier.
          Choose between:
            + a hierarchical CorEx model, which takes the high-dimensional output of
              the first model from CorExModel.tune() as input and outputs a
              lower-dimensional multi-label classification array;
            + a simple KMeans (for now), which takes the output of a first
              CorExModel.tune() and builds clusters from each description's vector
              in the topic-membership-probability space.
    '''
    self.train_data = train_data
    if model_choice not in ['CorEx', 'KMeans']:
        raise ValueError("Wrong model choice, please choose between 'CorEx' and 'KMeans'.")
    elif model_choice == 'CorEx':
        self.model = ct.Corex(n_hidden=n_topics)
    elif model_choice == 'KMeans':
        self.model = KMeans(n_clusters=n_topics, random_state=42)
def train_model(self):
    log.info('Running model training...')
    # Train the CorEx topic model with self.n_topic topics
    topic_model = ct.Corex(n_hidden=self.n_topic, words=self.words,
                           max_iter=200, verbose=False, seed=1)
    topic_model.fit(self.doc_words, words=self.words)
    # Save to the class instance
    self.topic_model = topic_model
    if self.print_words:
        self.print_topic_words(topic_model=topic_model)
def __init__(self, config, preprocessor, load=False, seed=True):
    self.model_path = config.paths.save_model_path
    if load:
        with open(self.model_path, "rb") as f:
            self.model = pickle.load(f)
    else:
        self.model = ct.Corex(n_hidden=config.model.num_topics,
                              seed=config.model.random_state)
    self.vocab = preprocessor.vocab
    self.seed_topics = None
    if seed:
        self.seed_topics = preprocessor.seed_topics
def train_model(self):
    """Train a semi-supervised topic model with n topics."""
    log.info('Running model training...')
    # Set anchor words
    self.anchor_dict = set_anchor_words(anchor_path=self.anchor_path)
    anchor_words = list(self.anchor_dict.values())
    # Train the model
    topic_model = ct.Corex(n_hidden=self.n_topic, words=self.words,
                           max_iter=200, verbose=False, seed=1)
    topic_model.fit(self.doc_words, words=self.words,
                    anchors=anchor_words, anchor_strength=self.anchor_strength)
    # Save to the class instance
    self.topic_model = topic_model
    if self.print_words:
        self.print_topic_words(topic_model=topic_model)
def __init__(self):
    self.vectorizer = CountVectorizer(stop_words='english', max_features=20000, binary=True)
    filenames = glob.glob('data/reddit/*_comments_*.json.gz')
    filename = filenames[0]  # HACK: only load the first file, for speed
    input_data: pd.DataFrame = data_source.load_from_file(filename)

    # Each "document" is a text comment
    self.doc_word = self.vectorizer.fit_transform(input_data.text)
    self.doc_word = ss.csr_matrix(self.doc_word)
    sub_name = os.path.basename(filename).split('_')[0]
    print(self.doc_word.shape)  # n_docs x m_words

    # Get the words that label the columns of the document-term matrix
    words = list(np.asarray(self.vectorizer.get_feature_names()))

    topic_model_filename = f'data/models/{sub_name}_topic_model.pkl'
    if os.path.exists(topic_model_filename):
        with open(topic_model_filename, 'rb') as f:
            topic_model = cPickle.load(f)
    else:
        # Train the CorEx topic model with some forum-specific anchor words;
        # n_hidden defines the number of latent (hidden) topics to use.
        # Note: CorEx takes anchors at fit time, not in the constructor.
        topic_model = ct.Corex(n_hidden=25)
        topic_model.fit(
            self.doc_word,
            words=words,
            anchors=[['xmr', 'monero'],
                     ['btc', 'bitcoin', 'satoshi', 'nakamoto'],
                     ['stellar', 'xlm'],
                     ['ltc', 'litecoin'],
                     ['xrp', 'ripple'],
                     ['eth', 'ethereum', 'vitalik'],
                     ['binance', 'coinbase', 'exchange'],
                     ['electrum', 'wallet']],
            anchor_strength=4)
        with open(topic_model_filename, 'wb') as f:
            cPickle.dump(topic_model, f)
        # topic_model.save(topic_model_filename, ensure_compatibility=False)
    self.topic_model = topic_model

    # Print all topics from the model
    topics = topic_model.get_topics()
    for n, topic in enumerate(topics):
        topic_words, _ = zip(*topic)
        print('{}: '.format(n) + ','.join(topic_words))
def __init__(self, corpus, n_topics, stem=False, anchors=None, anchor_strength=None,
             process=False, verbose=False):
    '''
    Description: Class constructor

    Parameters:
        - corpus (List[String]): the list of raw descriptions.
        - n_topics (Integer): the default number of topics you're looking for.
          This parameter can be changed with the @tune() method.
        - stem (Boolean): whether to use stemming in the processing step.
        - anchors (List[String] | List[List[String]]): chosen anchors for the CorEx
          models. Anchors should be specific to destinations, or at least to
          clusters of destinations.
        - anchor_strength (Integer): the weight given to anchors.
        - process (Boolean): whether the corpus needs to be processed with the
          @process method.
        - verbose (Boolean): whether to print information about the model while it
          is training.
    '''
    self.corpus = corpus
    self.n_topics = n_topics
    self.model = ct.Corex(n_hidden=self.n_topics, anchors=anchors,
                          anchor_strength=anchor_strength, verbose=verbose, seed=42)
    self.is_fitted = False
    if process:
        self.train_data = self.process_corpus(stem=stem)
    else:
        self.train_data = self.corpus
    self.vectorizer = TfidfVectorizer(max_df=.7, min_df=.01, max_features=None,
                                      ngram_range=(1, 2), norm=None, binary=True,
                                      use_idf=True, sublinear_tf=False)
def train(self, df, n_hidden=8, anchors=[["oil", "gas"]], anchor_strength=3):
    print('anchor_strength', anchor_strength, 'n_hidden', n_hidden)
    vectorizer = TfidfVectorizer(max_df=.5, min_df=10, max_features=None,
                                 ngram_range=(1, 2), norm=None, binary=True,
                                 use_idf=False, sublinear_tf=False,
                                 stop_words='english')
    vectorizer = vectorizer.fit(df['body'])
    tfidf = vectorizer.transform(df['body'])
    vocab = vectorizer.get_feature_names()

    # Anchors designed to nudge the model towards measuring specific genres;
    # drop any anchor words that are not in the vocabulary.
    anchors = [[a for a in topic if a in vocab] for topic in anchors]

    model = ct.Corex(n_hidden=n_hidden, seed=42)
    model = model.fit(
        tfidf,
        words=vocab,
        anchors=anchors,  # Pass the anchors in here
        anchor_strength=anchor_strength  # Tell the model how much it should rely on the anchors
    )

    topic_hash = {}
    for i, topic_ngrams in enumerate(model.get_topics(n_words=10)):
        topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
        topic_hash[i] = topic_ngrams

    self._vectorizer = vectorizer
    self._model = model
    self._topic_hash = topic_hash
    self._oil_and_gas_topic_num = [
        topic_num for topic_num, topic_ngrams in self._topic_hash.items()
        if ('oil' in topic_ngrams) or ('gas' in topic_ngrams)
    ]
def get_corex_topics(num_topics_list, docs, features, print_flag=False):
    """Return the total correlation (TC) for each candidate number of topics, for model selection."""
    total_corr = []
    for i in num_topics_list:
        topic_model = ct.Corex(n_hidden=i, seed=10)
        topic_model.fit(docs, words=features)
        total_corr.append(topic_model.tc)
        if print_flag:
            topics = topic_model.get_topics()
            print('Num topics: ', i)
            for topic_n, topic in enumerate(topics):
                words, mis = zip(*topic)
                topic_str = str(topic_n + 1) + ': ' + ', '.join(words)
                print(topic_str)
            print('')
    return total_corr
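# A minimal model-selection sketch using get_corex_topics; `doc_word` and `vocab`
# are assumed to be an existing document-term matrix and vocabulary, and the
# matplotlib import is added here purely for illustration.
import matplotlib.pyplot as plt

candidate_ns = [5, 10, 15, 20, 25]
tcs = get_corex_topics(candidate_ns, doc_word, vocab)

# Total correlation (topic_model.tc) generally grows with n_hidden; a common
# heuristic is to pick the point where extra topics stop adding much TC.
plt.plot(candidate_ns, tcs, marker='o')
plt.xlabel('Number of topics (n_hidden)')
plt.ylabel('Total correlation (TC)')
plt.show()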
def train_Corex(self, n_topics, data_df):
    self.data_df = data_df
    text_corpus = self.data_df[self.sentence_col].values.tolist()
    print(len(text_corpus))
    if len(text_corpus) == 0:
        raise PipelineError('Please provide a text corpus',
                            'This object provides advanced CorEx vectors.')
    if self.countvectorizer:
        doc_word = self.countvectorizer.transform(text_corpus)
    else:
        countvectorizer_obj = ClassicVectorizationTrain(countvectorizer=1)
        self.countvectorizer = countvectorizer_obj.get_countVectorizer(text_corpus=text_corpus)
        doc_word = self.countvectorizer.transform(text_corpus)
    doc_word = csr_matrix(doc_word)
    words = list(np.asarray(self.countvectorizer.get_feature_names()))

    # Drop purely numeric tokens from the matrix and the vocabulary
    not_digit_inds = [ind for ind, word in enumerate(words) if not word.isdigit()]
    doc_word = doc_word[:, not_digit_inds]
    words = [word for ind, word in enumerate(words) if not word.isdigit()]

    # Train the CorEx topic model with n_topics topics
    self.corex = ct.Corex(n_hidden=n_topics, words=words, max_iter=200, verbose=False, seed=1)
    self.corex.fit(doc_word, words=words)
    return dict(corex_model=self.corex, countvectorizer=self.countvectorizer)
def fit_topics(
    dataset_label,
    doc_vectors,
    feature_names,
    titles,
    n_topics,
    anchors,
    anchor_strength=10,
    max_iter=25,
):
    """Apply CorEx topic modelling to a set of document vectors, and save the
    model and output to disk.

    Args:
        dataset_label (str): Name of this dataset, for labelling the output files.
        doc_vectors (np.array): Count (or equivalent) vector representation of the documents.
        feature_names (list): Name of each feature in doc_vectors.
        titles (list): Name of each document in doc_vectors.
        n_topics (int): Number of topics for CorEx to generate.
        anchors (list of list): CorEx anchor terms.
        anchor_strength (int, optional): CorEx anchor strength multiplier. Defaults to 10.
        max_iter (int, optional): Number of model iterations. Defaults to 25.

    Returns:
        topic_model: Trained CorEx topic model.
    """
    topic_model = ct.Corex(max_iter=max_iter, n_hidden=n_topics)
    topic_model.fit(
        X=doc_vectors,
        words=feature_names,
        docs=titles,
        anchors=anchors,
        anchor_strength=anchor_strength,
    )
    # Use CorEx tools to write the visualisation data to the local directory
    label = make_model_label(dataset_label, n_topics, max_iter)
    vt.vis_rep(topic_model, column_label=feature_names, prefix=label)
    return topic_model
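# A usage sketch for fit_topics under assumed inputs: `texts` is a list of raw
# documents, and make_model_label / vt are provided by the surrounding module;
# the anchor lists here are invented for illustration.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(stop_words='english', binary=True)
doc_vectors = vectorizer.fit_transform(texts)
feature_names = vectorizer.get_feature_names()

model = fit_topics(
    dataset_label='demo',
    doc_vectors=doc_vectors,
    feature_names=feature_names,
    titles=['doc_{}'.format(i) for i in range(doc_vectors.shape[0])],
    n_topics=20,
    anchors=[['oil', 'gas'], ['solar', 'wind']],
)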
def params_search(fit_text, num_max_df, num_min_df, num_topics):
    '''Fit a CorEx model for one (min_df, max_df, n_topics) combination and
    return a summary string with the parameters and the model's total correlation.'''
    # Model parameters
    vectorizer = TfidfVectorizer(strip_accents='ascii', encoding='unicode',
                                 max_df=num_max_df, min_df=num_min_df,
                                 max_features=None, ngram_range=(1, 2), norm=None,
                                 binary=True, use_idf=False, sublinear_tf=False)
    model = ct.Corex(n_hidden=num_topics, seed=42)

    # Vectorizer
    vect_fit = vectorizer.fit(fit_text)
    tfidf = vectorizer.transform(fit_text)
    vocab = vect_fit.get_feature_names()

    model = model.fit(tfidf, words=vocab)
    vt.vis_rep(model, column_label=vocab,
               prefix='./corex_models/{}-topic-model'.format(num_topics))
    model_tc = model.tc

    vect_print = 'Vect params: min_df={}, max_df={}'.format(num_min_df, num_max_df)
    corex_print = 'CorEx params: n_t={}, tc={}'.format(num_topics, model_tc)
    return vect_print + ' ' + corex_print
# print(params_search(all_tweets['text'], x1, num_min_df, num_topics))

# Model parameters
vectorizer = TfidfVectorizer(strip_accents='ascii', encoding='unicode',
                             max_df=num_max_df, min_df=num_min_df,
                             max_features=None, ngram_range=(1, 2), norm=None,
                             binary=True, use_idf=False, sublinear_tf=False,
                             stop_words='english')
model = ct.Corex(n_hidden=num_topics, seed=42)

# Anchors: a mix of single seed words and multi-word topic seeds
anchors = [
    'trump', ['win', 'giveaway'], 'vote',
    ['sanders', 'warren', 'biden', 'democratic', 'buttigieg'],
    ['kobe', 'bryant'], 'book', ['super', 'bowl'], ['climate', 'change'],
    ['podcast', 'episode'], ['mental', 'health'], ['health', 'care'],
    'coronavirus', 'australia', 'jesus', ['sexually', 'assaulted'],
    ['social', 'justice'], 'brexit', ['black', 'history'],
    ['conspiracy', 'theories'], ['trans', 'people'], 'song',
    ['movie', 'film'], 'food', 'drink'
]

do_vect = True
if do_vect:
    print('Vectorizing tweets...', end='')
def run_CorEx(documents, anchorList, n_topics, n_words_per_topic):
    """
    Performs CorEx on corpus documents using anchorList.
    Returns topics as strings in topicList.
    """
    # CorEx uses a TF-IDF vectorization
    vectorizer = TfidfVectorizer(max_df=.5,
                                 min_df=10,
                                 max_features=None,
                                 ## ngram_range=(1, 2), for bi-grams
                                 ## ngram_range=(1, 3), for bi-grams and tri-grams
                                 ngram_range=(1, 1),  # no bi-grams or tri-grams
                                 norm=None,
                                 binary=True,
                                 use_idf=False,
                                 sublinear_tf=False)

    # Fit the chat corpus to the TF-IDF vectorization
    vectorizer = vectorizer.fit(documents)
    tfidf = vectorizer.transform(documents)
    vocab = vectorizer.get_feature_names()

    # Apply CorEx with no anchors, for comparison
    model = ct.Corex(n_hidden=n_topics, seed=42)  # n_hidden specifies the number of topics
    model = model.fit(tfidf, words=vocab)

    # Display, and write to file, the results of CorEx with no anchors
    fileName = "CorEx_no_anchors_" + str(n_topics) + "topics_" + str(n_words_per_topic) + "words.txt"
    with open(fileName, 'w') as outputFile:
        outputFile.write("File: " + fileName + "\n\n")
        print("\nCorEx Topics with no anchors:")
        for i, topic_ngrams in enumerate(model.get_topics(n_words=n_words_per_topic)):
            topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
            print("Topic #{}: {}".format(i + 1, ", ".join(topic_ngrams)))
            outputFile.write(" ".join(topic_ngrams) + "\n")

    # Remove anchor words that are not in the chat corpus
    anchors = [[a for a in topic if a in vocab] for topic in anchorList]

    model = ct.Corex(n_hidden=n_topics, seed=42)
    model = model.fit(
        tfidf,
        words=vocab,
        anchors=anchors,  # Pass the anchors in here
        anchor_strength=3  # Tell the model how much it should rely on the anchors
    )

    # Display, and write to file, the results of CorEx with anchors
    fileName = ("CorEx_anchors_" + str(len(anchors)) + "_" + str(n_topics)
                + "topics_" + str(n_words_per_topic) + "words.txt")
    topicList = []
    with open(fileName, 'w') as outputFile:
        outputFile.write("File: " + fileName + "\n\n")
        print("\nCorEx Topics with anchors:")
        for i, topic_ngrams in enumerate(model.get_topics(n_words=n_words_per_topic)):
            topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
            topicList.append(" ".join(topic_ngrams))
            print("Topic #{}: {}".format(i + 1, ", ".join(topic_ngrams)))
            outputFile.write(" ".join(topic_ngrams) + "\n")

    return topicList
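# A hypothetical call to run_CorEx; `chat_messages` and the anchor lists are
# invented for illustration. Each returned topic is a space-joined word string.
topicList = run_CorEx(documents=chat_messages,
                      anchorList=[['refund', 'return'], ['shipping', 'delivery']],
                      n_topics=8,
                      n_words_per_topic=10)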
#%%
obj2 = instance.optimize_selectivity(bnds=(0.1, 0.9))
prediction = instance.predict(instance.X, selectivity=obj2.x)
sc2 = evaluate(docs_df, prediction, returnscore=True)
#%%
prediction = instance.predict(instance.X)
sc1 = evaluate(docs_df, prediction, returnscore=True)
# delta(sc1, sc2, returnmoy=True)
#%%
# Use this model as the initial model (an approximate ground truth
# from which to refine everything)
k = 2
topic_model = ct.Corex(n_hidden=len(enjeux_list))
topic_model.fit(instance.X,
                words=instance.vocab,
                anchors=instance.thesau_list,
                anchor_strength=k)
mat = topic_model.labels
sc2 = evaluate(docs_df, mat, returnscore=True)
delta(sc1, sc2, returnmoy=True)
#%%
# Try to optimize using stratified sampling
y_true, X_sub, y_pred = separate(instance.docs,
                                 instance.X,
                                 prediction=instance.predict(instance.X))
from sklearn.metrics import label_ranking_loss
def predict_final():
    # Collect up to three user-supplied anchor words per topic from the query string
    anchors = []
    for i in range(1, 6):
        topic = [request.args.get(f'topic{i}{j}') for j in range(3)]
        anchors.append([s.lower() for s in topic if s])

    with open('stopwords_final', 'rb') as infile:
        stopwords = pickle.load(infile)

    df = pd.read_pickle('cm_19_06')
    df.drop_duplicates(subset='body', keep=False, inplace=True)
    df = df[~df['author'].isin(['Ilackfocus', '[deleted]', 'AutoModerator'])]

    token_pattern_no_number = u'(?ui)\\b\\w*[a-zA-Z]+\\w*\\b'
    vectorizer_corex = CountVectorizer(stop_words=stopwords, binary=True,
                                       token_pattern=token_pattern_no_number,
                                       ngram_range=(1, 2), max_df=0.5, min_df=2,
                                       max_features=20000)
    c_word = vectorizer_corex.fit_transform(df['body'])
    vocab = vectorizer_corex.get_feature_names()

    ct_model = ct.Corex(n_hidden=5, seed=42)
    c_model_fitted = ct_model.fit(c_word, words=vocab, anchors=anchors, anchor_strength=4)

    # Share of all comments assigned to each topic
    topic_count = np.asarray(c_model_fitted.labels).sum(axis=0)
    topic_dist = [round((count / len(c_model_fitted.labels)) * 100, 2) for count in topic_count]

    # Top words per topic
    wd = []
    for topic_ngrams in c_model_fitted.get_topics(n_words=10):
        wd.append([ngram[0] for ngram in topic_ngrams if ngram[1] > 0])

    top_df = pd.DataFrame(data={'Topic': [1, 2, 3, 4, 5], '% of all comments': topic_dist})
    top_df['Keywords'] = pd.Series(anchors)
    top_df['Top Words'] = pd.Series(wd)
    top_df.set_index('Topic', inplace=True)

    topic_cols = ['Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']
    for n, col in enumerate(topic_cols):
        df[col] = c_model_fitted.labels[:, n]

    # Sample five comments per topic
    display_cols = ['created_utc', 'author', 'score', 'body'] + topic_cols
    samples = [df[df[col] == True][display_cols].sample(5) for col in topic_cols]
    comment_df = pd.concat(samples, ignore_index=True)
    comment_df['date'] = pd.to_datetime(comment_df['created_utc'], unit='s').dt.strftime('%m/%d/%Y')
    comment_df = comment_df[['date', 'author', 'score', 'body'] + topic_cols]

    return flask.render_template('predict_final.html',
                                 table1=[top_df.to_html(table_id='cm')],
                                 table2=[comment_df.to_html(table_id='sc')])
# Check that each topic word exists among the keyword labels
for topic in topic_list:
    anchor = []
    topic_words = topic.split('\n')
    for topic_word in topic_words:
        if topic_word.strip() not in features_df['KeywordLabel'].tolist():
            pdb.set_trace()  # drop into the debugger on an unknown keyword
        else:
            anchor.append(topic_word.strip())
    anchors.append(anchor)

# Sparse matrices are also supported
X = ss.csr_matrix(data)

# Train the CorEx topic model; n_hidden defines the number of latent (hidden) topics to use
topic_model = ct.Corex(n_hidden=len(anchors))
topic_model.fit(X,
                docs=segment_df.updated_id.tolist(),
                words=features,
                anchors=anchors,
                anchor_strength=10)

topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    words, mis = zip(*topic)
    topic_str = str(topic_n) + ': ' + ','.join(words)
    print(topic_str)

top_docs = topic_model.get_top_docs()
if Case == '1':
    cut_case = input('1.jieba or 2.monpa')
    words_list, Label = cut(rows, stop_word_list, cut_case)
    cPickle.dump(words_list, open('words_list_' + cut_case + '.pkl', 'wb'))
elif Case == '2':
    words_list = cPickle.load(open('words_list_2.pkl', 'rb'))

anchor = get_anchor()
for i in range(100):
    # Originally only tokens of two or more characters were used;
    # this pattern also allows single-character tokens.
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    X = vectorizer.fit_transform(words_list)
    words = list(np.asarray(vectorizer.get_feature_names()))
    topic_model = ct.Corex(n_hidden=N_topic, words=words, seed=3)
    # topic_model = cPickle.load(open('model_monpa.pkl', 'rb'))
    if i > 0:
        for j in range(N_topic):
            anchor_words = input(
                "input topic_%s's anchor words (split with space):\n" % str(j + 1))
            t = anchor_words.split()
            for word in t:
                anchor[j].append(word)
            print('anchors added this round:', t)
        print(anchor)
    topic_model.fit(X, words=words, anchors=anchor, anchor_strength=4)  # anchors are currently set manually
    # cPickle.dump(topic_model, open('model.pkl', 'wb'))
acsdata = read_articles("txt-files/research")
data_words = []
for file in acsdata:
    data_words.append(cleanText(file))
data = [' '.join(words) for words in data_words]

data_train, data_test, idx_train, idx_test = train_test_split(
    data, range(len(data)), test_size=0.2, random_state=0)

id2word = corpora.Dictionary(data_words)
texts = data_words
corpus = [id2word.doc2bow(text) for text in texts]

vectorizer = CountVectorizer(stop_words='english', binary=True)
doc_word = vectorizer.fit_transform(data_train)
doc_word = ss.csr_matrix(doc_word)
words = list(np.asarray(vectorizer.get_feature_names()))

topic_model = ct.Corex(n_hidden=25, words=words, max_iter=200, verbose=False)  # , seed=1
topic_model.fit(doc_word, words=words)

coherence = c_v_topic_coherence(topic_model, corpus=corpus, texts=texts,
                                dictionary=id2word, topn=20)
print('Coherence Score: ', coherence)

topics = topic_model.get_topics()
for n, topic in enumerate(topics):
    topic_words, _ = zip(*topic)
    print('{}: '.format(n) + ', '.join(topic_words))
def fit(self, df=None, **kwargs):
    """
    Fits a model using the method specified when initializing the ``TopicModel``. Details on model-specific \
    parameters are below:

    **sklearn_lda**

    Fits a model using :py:class:`sklearn.decomposition.LatentDirichletAllocation`. For more information on \
    available parameters, please refer to the official documentation: \
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param alpha: Represents document-topic density. When values are higher, documents will be comprised of more \
        topics; when values are lower, documents will be primarily comprised of only a few topics. This parameter \
        is used instead of the ``doc_topic_prior`` sklearn parameter, and will be passed along to sklearn using \
        the formula: ``doc_topic_prior = alpha / num_topics``
    :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. This parameter is used instead of \
        the ``topic_word_prior`` sklearn parameter, and will be passed along to sklearn using the formula: \
        ``topic_word_prior = beta / num_topics``
    :param learning_decay: See sklearn documentation.
    :param learning_offset: See sklearn documentation.
    :param learning_method: See sklearn documentation.
    :param max_iter: See sklearn documentation.
    :param batch_size: See sklearn documentation.
    :param verbose: See sklearn documentation.

    **sklearn_nmf**

    Fits a model using :py:class:`sklearn.decomposition.NMF`. For more information on available parameters, \
    please refer to the official documentation: \
    https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param alpha: See sklearn documentation.
    :param l1_ratio: See sklearn documentation.
    :param tol: See sklearn documentation.
    :param max_iter: See sklearn documentation.
    :param shuffle: See sklearn documentation.

    **gensim_lda**

    Fits an LDA model using :py:class:`gensim.models.LdaModel` or \
    :py:class:`gensim.models.ldamulticore.LdaMulticore`. When ``use_multicore`` is set to True, the multicore \
    implementation will be used; otherwise the standard LDA implementation will be used. For more information \
    on available parameters, please refer to the official documentation below:

        - use_multicore=True: https://radimrehurek.com/gensim/models/ldamulticore.html
        - use_multicore=False: https://radimrehurek.com/gensim/models/ldamodel.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param alpha: Represents document-topic density. When values are higher, documents will be comprised of \
        more topics; when values are lower, documents will be primarily comprised of only a few topics. Gensim \
        options are a bit different than sklearn's; refer to the documentation for the accepted values.
    :param beta: Represents topic-word density. When values are higher, topics will be comprised of more words; \
        when values are lower, only a few words will be loaded onto each topic. Gensim options are a bit \
        different than sklearn's; refer to the documentation for the accepted values. Gensim calls this \
        parameter ``eta``; we renamed it to be consistent with the sklearn implementations.
    :param chunksize: See gensim documentation.
    :param passes: See gensim documentation.
    :param decay: See gensim documentation.
    :param offset: See gensim documentation.
    :param workers: Number of cores to use (if using multicore)
    :param use_multicore: Whether or not to use multicore

    **gensim_hdp**

    Fits an HDP model using the gensim implementation. Unlike LDA and NMF, HDP attempts to auto-detect the \
    correct number of topics. In practice, it actually fits ``T`` topics (default is 150), but many are \
    extremely rare or occur in only a very few documents. To identify the topics that are actually useful, \
    this function passes the original :py:class:`pandas.DataFrame` through the trained model after fitting, \
    and identifies topics that compose at least 1% of a document in at least 1% of all documents in the \
    corpus. In other words, topics are thrown out if the number of documents in which they appear at a rate \
    of at least 1% is fewer than 1% of the total number of documents. Subsequent use of the model will only \
    make use of topics that meet this threshold. For more information on available parameters, please refer \
    to the official documentation: https://radimrehurek.com/gensim/models/hdpmodel.html

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param max_chunks: See gensim documentation.
    :param max_time: See gensim documentation.
    :param chunksize: See gensim documentation.
    :param kappa: See gensim documentation.
    :param tau: See gensim documentation.
    :param T: See gensim documentation.
    :param K: See gensim documentation.
    :param alpha: See gensim documentation.
    :param beta: See gensim documentation.
    :param gamma: See gensim documentation.
    :param scale: See gensim documentation.
    :param var_converge: See gensim documentation.

    **corex**

    Fits a CorEx topic model. Anchors can be provided in the form of a list of lists, with each item \
    corresponding to a set of words to be used to seed a topic. For example:

    .. code-block:: python

        anchors=[
            ['cat', 'kitten'],
            ['dog', 'puppy']
        ]

    The list of anchors cannot be longer than the specified number of topics, and all of the words must \
    exist in the vocabulary. The ``anchor_strength`` parameter determines the degree to which the model is \
    able to override the suggested words based on the data; higher values are a way of "insisting" more \
    strongly that the model keep the provided words together in a single topic. For more information on \
    available parameters, please refer to the official documentation: \
    https://github.com/gregversteeg/corex_topic

    :param df: The :py:class:`pandas.DataFrame` to train the model on (must contain ``self.text_col``)
    :param anchors: A list of lists that contain words that the model should try to group together into topics
    :param anchor_strength: The degree to which the provided anchors should be preserved regardless of the data
    """
    fit_params = self.get_fit_params(**kwargs)
    if self.method in ["sklearn_lda", "sklearn_nmf"]:
        if self.method == "sklearn_lda":
            self.model = LatentDirichletAllocation(n_components=self.num_topics, **fit_params)
        if self.method == "sklearn_nmf":
            self.model = NMF(n_components=self.num_topics, **fit_params)
        if is_not_null(df):
            features = self.get_features(df)
        else:
            features = self.train_features
        self.model.fit(features)
    elif self.method in ["gensim_lda", "gensim_hdp"]:
        vocab_dict = dict([(i, s) for i, s in enumerate(self.ngrams)])
        if is_not_null(df):
            features = self.get_features(df, keep_sparse=True)
        else:
            features = self.train_features
        matrix = gensim.matutils.Sparse2Corpus(features, documents_columns=False)
        if self.method == "gensim_lda":
            fit_params["num_topics"] = self.num_topics
            fit_params["id2word"] = vocab_dict
            if fit_params["use_multicore"]:
                model_class = gensim.models.ldamulticore.LdaMulticore
            else:
                model_class = gensim.models.LdaModel
                del fit_params["workers"]
            del fit_params["use_multicore"]
            self.model = model_class(**fit_params)
            self.model.update(matrix)
        elif self.method == "gensim_hdp":
            model_class = gensim.models.hdpmodel.HdpModel
            self.model = model_class(matrix, vocab_dict, **fit_params)
            # Keep only topics that compose >= 1% of a document in >= 1% of documents
            doc_topics = self.get_document_topics(self.df)
            topics = ((doc_topics >= 0.01).astype(int).mean() >= 0.01).astype(int)
            self.topic_ids = [
                int(col.split("_")[-1])
                for col in topics[topics == 1].index
                if col.startswith("topic_")
            ]
            self.num_topics = len(self.topic_ids)
    elif self.method == "corex":
        if is_not_null(df):
            features = self.get_features(df, keep_sparse=True)
        else:
            features = self.get_features(self.train_df, keep_sparse=True)
        self.model = corextopic.Corex(n_hidden=self.num_topics)
        self.model.fit(features, words=self.ngrams, **fit_params)
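# A brief usage sketch for the corex method documented above; `tm` is assumed to
# be an already-initialized TopicModel with method="corex". The anchors and
# anchor_strength keyword arguments are forwarded to the underlying CorEx fit.
tm.fit(anchors=[["cat", "kitten"], ["dog", "puppy"]], anchor_strength=5)
# Inspect what the anchored model learned: each topic is a list of
# (word, mutual_information) tuples.
for n, topic in enumerate(tm.model.get_topics()):
    print(n, [t[0] for t in topic])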
df_top = df_product[df_p1_top_mask]
df_p1_bot_mask = (df_product['vader'] < -0.25) & (df_product['nps'] == -1)
df_bot = df_product[df_p1_bot_mask]

# In[24]:

cor_vectorizer = CountVectorizer(max_features=20000,
                                 ngram_range=(1, 2),
                                 binary=True,
                                 token_pattern="\\b[a-z][a-z]+\\b",
                                 stop_words='english')
cor_doc_word_top = cor_vectorizer.fit_transform(df_top['nltk_terms'])
cor_words = list(np.asarray(cor_vectorizer.get_feature_names()))
topic_model_top = ct.Corex(n_hidden=6, words=cor_words, seed=1)
topic_model_top.fit(cor_doc_word_top, words=cor_words, docs=df_top.nltk_terms)

# Repeat the process for the bottom reviews (refitting the vectorizer changes the vocabulary):
cor_doc_word_bot = cor_vectorizer.fit_transform(df_bot['nltk_terms'])
cor_words = list(np.asarray(cor_vectorizer.get_feature_names()))  # must be repeated
topic_model_bot = ct.Corex(n_hidden=6, words=cor_words, seed=1)
topic_model_bot.fit(cor_doc_word_bot, words=cor_words, docs=df_bot.nltk_terms)

# In[25]:

# Print all topics from the top topic model:
topics = topic_model_top.get_topics()
import multiprocessing

cores = multiprocessing.cpu_count()

overall_list = list(itertools.chain.from_iterable(text_list))
model = Word2Vec([overall_list], min_count=1, iter=3, sg=1, hs=1, negative=2)
# workers=cores, vector_size=100, window=8; my current computer has 8 cores
# min_count: ignores all words with total frequency lower than this
# sg=0 is CBOW, sg=1 is Skip-gram
# hs=1 employs hierarchical softmax
# negative > 0 employs negative sampling: 2-5 for large datasets, 5-20 for small datasets

# Train the CorEx topic model; n_hidden defines the number of latent (hidden) topics to use
topic_model = ct.Corex(n_hidden=10, seed=seed)
topic_model.fit(X, words=all_vocabs, docs=topics)

top_docs = topic_model.get_top_docs()
print('\n')
print('COREX TOP DOCUMENTS:')
for topic_n, topic_docs in enumerate(top_docs):
    docs, probs = zip(*topic_docs)
    topic_str = 'Topic ' + str(topic_n + 1) + ': ' + str(docs)
    print(topic_str)
print('\n')
print('xxx')
print('\n')
# Remove the entries that were dropped from the document-term matrix
for i in sorted(entries_for_remove, reverse=True):
    del chunked_bows[i]

# Create a list of the features (features_list) so that they can be reused
features_list = [element[1] for element in gensim_dic.iteritems()]

# Transform the document-term matrix into a binary matrix
doc_word = np.where(matrix_documents > 0, 1, 0)

print("\n")
print("The following topics were extracted from the Anglo-Saxon Chronicle")
print("\n")

# Run CorEx topic modelling
topic_model = ct.Corex(n_hidden=20, max_iter=200, verbose=False, seed=8)
topic_model.fit(np.matrix(doc_word), words=features_list)

# The document-topic matrix is available as topic_model.log_p_y_given_x

# Print the key topics
topics = topic_model.get_topics()
topics_to_print = []
for n, topic in enumerate(topics):
    topic_words, _ = zip(*topic)
    topics_words_values = []
    for element in topic:
        topics_words_values.append(element[0] + ' (' + str(np.round(element[1], decimals=3)) + ')')
    topics_to_print.append(','.join(topics_words_values))
    print('{}: '.format(n) + ','.join(topic_words))
#%%
model.fit(X, seed_topics=seed_topics, seed_confidence=0.9)

#%%
n_top_words = 20
topic_word = model.topic_word_
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

# %%
import numpy as np
import scipy.sparse as ss
from corextopic import corextopic as ct

# Train the CorEx topic model; n_hidden defines the number of latent (hidden) topics to use
topic_model = ct.Corex(n_hidden=len(enjeux_list))
topic_model.fit(X, words=vocab)

# %%
topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    # w: word, mi: mutual information, s: sign
    topic = [(w, mi, s) if s > 0 else ('~' + w, mi, s) for w, mi, s in topic]
    # Unpack the info about the topic
    words, mis, signs = zip(*topic)
    # Print the topic
    topic_str = str(topic_n + 1) + ': ' + ', '.join(words)
    print('\n' + topic_str)

# %%
import scipy.sparse as ss
from corextopic import corextopic as ct
def run():
    s3_path_in = os.environ['BATCHPAR_s3_path_in']
    n_hidden = int(literal_eval(os.environ['BATCHPAR_n_hidden']))

    # Load and shape the data
    s3 = boto3.resource('s3')
    s3_obj_in = s3.Object(*parse_s3_path(s3_path_in))
    data = json.load(s3_obj_in.get()['Body'])

    # Pack the data into a sparse CSR matrix
    ids = []       # Index of each row
    indptr = [0]   # Cumulative count of non-null entries, row by row
    indices = []   # Column position of each non-null entry
    counts = []    # Term counts/weights, one per entry
    vocab = {}     # {term: position} lookup
    for row in data:
        ids.append(row.pop('id'))
        for term, count in row.items():
            idx = vocab.setdefault(term, len(vocab))
            indices.append(idx)
            counts.append(count)
        indptr.append(len(indices))
    X = csr_matrix((counts, indices, indptr), dtype=int)
    _vocab = {v: k for k, v in vocab.items()}  # {position: term} lookup

    # Fit the model
    topic_model = ct.Corex(n_hidden=n_hidden)
    topic_model.fit(X)
    topics = topic_model.get_topics()

    # Generate topic names
    topic_names = {
        f'topic_{itop}': [_vocab[idx] for idx, weight in topic]
        for itop, topic in enumerate(topics)
    }

    # Calculate topic weights as sum(bool(term in doc) * {term_weight})
    rows = [{
        f'topic_{itop}': sum(row.getcol(idx).toarray()[0][0] * weight
                             for idx, weight in topic)
        for itop, topic in enumerate(topics)
    } for row in X]

    # Zip the row indexes back in, and ignore small weights
    rows = [
        dict(id=id, **{k: v for k, v in row.items() if v > WEIGHT_THRESHOLD})
        for id, row in zip(ids, rows)
    ]

    # Curate the output
    output = {
        'loss': topic_model.tc,
        'data': {
            'topic_names': topic_names,
            'rows': rows
        }
    }

    # Mark the task as done and save the data
    if "BATCHPAR_outinfo" in os.environ:
        s3_path_out = os.environ["BATCHPAR_outinfo"]
        s3 = boto3.resource('s3')
        s3_obj = s3.Object(*parse_s3_path(s3_path_out))
        s3_obj.put(Body=json.dumps(output))
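# A toy illustration of the CSR packing performed in run() above; the two
# documents and their counts are made up purely for illustration.
from scipy.sparse import csr_matrix

toy_docs = [{'id': 'a', 'cat': 2, 'dog': 1},
            {'id': 'b', 'dog': 3}]
# After the packing loop this yields: vocab = {'cat': 0, 'dog': 1},
# counts = [2, 1, 3], indices = [0, 1, 1], indptr = [0, 2, 3]
X = csr_matrix(([2, 1, 3], [0, 1, 1], [0, 2, 3]), dtype=int)
print(X.toarray())  # [[2 1]
                    #  [0 3]]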
    'segment_keyword_matrix_merged_birkenau.txt', dtype=int)
features_df = pd.read_csv(input_directory + 'keyword_index_merged_segments_birkenau.csv')
segment_df = pd.read_csv(input_directory + 'segment_index_merged_birkenau.csv')
features = features_df['KeywordLabel'].values.tolist()
node_filters = constants.output_data_filtered_nodes + "node_filter_1_output.json"

# Sparse matrices are also supported
X = ss.csr_matrix(data)

# Word labels for each column, and document labels for each row, can be provided to the model
# anchors = ['camp adaptation methods']

# Train the CorEx topic model; n_hidden defines the number of latent (hidden) topics to use
topic_model = ct.Corex(n_hidden=18)
topic_model.fit(X, docs=segment_df.updated_id.tolist(), words=features)

topics = topic_model.get_topics()
for topic_n, topic in enumerate(topics):
    words, mis = zip(*topic)
    topic_str = str(topic_n + 1) + ': ' + ','.join(words)
    print(topic_str)

top_docs = topic_model.get_top_docs()
for topic_n, topic_docs in enumerate(top_docs):
    docs, probs = zip(*topic_docs)
    docs = [str(element) for element in docs]
    topic_str = str(topic_n + 1) + ': ' + ','.join(docs)
    print(topic_str)
vectorizer = TfidfVectorizer(max_df=.5,
                             min_df=5,
                             max_features=None,
                             ngram_range=(1, 2),
                             norm=None,
                             binary=True,
                             use_idf=True,
                             sublinear_tf=False)
vectorizer = vectorizer.fit(data_df['lemma'])
tfidf = vectorizer.transform(data_df['lemma'])
vocab = vectorizer.get_feature_names()

N_TOPICS = 10
model = ct.Corex(n_hidden=N_TOPICS, seed=42)
model = model.fit(tfidf, words=vocab)

for i, topic_ngrams in enumerate(model.get_topics(n_words=20)):
    topic_ngrams = [ngram[0] for ngram in topic_ngrams if ngram[1] > 0]
    st.write("Topic #{}: {}".format(i + 1, ", ".join(topic_ngrams)))

import scattertext as scatter_text

st.header("Scatter Text")

def get_scattertext_corpus(df, dep_data_col, group1_name, group2_name, lang="en"):