                labelleft='off', left='off', right='off')

digits = load_digits()
images = digits['images']
images = [image.reshape((1, -1)) for image in images]
images = np.concatenate(tuple(images), axis=0)

topicsRange = [(i + 1) * 5 for i in range(10)]
print(topicsRange)

ldaModels = [
    LDA(n_components=numTopics, learning_method='batch')
    for numTopics in topicsRange
]
for lda in ldaModels:
    lda.fit(images)

# Lower perplexity corresponds to higher log-likelihood, hence argmin below.
scores = [lda.perplexity(images) for lda in ldaModels]
plt.plot(topicsRange, scores)
plt.show()

maxLogLikelihoodTopicsNumber = np.argmin(scores)
plotNumbers = [4, 14, 24, 34, 44, 49]
if maxLogLikelihoodTopicsNumber not in plotNumbers:
def train_model(self):
    get_messages_sql = s.sql.text(
        """
        SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name,
               i.issue_id thread_id, m.msg_text, i.issue_title thread_title, m.msg_id
        FROM augur_data.repo r, augur_data.issues i,
             augur_data.message m, augur_data.issue_message_ref imr
        WHERE r.repo_id = i.repo_id
          AND imr.issue_id = i.issue_id
          AND imr.msg_id = m.msg_id
        UNION
        SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name,
               pr.pull_request_id thread_id, m.msg_text, pr.pr_src_title thread_title, m.msg_id
        FROM augur_data.repo r, augur_data.pull_requests pr,
             augur_data.message m, augur_data.pull_request_message_ref prmr
        WHERE r.repo_id = pr.repo_id
          AND prmr.pull_request_id = pr.pull_request_id
          AND prmr.msg_id = m.msg_id
        """
    )
    msg_df_all = pd.read_sql(get_messages_sql, self.db, params={})

    # Select only highly active repos (more than 500 messages).
    msg_df_all = msg_df_all.groupby("repo_id").filter(lambda x: len(x) > 500)

    # Combine all the messages in a repository to form a single document.
    msg_df = msg_df_all.groupby('repo_id')['msg_text'].apply(','.join)
    msg_df = msg_df.reset_index()

    # Dataframe summarizing the total message count in each repository.
    message_desc_df = msg_df_all[["repo_id", "repo_git", "repo_name", "msg_id"]].groupby(
        ["repo_id", "repo_git", "repo_name"]).agg('count').reset_index()
    message_desc_df.columns = ["repo_id", "repo_git", "repo_name", "message_count"]
    self.logger.info(msg_df.head())

    tfidf_matrix, features = self.get_tf_idf_matrix(msg_df['msg_text'], self.max_df,
                                                    self.max_features, self.min_df,
                                                    self.ngram_range)
    msg_df['cluster'] = self.cluster_and_label(tfidf_matrix, self.num_clusters)
    # visualize_labels_PCA(tfidf_matrix.todense(), msg_df['cluster'], msg_df['repo_id'], 2,
    #                      "MIN_DF={} and MAX_DF={} and NGRAM_RANGE={}".format(MIN_DF, MAX_DF, NGRAM_RANGE))

    # LDA - Topic Modeling
    count_vectorizer = CountVectorizer(max_df=self.max_df, max_features=self.max_features,
                                       min_df=self.min_df, stop_words="english",
                                       tokenizer=self.preprocess_and_tokenize)
    # count_matrix = count_vectorizer.fit_transform(msg_df['msg_text'])
    count_transformer = count_vectorizer.fit(msg_df['msg_text'])
    count_matrix = count_transformer.transform(msg_df['msg_text'])
    pickle.dump(count_transformer.vocabulary_, open("vocabulary_count", 'wb'))

    feature_names = count_vectorizer.get_feature_names()

    lda_model = LDA(n_components=self.num_topics)
    lda_model.fit(count_matrix)

    # Each row in lda_model.components_ represents an (unnormalized)
    # probability distribution over words in that topic.
    topic_list = lda_model.components_

    # Getting word probability
    # word_prob = lda_model.exp_dirichlet_component_  # word probabilities
    # lda_model does not have a state variable in this library
    # topics_terms = lda_model.state.get_lambda()
    # topics_terms_proba = np.apply_along_axis(lambda x: x / x.sum(), 1, topics_terms)
    # word_prob = [lda_model.id2word[i] for i in range(topics_terms_proba.shape[1])]

    # Main library used for parsing topics:
    # https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
    # Good site for optimizing: https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6
    # Another good site: https://towardsdatascience.com/an-introduction-to-clustering-algorithms-in-python-123438574097
    # https://machinelearningmastery.com/clustering-algorithms-with-python/

    logging.info("Topic List Created: {}".format(topic_list))
    pickle.dump(lda_model, open("lda_model", 'wb'))
    logging.info("pickle dump")

    ## Advance Sequence SQL
    # key_sequence_words_sql = s.sql.text(
    #     """
    #     SELECT nextval('augur_data.topic_words_topic_words_id_seq'::text)
    #     """
    # )
    # twid = self.db.execute(key_sequence_words_sql)
    # self.logger.info("twid variable is: {}".format(twid))

    # Insert the topic list into the database.
    topic_id = 1
    for topic in topic_list:
        # twid = self.get_max_id('topic_words', 'topic_words_id') + 1
        # self.logger.info("twid variable is: {}".format(twid))
        for i in topic.argsort()[:-self.num_words_per_topic - 1:-1]:
            # twid += 1
            # self.logger.info("in loop incremented twid variable is: {}".format(twid))
            record = {
                # 'topic_words_id': twid,
                # 'word_prob': word_prob[i],
                'topic_id': int(topic_id),
                'word': feature_names[i],
                'tool_source': self.tool_source,
                'tool_version': self.tool_version,
                'data_source': self.data_source
            }
            result = self.db.execute(self.topic_words_table.insert().values(record))
            self.logger.info("Primary key inserted into the topic_words table: {}".format(
                result.inserted_primary_key))
        topic_id += 1

    # Save the model and predict on each repo separately.
    prediction = lda_model.transform(count_matrix)
    topic_model_dict_list = []
    for i, prob_vector in enumerate(prediction):
        topic_model_dict = {}
        topic_model_dict['repo_id'] = msg_df.loc[i]['repo_id']
        for j, prob in enumerate(prob_vector):
            topic_model_dict["topic" + str(j + 1)] = prob
        topic_model_dict_list.append(topic_model_dict)

    topic_model_df = pd.DataFrame(topic_model_dict_list)
    result_content_df = topic_model_df.set_index('repo_id').join(
        message_desc_df.set_index('repo_id')).join(msg_df.set_index('repo_id'))
    result_content_df = result_content_df.reset_index()
    self.logger.info(result_content_df)

    POS_count_dict = msg_df.apply(lambda row: self.count_func(row['msg_text']), axis=1)
    msg_df_aug = pd.concat([msg_df, pd.DataFrame.from_records(POS_count_dict)], axis=1)
    self.logger.info(msg_df_aug)
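# A minimal sketch of recovering per-topic word probabilities with scikit-learn,
# since the attributes referenced in the commented-out block above (`state`,
# `id2word`) are gensim concepts that do not exist on sklearn's estimator.
# Rows of `components_` are unnormalized pseudo-counts, so normalizing each row
# yields a probability distribution over the vocabulary. Assumes `lda_model`
# and `feature_names` as produced inside train_model() above.
import numpy as np

def topic_word_probabilities(lda_model, feature_names, n_words=10):
    word_prob = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]
    return [
        [(feature_names[i], word_prob[t, i])
         for i in word_prob[t].argsort()[:-n_words - 1:-1]]
        for t in range(word_prob.shape[0])
    ]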
def create_and_fit_lda(data, num_topics):
    lda = LDA(n_components=num_topics, n_jobs=-1)
    lda.fit(data)
    return lda
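# A minimal usage sketch for the helper above, assuming raw documents are first
# turned into a document-term count matrix (LDA expects counts as input). The
# document strings here are illustrative placeholders, not data from the source.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "dogs and cats are pets", "stocks fell on monday"]
counts = CountVectorizer(stop_words='english').fit_transform(docs)
model = create_and_fit_lda(counts, num_topics=2)
doc_topics = model.transform(counts)  # (n_docs, n_topics) topic proportions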
def main():
    data1 = pandas.read_csv(r"liwc_input.csv")
    data2 = pandas.read_csv(r'liwc_test.csv')
    trainX = data1.iloc[:, 1:99]
    yTrain = data1.iloc[:, 99]
    testX = data2.iloc[:, 1:99]
    yTest = data2.iloc[:, 99]
    runBaseline = True
    # trainX, testX, yTrain, yTest = cross_validation.train_test_split(X, Y, test_size=0.1, random_state=0)  # test/train split

    vectorizer = feature_extraction.text.TfidfVectorizer()
    sentiment_scaler = preprocessing.StandardScaler()
    liwc_scaler = preprocessing.StandardScaler()
    unigrams = vectorizer.fit_transform(trainX["text"]).toarray()

    vectorizer1 = feature_extraction.text.TfidfVectorizer()
    # synst = vectorizer1.fit_transform(trainX["synset"].values.astype('U')).toarray()

    tf_vectorizer = feature_extraction.text.CountVectorizer()
    tf = tf_vectorizer.fit_transform(trainX["text"]).toarray()
    tf_feature_names = tf_vectorizer.get_feature_names()

    # `n_topics` was renamed to `n_components` in scikit-learn 0.19.
    lda = LDA(n_components=10, max_iter=5, learning_method='online',
              learning_offset=50., random_state=0).fit(tf)
    lda_train = lda.transform(tf)

    sentiment = sentiment_scaler.fit_transform(trainX.loc[:, "pscore":"obscore"])
    liwc = liwc_scaler.fit_transform(trainX.loc[:, "WC":"OtherP"])
    allf = np.hstack((unigrams, lda_train, liwc, sentiment))

    unigrams_t = vectorizer.transform(testX["text"]).toarray()
    # Use transform (not fit_transform) so the test set is scaled with the
    # statistics learned on the training set.
    liwc_t = liwc_scaler.transform(testX.loc[:, "WC":"OtherP"])
    tf_t = tf_vectorizer.transform(testX["text"]).toarray()
    lda_test = lda.transform(tf_t)
    sentiment_t = sentiment_scaler.transform(testX.loc[:, "pscore":"obscore"])
    # synst_t = vectorizer1.transform(testX["synset"].values.astype('U')).toarray()
    allf_t = np.hstack((unigrams_t, lda_test, liwc_t, sentiment_t))

    features = {
        "sentiment": (sentiment, sentiment_t),
        "lda": (lda_train, lda_test),
        "unigrams": (unigrams, unigrams_t),
        "liwc": (liwc, liwc_t),
        "all": (allf, allf_t)
    }

    for f in features:
        xTrain = features[f][0]
        xTest = features[f][1]
        if runBaseline:
            baseline = dummy.DummyClassifier(strategy='most_frequent', random_state=0)
            baseline.fit(xTrain, yTrain)
            predictions = baseline.predict(xTest)
            print(indent("Baseline: ", 4))
            print(indent("Test Accuracy: ", 4), metrics.accuracy_score(yTest, predictions))
            print(indent(metrics.classification_report(yTest, predictions), 4))
            print()
            runBaseline = False
        print(indent("Features: ", 4), f)
        count = 0
        ac = [0, 0, 0, 0, 0, 0, 0, 0]
        for model, name in zip(models, model_names):
            model.fit(xTrain, yTrain)
            # print('fitting...')
            prediction = model.predict(xTest)
            # Print accuracy.
            print(model)
            print(indent("Test Accuracy: ", 4), metrics.accuracy_score(yTest, prediction))
            print(indent(metrics.classification_report(yTest, prediction), 4))
            print()
            # clf = SVC(C=20.0, gamma=0.00001)
            # clf.fit(X_train, y_train)
            # acc = clf.score(X_test, y_test)
        print()
        print()
# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')

# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(papers)
print(count_data)

plot_10_most_common_words(count_data, count_vectorizer)

def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(
            [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))

# Tweak the two parameters below
number_topics = 5
number_words = 5

# Create and fit the LDA model
lda = LDA(n_components=number_topics)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
def main():
    warnings.simplefilter('ignore', DeprecationWarning)
    sns.set_style('whitegrid')
    # os.chdir('..')

    # Build a pandas dataframe from the allegation summaries.
    summary_file = open('data/raw/complaints.json', encoding="utf8")
    summary_data = json.load(summary_file)
    summary_file.close()

    summaries = []
    en_core = spacy.load('en_core_web_lg')
    for dicts in summary_data:
        summaries.append(dicts["summary"])
    summary_df = pd.DataFrame(summaries, columns=['summary'])

    stopwords = set([
        'cook county', 'chicago', 'allegedly', 'po', 'ofcer', 'sergeant',
        'ipra', 'detective', 'accused', 'officer', 'officers', 'reporting',
        'party', 'alleged', 'alleges', 'complainant', 'victim', 'police',
        '-pron-', 'pron', '-PRON-', '[-PRON-]', 'allege', 'accuse', 'report'
    ])

    print('Preprocessing starting')
    # Remove punctuation (the parentheses belong inside the character class).
    summary_df['summary_text_processed1'] = summary_df['summary'].map(
        lambda x: re.sub('[,\.!?()]', '', x))
    # Convert to lowercase.
    summary_df['summary_text_processed2'] = summary_df[
        'summary_text_processed1'].map(lambda x: x.lower())
    # Lemmatize.
    summary_df['summary_text_processed3'] = summary_df[
        'summary_text_processed2'].map(
            lambda x: " ".join([y.lemma_ for y in en_core(x)]))
    # Remove years.
    summary_df['summary_text_processed4'] = summary_df[
        'summary_text_processed3'].map(
            lambda x: re.sub('(19|20)[0-9][0-9]', '', x))
    # Remove stopwords.
    summary_df['summary_text_processed'] = summary_df[
        'summary_text_processed4'].map(lambda x: " ".join(
            [item for item in x.split() if item not in stopwords]))
    print('Preprocessing finished')

    # Initialise the count vectorizer with the English stop words.
    count_vectorizer = CountVectorizer(stop_words='english')
    # Fit and transform the processed summaries.
    count_data = count_vectorizer.fit_transform(
        summary_df['summary_text_processed'])

    NUM_ITERATIONS = 10
    NUM_TOPICS = 20
    NUM_WORDS = 12
    for iteration in range(NUM_ITERATIONS):
        number_topics = NUM_TOPICS
        number_words = NUM_WORDS
        print('Training model ' + str(iteration))
        lda = LDA(n_components=number_topics, max_iter=80)
        lda.fit(count_data)

        filename = 'lda_models_and_test_files/lda_model_iteration_' + str(
            iteration) + '_topics_' + str(number_topics) + '_words_' + str(
                number_words)
        filename_sav = filename + '.sav'
        filename_txt = filename + '.txt'
        if not os.path.exists('lda_models_and_test_files'):
            os.makedirs('lda_models_and_test_files')
        with open(filename_sav, 'wb') as f:
            pickle.dump(lda, f)
        with open(filename_txt, 'w+') as f:
            f.write('Topics found via LDA:')
            st = print_topics(lda, count_vectorizer, number_words)
            f.write(st)
            st = str(lda.transform(count_data[-10:]))
            f.write(st)
        stop_words=STOP_WORDS,
        ngram_range=(1, 2),
        max_df=max_df,
        min_df=min_df)
else:
    count_vectorizer = CountVectorizer(analyzer='word',
                                       stop_words=STOP_WORDS,
                                       max_df=max_df,
                                       min_df=min_df)

# Fit and transform the processed messages.
count_data = count_vectorizer.fit_transform(messages)

if alpha and eta:
    lda = LDA(n_components=n_clusters, n_jobs=-1,
              doc_topic_prior=alpha, topic_word_prior=eta)
else:
    lda = LDA(n_components=n_clusters, n_jobs=-1)
X = lda.fit_transform(count_data)

# Each sentence is assigned to the topic that represents it best.
labels = np.argmax(X, axis=1)

# Save clusters in the given folders.
save_clusters(messages, labels, output_dir)
with open(os.path.join(output_dir, 'lda.model'), 'wb') as f:
    pickle.dump(lda, f)
with open(os.path.join(output_dir, 'vect.model'), 'wb') as f:
    pickle.dump(count_vectorizer, f)
def compute_Semantics_1b(method, genre_, k_topics):
    '''Here the data is (genre x actors) with each cell holding the TF-IDF
    value for that genre and actor.'''
    print("\n\n\n============================================")
    # All genres
    _genres = MlMovies.objects.values_list('genres', flat=True)
    genres = []
    for genre in _genres:
        genres.extend(genre.split(','))
    genres = [x.strip() for x in genres]
    genres = list(set(genres))

    # All actors
    actorobjs = ImdbActorInfo.objects.values_list('actorid', 'name')
    actors_dict = {x[0]: x[1] for x in actorobjs}
    actors = ImdbActorInfo.objects.values_list('actorid', flat=True)

    '''Matrix dataset'''
    V = sp.lil_matrix((len(genres), len(actors)))
    decomposed = []

    '''Get TF-IDF vectors for each genre w.r.t. actors.'''
    for i in range(len(genres)):
        tf_idf = print_genreactor_vector.main(str(genres[i]))
        for j in range(len(actors)):
            V[i, j] = tf_idf[actors[j]]

    if method.upper() == 'SVD':
        '''SVD calculation'''
        U, sigma, Vt = svds(V, k=k_topics)
        sigma = np.diag(sigma)
        # print("\n\nSigma = \t", sigma)
        print("\n\nU:", len(U), len(U[0]), "Sigma:", sigma.shape, " V:", Vt.shape, "\n\n")
        # print(U)
        print("For genre", genre_, "latent semantics are:", U[genres.index(genre_)])
        decomposed = U

    if method.upper() == 'PCA':
        # Standardize the data.
        V = sp.csr_matrix(V).todense()
        V_std = StandardScaler().fit_transform(V)
        print("Standardized size:", V_std.shape)
        '''PCA: using the library implementation'''
        sklearn_pca = PCA(n_components=k_topics)
        pca = sklearn_pca.fit(V_std)
        Vt = pca.components_
        # print(Vt)
        decomposed = pca.transform(V_std)
        print("For genre", genre_, "latent semantics are:", decomposed[genres.index(genre_)])

    if method.upper() == 'LDA':
        '''TODO: build a matrix with documents as rows and words as columns,
        with each cell holding a frequency count rather than TF-IDF.'''
        for i, gen in enumerate(genres):
            tobjects = Task5.objects.filter(genre=gen)
            t_actors_id = tobjects.values_list('actorid', flat=True)
            for j in range(len(actors)):
                aid = actors[j]
                if aid in t_actors_id:
                    V[i, j] = tobjects.get(actorid=int(aid)).score
                else:
                    V[i, j] = 0.0
        lda = LDA(n_components=k_topics, max_iter=10000, learning_method="batch",
                  evaluate_every=10, perp_tol=1e-12)
        lda.fit(V)
        Vt = lda.components_
        decomposed = lda.transform(V)
        print("For genre", genre_, "latent semantics are:", decomposed[genres.index(genre_)])

    '''SVD, PCA: in order to give the latent semantics some names, normalize
    each column of the feature-factor matrix and then pick the top actors that
    best describe each latent semantic.'''
    # normed_Vt = normalize(Vt, axis=0, norm='max')
    # normed_Vt = Vt / Vt.sum(axis=0)
    normed_Vt = Vt.copy()
    x = normed_Vt.max(axis=0)
    y = normed_Vt.min(axis=0)
    for i in range(len(normed_Vt)):
        for j in range(len(normed_Vt[0])):
            normed_Vt[i][j] = float(normed_Vt[i][j] - y[j]) / float(x[j] - y[j])

    for i in range(k_topics):
        idx = np.argpartition(-normed_Vt[i], 10)[:10]
        print("Latent semantic:", i + 1, "=")
        li = []
        for j in idx:
            li.append(actors_dict[actors[j]])
        print('\t', li, "\n")
    return decomposed
def compute_Semantics_1a(method, genre_, k_topics):
    """Here the data is (genre x tags) with each cell holding the TF-IDF
    value for that genre and tag."""
    print("\n\n\n============================================")
    # All genres
    _genres = MlMovies.objects.values_list('genres', flat=True)
    genres = []
    for genre in _genres:
        genres.extend(genre.split(','))
    genres = [x.strip() for x in genres]
    genres = list(set(genres))

    # All tags
    tagobjs = GenomeTags.objects.values_list('tagid', 'tag')
    tags_dict = {x[0]: x[1] for x in tagobjs}
    tags = GenomeTags.objects.values_list('tagid', flat=True)

    '''Matrix dataset'''
    V = sp.lil_matrix((len(genres), len(tags)))
    decomposed = []

    '''Get TF-IDF vectors for genre-tag pairs and fill the matrix,
    using 0 if a genre-tag pair doesn't exist.'''
    for i in range(len(genres)):
        # tf_idf = compute_tf_idf_movie(cur_movie, "TF-IDF")
        tf_idf = print_genre_vector.main(str(genres[i]), 1)
        for j in range(len(tags)):
            V[i, j] = tf_idf[tags[j]]

    if method.upper() == 'SVD':
        '''SVD calculation'''
        U, sigma, Vt = svds(V, k=k_topics)
        sigma = np.diag(sigma)
        # print("\n\nSigma = \t", sigma)
        print("\n\nU:", len(U), len(U[0]), "Sigma:", sigma.shape, " V:", Vt.shape, "\n\n")
        # print(U)
        decomposed = U
        print("For genre", genre_, "latent semantics are:", U[genres.index(genre_)])

    if method.upper() == 'PCA':
        # Standardize the data.
        V = sp.csr_matrix(V).todense()
        V_std = StandardScaler().fit_transform(V)
        # print("Standardized size:", V_std.shape)
        '''PCA: using the library implementation'''
        sklearn_pca = PCA(n_components=k_topics)
        pca = sklearn_pca.fit(V_std)
        Vt = pca.components_
        # print(Vt)
        decomposed = pca.transform(V_std)
        print("For genre", genre_, "latent semantics are:", decomposed[genres.index(genre_)])

    if method.upper() == 'LDA':
        '''TODO: build a matrix with documents as rows and words as columns,
        with each cell holding a frequency count rather than TF-IDF.'''
        for i, gen in enumerate(genres):
            tobjects = Task2.objects.filter(genre=gen)
            t_tags = tobjects.values_list('tag', flat=True)
            tags_dict1 = deepcopy(tags_dict)
            inv_tags = {v: k for k, v in tags_dict1.items()}
            t_tags_id = [inv_tags[x] for x in inv_tags if x in t_tags]
            for j in range(len(tags)):
                tid = tags[j]
                if tid in t_tags_id:
                    V[i, j] = tobjects.get(tag=str(tags_dict[tid])).score
                else:
                    V[i, j] = 0.0
        lda = LDA(n_components=k_topics, max_iter=10000, learning_method="batch",
                  evaluate_every=10, perp_tol=1e-12)
        lda.fit(V)
        Vt = lda.components_
        decomposed = lda.transform(V)
        print("For genre", genre_, "latent semantics are:", decomposed[genres.index(genre_)])

    '''In order to give the latent semantics some names, normalize each column
    of the feature-factor matrix and then pick the top tags that best describe
    each latent semantic.'''
    # Normalize columns for most-discriminating-feature finding.
    # normed_Vt = Vt / Vt.sum(axis=0)
    normed_Vt = Vt.copy()
    x = normed_Vt.max(axis=0)
    y = normed_Vt.min(axis=0)
    for i in range(len(normed_Vt)):
        for j in range(len(normed_Vt[0])):
            normed_Vt[i][j] = float(normed_Vt[i][j] - y[j]) / float(x[j] - y[j])

    # print(normed_Vt)
    # print(tags_dict)
    for i in range(k_topics):
        idx = np.argpartition(-normed_Vt[i], 10)[:10]
        # print(-np.partition(-normed_Vt[0], 5)[:5])
        # print(idx)
        print("Latent semantics:", i + 1, "=")
        li = []
        for j in idx:
            li.append(tags_dict[tags[j]])
        print('\t', li, "\n")
    return decomposed
count_data = count_vectorizer.fit_transform(gd['pro_text'])
vocab = count_vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))
# pickle.dump(vocab, open('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/vocab.pkl', 'w'))
# pickle.dump(word2id, open('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/word2id.pkl', 'w'))
# scipy.sparse.save_npz('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/count_data.npz', count_data)
# count_data = scipy.sparse.load_npz('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/count_data.npz')
# count_data = count_data.toarray()

# Create and fit the LDA model.
lda = LDA(n_components=number_topics, n_jobs=-1, random_state=182,
          learning_method='online')
fit = lda.fit(count_data)
topic_weights = lda.transform(count_data)

weights = pd.DataFrame(topic_weights)
final = pd.concat([ids, weights], axis=1, ignore_index=True)
final.to_csv('~/projects/def-mcorrito/mcorrito/HH/data/lda_weights.csv',
             index=False, header=False)

# Print the topics found by the LDA model.
print_topics(lda, count_vectorizer, number_words)

# # Guided LDA with seed topics.
# seed_topic_list = [['innov', 'experi', 'experiment', 'dynam', 'fast', 'creativ']]
def LDA_modeling(corona_body_all_text):
    corpus = []
    for text in corona_body_all_text:
        corpus.append(' '.join(text))

    # The number of terms in the bag-of-words matrix is restricted to the top 100.
    no_features = 100
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                    max_features=no_features,
                                    stop_words='english')
    corpus_matrix = tf_vectorizer.fit_transform(corpus)
    print(corpus_matrix.shape)

    # The most representative words in corona_body_all_text (capped at
    # no_features above), returned as a list.
    word_feature_list = tf_vectorizer.get_feature_names()

    """Grid search over LDA hyperparameters."""
    search_params = {'n_components': [10, 15, 20, 25, 30],
                     'learning_decay': [.5, .7, .9]}
    lda = LDA()
    model = GridSearchCV(lda, param_grid=search_params, n_jobs=6)
    model.fit(corpus_matrix)
    best_lda_model = model.best_estimator_

    # Model parameters
    print("Best Model's Params: ", model.best_params_)
    # Log-likelihood score
    print("Best Log Likelihood Score: ", model.best_score_)
    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(corpus_matrix))

    # Get log-likelihoods from the grid-search output.
    n_topics = [10, 15, 20, 25, 30]
    log_likelihoods_5 = [round(model.cv_results_['mean_test_score'][index])
                         for index, gscore in enumerate(model.cv_results_['param_learning_decay'])
                         if gscore == 0.5]
    log_likelihoods_7 = [round(model.cv_results_['mean_test_score'][index])
                         for index, gscore in enumerate(model.cv_results_['param_learning_decay'])
                         if gscore == 0.7]
    log_likelihoods_9 = [round(model.cv_results_['mean_test_score'][index])
                         for index, gscore in enumerate(model.cv_results_['param_learning_decay'])
                         if gscore == 0.9]

    # Show graph
    plt.figure(figsize=(12, 8))
    plt.plot(n_topics, log_likelihoods_5, label='0.5')
    plt.plot(n_topics, log_likelihoods_7, label='0.7')
    plt.plot(n_topics, log_likelihoods_9, label='0.9')
    plt.title("Choosing Optimal LDA Model")
    plt.xlabel("Num Topics")
    plt.ylabel("Log Likelihood Scores")
    plt.legend(title='Learning decay', loc='best')
    plt.savefig('learning_decay.pdf')
    plt.show()

    # Create the document-topic matrix.
    lda_output = best_lda_model.transform(corpus_matrix)
    # Column names
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
    # Index names
    docnames = ["Doc" + str(i) for i in range(len(corona_body_all_text))]
    # Build the pandas dataframe.
    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames, index=docnames)

    # Get the dominant topic for each document.
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    # Styling
    def color_green(val):
        color = 'green' if val > .1 else 'black'
        return 'color: {col}'.format(col=color)

    def make_bold(val):
        weight = 700 if val > .1 else 400
        return 'font-weight: {weight}'.format(weight=weight)

    # Apply the styles.
    df_document_topics = df_document_topic.head(len(corona_body_all_text)).style.applymap(color_green).applymap(make_bold)
    df_document_topics.to_excel("df_document_topics.xlsx")
    print(df_document_topics)

    # Review the topic distribution across documents.
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    df_topic_distribution.to_excel("df_topic_distribution.xlsx")
    print(df_topic_distribution)

    panel = pyLDAvis.sklearn.prepare(best_lda_model, corpus_matrix, tf_vectorizer, mds='tsne')
    pyLDAvis.save_html(panel, 'LDA_Visualization.html')

    # Topic-keyword matrix
    df_topic_keywords = pd.DataFrame(best_lda_model.components_)
    # Assign column and index names.
    df_topic_keywords.columns = tf_vectorizer.get_feature_names()
    df_topic_keywords.index = topicnames
    # View
    df_topic_keywords.head()

    # Show the top n keywords for each topic.
    def show_topics(vectorizer, lda_model, n_words):
        keywords = np.array(vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in lda_model.components_:
            top_keyword_locs = (-topic_weights).argsort()[:n_words]
            topic_keywords.append(keywords.take(top_keyword_locs))
        return topic_keywords

    topic_keywords = show_topics(vectorizer=tf_vectorizer,
                                 lda_model=best_lda_model, n_words=15)

    # Topic-keywords dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    df_topic_keywords.to_excel("df_topic_keywords.xlsx")
    print(df_topic_keywords)

    """Printing the content of all the topics."""
    # no_topics = 20
    # lda = LDA(n_components=no_topics, learning_method='online', learning_offset=50.,
    #           random_state=0).fit(tf)
    #
    # def display_topics(model, feature_names, no_top_words):
    #     for topic_idx, topic in enumerate(model.components_):
    #         print("\nTopic #%d:" % topic_idx)
    #         print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
    #     print()
    #
    # no_top_words = 10
    # display_topics(lda, tf_feature_names, no_top_words)

    sns.set_style('whitegrid')

    # Helper function
    def plot_10_most_common_words(count_data, count_vectorizer):
        words = count_vectorizer.get_feature_names()
        total_counts = np.zeros(len(words))
        # count_data behaves like a Counter dictionary.
        for t in count_data:
            total_counts += t.toarray()[0]
        count_dict = (zip(words, total_counts))
        count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
        words = [w[0] for w in count_dict]
        counts = [w[1] for w in count_dict]
        x_pos = np.arange(len(words))

        plt.figure(2, figsize=(15, 15 / 1.6180))
        plt.subplot(title='10 most common words')
        sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
        sns.barplot(x_pos, counts, palette='husl')
        plt.xticks(x_pos, words, rotation=90)
        plt.xlabel('words')
        plt.ylabel('counts')
        plt.savefig('10_most_common_words.pdf')
        plt.show()

    # Visualise the 10 most common words.
    plot_10_most_common_words(corpus_matrix, tf_vectorizer)

    corpus_matrix = corpus_matrix.toarray()
    return corpus_matrix, word_feature_list
def fit(self, X, y=None, features=None):
    """
    Constructs a DAG according to `self.dag_method` and learns coexpression
    modules across multiple resolutions.

    Parameters
    ----------
    X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
        Matrix with rows corresponding to all of the samples that define the
        DAG and columns corresponding to features that define the correlation
        matrices.
    y
        Ignored.
    features: `numpy.ndarray` of `str`
        A list of strings with feature labels.
    """
    super(DecomposeDAG, self).fit(X, y, features)

    n_samples, n_features = X.shape

    if self.verbose:
        print('Stacking...')
        sys.stdout.flush()

    X_multi = self.multiresolution_stack(X)

    if self.verbose:
        print('Decomposing...')
        sys.stdout.flush()

    if self.decomp_method == 'nmf':
        # from sklearn.decomposition import NMF
        from nmf import NMF
        decomp = NMF(
            n_components=self.n_components,
            init=None,
            solver='cd',
            beta_loss='frobenius',
            alpha=1e-3,
            l1_ratio=1,
            random_state=69,
            tol=1e-2,
            verbose=self.verbose,
        ).fit(X_multi)
        components = decomp.components_

    elif self.decomp_method == 'lda':
        from sklearn.decomposition import LatentDirichletAllocation as LDA
        decomp = LDA(
            n_components=self.n_components,
            learning_method='online',
            max_iter=20,
            mean_change_tol=1e-2,
            n_jobs=self.n_jobs,
            random_state=69,
            verbose=self.verbose,
        ).fit(X_multi)
        components = decomp.components_

    elif self.decomp_method == 'hdp':
        from bnp.online_hdp import HierarchicalDirichletProcess as HDP
        hdp = HDP(
            n_topic_truncate=self.n_components,
            n_doc_truncate=10000,
            learning_method='online',
            n_jobs=self.n_jobs,
            random_state=69,
            verbose=self.verbose,
        ).fit(X_multi)
        components = hdp.lambda_

    else:
        raise ValueError('Invalid decomposition method {}'.format(
            self.decomp_method))

    n_components = components.shape[0]
    self.cluster_components = np.reshape(
        components, (n_components, n_features, len(self.nodes)))

    cc = np.sum(self.cluster_components, axis=1)
    cc /= cc.max()
    assert cc.shape == (n_components, len(self.nodes))

    for node_idx, node in enumerate(self.nodes):
        node.viz_value = list(cc[:, node_idx])

    return self
# "restaurants and television shows are his ticket. Seb also has a rep for being " # "one of the hottest catches around.Well from here on out you get the gist of it a" # "ll. They meet; sparks fly as Lexi doesn't fall at Sebs' feet. He is intrigued to " # "find a girl who serves him up on a platter cold. No his usual dish so to speak." # " (I know puns aplenty today) *HA!There is a bit of mystery when notes keep " # "appearing with a threatening overture. Lexi is attacked and her deliveries are " # "beginning to not show up.My thoughts on this debut title......4 Stars on the " # "love connection. It was cute and spunky. They had sizzle and Seb is very much " # "an alpha male. Always a good thing.2.5 Stars on the mystery/suspense. It was " # "easy to figure out what was going on from the beginning. I wish the author had" # " stuck to one plotline with this. There were too many little side twists that " # "in my opinion wasn't needed and some plots were left unfinished. It is possible" # " that this will be resolved in later books but it just felt incomplete in this" # " one.There was a bit of filler in the book as well. I didn't need all the " # "explanations' on cutting styles for chops and how the kitchen runs. I am sure " # "some will find it informative but for me it just took away from the story and " # "well...filler.I do see this series getting better and I did like the writing " # "style very much. Will I be looking out for the next book? Sure I will :)3 StarsT~"] x = x.toarray()[0][:50].reshape(1, -1) print(x) print(x.shape) lda = LDA(n_topics=20) lda_res = lda.fit_transform(x) # lda_res = lda.transform(lda_res) print(type(lda_res)) a = lda_res[0, :] b = lda_res[0, :] print(data.Hellinger_distance(a, b)) # print(lda_res.split(' '))
def main():
    if style == 0:
        dataset = loadData()
        random.shuffle(dataset)
        ## to get the part of dataset
        ## dataset = dataset[:DATA_NUM]

        ## to filter data
        for data in dataset:
            data[0] = dataFilter(data)
        if isSetVocabulary == 1:
            save(vocabulary, 'Vocabulary')
            print('building vocabulary is completed\nvocabulary size =',
                  len(vocabulary))
        print('filtering data is completed')

        ## global vocabulary
        ## save(vocabulary, 'Vocabulary')
        ## print('vocabulary size =', len(vocabulary))
        ## global vocabulary
        ## vocabulary = load('Vocabulary')

        trainingData = dataset[:DATA_NUM]
        trainingTargetedClassifierFeatures = [
            (buildTargetedClassifierFeatures(data[0], data[1]))
            for data in trainingData
        ]
        save(trainingTargetedClassifierFeatures,
             'trainingTargetedClassifierFeatures')
        print('training targeted classifier features are completed')

        ## to train the sentiment models
        global classifiers
        print('classifiers training start!')
        ## to use the naive bayes classifier to train. [[{}, ''], ....]
        classifiers.append(
            nltk.NaiveBayesClassifier.train(
                trainingTargetedClassifierFeatures))
        ## to try other classifiers
        ## MultinomialNB
        classifiers.append(SklearnClassifier(MultinomialNB()))
        ## BernoulliNB
        classifiers.append(SklearnClassifier(BernoulliNB()))
        ## LogisticRegression
        classifiers.append(SklearnClassifier(LogisticRegression()))
        ## SGDClassifier
        classifiers.append(SklearnClassifier(SGDClassifier()))
        ## SVC
        classifiers.append(SklearnClassifier(SVC()))
        ## LinearSVC
        classifiers.append(SklearnClassifier(LinearSVC()))
        ## NuSVC
        classifiers.append(SklearnClassifier(NuSVC()))

        ## to train the classifiers
        length = len(classifiers)
        ## print(length)
        ## except naive bayes classifier
        for i in range(1, length):
            classifiers[i].train(trainingSet)

        ## to use our vote classifier
        voteClassifier = VoteClassifier(classifiers[0], classifiers[1],
                                        classifiers[2], classifiers[3],
                                        classifiers[4], classifiers[5],
                                        classifiers[6], classifiers[7])
        classifiers.append(voteClassifier)
        print('classifiers training end!')

        ## to predict
        testData = dataset[DATA_NUM:]
        testClassifierFeatures = [(buildClassifierFeatures(data[0]))
                                  for data in testData]
        save(testClassifierFeatures, 'testClassifierFeatures')
        print('test classifier features are completed')
        print('predicting start')
        predictions = []
        for feature in testClassifierFeatures:
            predictions.append([
                voteClassifier.classify(feature),
                voteClassifier.confidence(feature)
            ])
        for prediction in predictions:
            print("Classification: ", prediction[0], " Confidence: ",
                  prediction[1])

        ## to add to training targeted classifier features if confidence > threshold
        newTestClassifierFeatures = []
        newTestData = []
        for i in range(len(predictions)):
            if predictions[i][1] > threshold:
                trainingData.append(testData[i])
                trainingTargetedClassifierFeatures.append(
                    [testClassifierFeatures[i], predictions[i][0]])
            else:
                newTestData.append(testData[i])
                newTestClassifierFeatures.append(testClassifierFeatures[i])
        testData = newTestData
        testClassifierFeatures = newTestClassifierFeatures

        ## to train the LDA
        zeros = [0 for n in range(len(vocabulary))]
        trainingLDAFeatures = [(buildLDAFeatures(zeros, data[0]))
                               for data in trainingData]
        print('LDA start')
        # `n_topics` was renamed to `n_components` in scikit-learn 0.19.
        model = LDA(n_components=2, max_iter=1500, learning_method='online')
        topicDistributions = model.fit_transform(trainingLDAFeatures)
        print('LDA end')
        save(model, 'LDAModel')
        for distribution in topicDistributions:
            print(distribution)

        testFeatures = [
            buildFeatures(zeros, i, testData[i][0])
            for i in range(len(testData))
        ]
        print(model.transform(testFeatures))
        print('ground truth')
        for data in testData:
            print(data[1])

        ## topic = model.components_
        ## print(topic)
        n_top_words = 8
        for i, topic_dist in enumerate(model.components_):
            topic_words = np.array(vocabulary)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    elif style == 1:
        print('YO')
tfidf_model = tfidf_vectorizer_small.fit(text)
with open("../../4_models/tfidf_50K_influential_reviews_10191994.pickle", "wb") as f:
    pickle.dump(tfidf_model, f)

tfidf_bigram_model = tfidf_bigram_vectorizer_small.fit(text)
with open("../../4_models/tfidf_bigram_50K_influential_reviews_10191994.pickle", "wb") as f:
    pickle.dump(tfidf_bigram_model, f)

tfidf = tfidf_vectorizer.fit_transform(text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

start1 = time.time()
lda = LDA(n_components=num_topics, max_iter=5, learning_method="online",
          learning_offset=50, random_state=10191994).fit(tf)
# Save the LDA model.
with open("../../4_models/lda_50K_influential_reviews_10191994.pickle", "wb") as f:
    pickle.dump(lda, f)
end1 = time.time()
print("LDA: {}".format(end1 - start1))

start2 = time.time()
nmf = NMF(n_components=num_topics, random_state=10191994, alpha=.1, l1_ratio=.5,
    labels.append(key.split("_")[0])
# print(data[0][:5])
# data = data[16:]
# labels = labels[16:]

# Compute the term-frequency vector for each document.
fileVector = CountVectorizer(stop_words=stopwords)
fileTfVector = fileVector.fit_transform(data)
print(fileTfVector.shape)

# Train the LDA model on the 16 novels.
topic = args.k
model = LDA(n_components=topic, max_iter=50, learning_method='batch')
docres = model.fit_transform(fileTfVector[:16])
# print(docres)
value, indices = torch.max(torch.tensor(docres), 1)
print(indices)
print("{} topics requested, {} distinct topics identified".format(
    topic, len(list(set(indices.tolist())))))
# print(len(model.components_))

res = model.transform(fileTfVector)
assert len(res) == len(labels)
df_labels = pd.DataFrame(labels)
# df_labels.to_excel("labels.xlsx")
df_res = pd.DataFrame(res)
# df_res.to_excel("ldaVector.xlsx")
df = pd.concat([df_labels, df_res], axis=1)
    pb.update()

df.columns = ['review', 'sentiment']
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', encoding='utf-8', index=False)

df = pd.read_csv('movie_data.csv')
print(df.head(), '\n', len(df.index))
print(df['sentiment'][:5])

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(max_df=0.1, max_features=5000, stop_words='english')
help(CountVectorizer)
help(LDA)

lda = LDA(n_components=10, learning_method='batch', random_state=123)
X = count.fit_transform(df['review'].values)
X_topics = lda.fit_transform(X)
print(X_topics.shape)  # (50000, 10)

# Components form a matrix of topics x words/features.
print(lda.components_.shape)  # (10, 5000)

n_top_words = 5
features = count.get_feature_names()
print(len(features))  # 5000
for idx, topics in enumerate(lda.components_):
    print('Topic : ', (idx + 1))
    print([features[i] for i in topics.argsort()[-n_top_words:]])

b = np.array([1, 3, 5, 7, 2, 9, 4])
h = .02  # step size in the mesh

names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
    "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"
]
# Note: here LDA and QDA denote the linear/quadratic discriminant-analysis
# classifiers, not LatentDirichletAllocation.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

X, y = make_classification(n_features=2, n_redundant=0, n_informative=2,
                           random_state=1, n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1),
    linearly_separable
# After obtaining the final distances between each abstract in the data set of
# papers, we apply a clustering method to sort each paper into just one latent
# topic. Since we only have distances between documents, as opposed to points
# in space, we apply k-medoids clustering to the distance matrix.
#
# The following chunks of code calculate distances between the abstracts using
# LDA, calculate the distances between abstracts using LSA, average the two
# distance matrices, and apply k-medoids. For LDA we use 7 clusters, which had
# the maximum coherence score (code omitted from this write-up). For LSA, we
# reduce w to 14 dimensions. Following LDA, we perform k-medoids clustering
# using 7 clusters.

# In[4]:

# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA
from scipy.spatial import distance
import random

random.seed(123)
numberTopics = 7
lda = LDA(n_components=numberTopics, random_state=0)
ldaFit = lda.fit(w)
topicDistributions = lda.transform(w)
distsLDA = distance.cdist(topicDistributions, topicDistributions, 'euclidean')

# In[5]:

# LSA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

random.seed(234)
n_components = 14
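# The LSA distances and the averaging/k-medoids steps described above are
# truncated in this excerpt. A minimal sketch of how they could proceed,
# assuming `w` is the document-term matrix from above, `distsLDA` is defined
# as in In[4], and scikit-learn-extra is installed (its KMedoids accepts a
# precomputed distance matrix):
lsa = TruncatedSVD(n_components=n_components, random_state=0)
docVectors = lsa.fit_transform(w)
distsLSA = 1 - cosine_similarity(docVectors)  # cosine distance between documents
distsAvg = (distsLDA + distsLSA) / 2          # average the two distance matrices

from sklearn_extra.cluster import KMedoids
clusters = KMedoids(n_clusters=numberTopics, metric='precomputed',
                    random_state=0).fit_predict(distsAvg)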
def modeling_lda(x, num_t):
    lda = LDA(n_components=num_t, random_state=0)
    lda.fit(x)
    return lda
def main(): data = pandas.read_csv(r"newsenti1.csv") X = data.iloc[:, 1:] Y = data.iloc[:, 0] runBaseline = True trainX, testX, yTrain, yTest = cross_validation.train_test_split( X, Y, test_size=0.1, random_state=0) #test train split vectorizer = feature_extraction.text.TfidfVectorizer() sentiment_scaler = preprocessing.StandardScaler() unigrams = vectorizer.fit_transform(trainX["text"]).toarray() vectorizer1 = feature_extraction.text.TfidfVectorizer() synst = vectorizer1.fit_transform( trainX["synset"].values.astype('U')).toarray() tf_vectorizer = feature_extraction.text.CountVectorizer() tf = tf_vectorizer.fit_transform(trainX["text"]).toarray() tf_feature_names = tf_vectorizer.get_feature_names() lda = LDA(n_topics=10, max_iter=5, learning_method='online', learning_offset=50., random_state=0).fit(tf) lda_train = lda.transform(tf) sentiment = sentiment_scaler.fit_transform(trainX.ix[:, "pscore":"obscore"]) allf = np.hstack((unigrams, lda_train, synst, sentiment)) unigrams_t = vectorizer.transform(testX["text"]).toarray() tf_t = tf_vectorizer.transform(testX["text"]).toarray() lda_test = lda.transform(tf_t) sentiment_t = sentiment_scaler.transform(testX.ix[:, "pscore":"obscore"]) synst_t = vectorizer1.transform( testX["synset"].values.astype('U')).toarray() allf_t = np.hstack((unigrams_t, lda_test, synst_t, sentiment_t)) features = {"All_f": (allf, allf_t)} for f in features: xTrain = features[f][0] xTest = features[f][1] if runBaseline: baseline = dummy.DummyClassifier(strategy='most_frequent', random_state=0) baseline.fit(xTrain, yTrain) predictions = baseline.predict(xTest) print(indent("Baseline: ", 4)) print(indent("Test Accuracy: ", 4), metrics.accuracy_score(yTest, predictions)) print(indent(metrics.classification_report(yTest, predictions), 4)) print() runBaseline = False print(indent("Features: ", 4), f) for m, model in enumerate(models): hyp = clf_hyp[m] pipe = pipeline.Pipeline([('clf', model)]) if len(hyp) > 0: grid = grid_search.GridSearchCV( pipe, hyp, cv=10, n_jobs=6) #grid search for best hyperparameters grid.fit(xTrain, yTrain) predictions = grid.predict(xTest) print(indent(type(model).__name__, 6)) print(indent("Best hyperparameters: ", 8), grid.best_params_) print(indent("Validation Accuracy: ", 8), grid.best_score_) print(indent("Test Accuracy: ", 8), metrics.accuracy_score(yTest, predictions)) print( indent(metrics.classification_report(yTest, predictions), 8)) else: grid = model grid.fit(xTrain, yTrain) predictions = grid.predict(xTest) print(indent(type(model).__name__, 6)) print(indent("Test Accuracy: ", 8), metrics.accuracy_score(yTest, predictions)) print( indent(metrics.classification_report(yTest, predictions), 8)) print() print()
def lda(X, n_components=4):
    p = LDA(n_components=n_components)
    return p.fit_transform(X), p
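# A minimal usage sketch for the wrapper above, assuming `X` is a
# document-term count matrix. Each row of the returned matrix is that
# document's topic mixture, so argmax yields a hard topic assignment.
topics, model = lda(X, n_components=6)
dominant_topic = topics.argmax(axis=1)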
    'newsletter', 'mit', 'subscribe', 'blockchain', 'karen', 'hao', 'will',
    'knight', 'technologies'
])

len(articles.index)

all_list = []
for i in range(len(articles.index)):
    words = CountVectorizer(max_df=10, min_df=1, max_features=1000,
                            stop_words=stopwords)
    bag_of_words = words.fit_transform(
        split_text_into_paras(articles.iloc[i]['Article ']))
    word_names = words.get_feature_names()
    lda = LDA(n_components=2).fit(bag_of_words)
    all_list.append([i] + display_topics(lda, word_names, 5))
all_list

import numpy as np
from os import path
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

stopwords = set(STOPWORDS)
stopwords.update([
    'newsletter', 'mit', 'subscribe', 'blockchain', 'karen', 'hao', 'will',
    'knight', 'technologies', 'say', 'people', 'technology', 'algorithm',
    'says', 'one'
])
filenames = ['laDuda.txt', 'eljorge.txt', 'llecspier.txt', 'vairon.txt']

vectorizer = CountVectorizer(input='filename', ngram_range=(1, 3),
                             stop_words="english")
dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

tfidf_transformer = TfidfTransformer(norm='l2')
x_tfidf = tfidf_transformer.fit_transform(dtm)
matVoc = x_tfidf.toarray()

vocab = vectorizer.get_feature_names()  # a list
vocab = np.array(vocab)

lda = LDA(n_components=3, random_state=0)
lda_array = lda.fit_transform(matVoc)
labels = [np.argmax(x) for x in lda_array]
print(lda_array)

colores = ["r", "b", "c", "y"]
autores = ["Poema", "Chapman", "Shakespeare", "Lord Byron"]

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for i, punto in enumerate(lda_array):
    ax.scatter(punto[0],
               punto[1],
import matplotlib.pyplot as plt

featurized_data = pd.read_csv('~/capstone_project/data/featurized_data.csv')
test = featurized_data[featurized_data.min_game > 10]
test_players = test[['player_id', 'display_name']]
test.drop(['Unnamed: 0', 'player_id', 'display_name'], inplace=True, axis=1)
features = featurized_data.columns
test_players.set_index('player_id', inplace=True, drop=True)
test.fillna(0, inplace=True)
test = normalize(test)

count_topics = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
# `n_topics` was renamed to `n_components` in scikit-learn 0.19.
lda = LDA(n_components=4, max_iter=5, learning_method='online',
          learning_offset=50., random_state=0)
lda.fit(test)

# for i in [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]:
#     KMeans_test = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300, tol=0.0001,
#                          precompute_distances='auto', verbose=0, random_state=None,
#                          copy_x=True, n_jobs=1, algorithm='auto')
#     KMeans_test.fit(players_by_topic)
#     test_labels = KMeans_test.labels_
#     players_clustered = pd.DataFrame(players_by_topic)
#     players_clustered['cluster'] = test_labels
#     players_clustered['player_name'] = player_info['display_name']
#     score = silhouette_score(players_by_topic, test_labels, metric='euclidean', sample_size=None)
#     print(score)
# Get column names for timbre and harmony features.
columns = data.columns
timbre_col = [col for col in columns if re.search("Timbre", col)]
harm_col = [col for col in columns if re.search("Harm", col)]

# Create matrices of timbre and harmony features.
timbre_features = data[timbre_col]
timbre_features = timbre_features.to_numpy()
harm_features = data[harm_col]
harm_features = harm_features.to_numpy()

# Apply LDA to each feature set separately.
lda_timbre = LDA(n_components=t_topics, doc_topic_prior=doc_topic_prior,
                 random_state=0)
lda_timbre.fit(timbre_features)
topics_timbre = lda_timbre.transform(timbre_features)

lda_harm = LDA(n_components=h_topics, doc_topic_prior=doc_topic_prior,
               random_state=0)
lda_harm.fit(harm_features)
topics_harm = lda_harm.transform(harm_features)

# Summary of the top components of topics.
def get_top_components(model, feature_names, n):
    # n is the number of top components you want to output
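    # The function body is truncated in this excerpt. A minimal sketch of the
    # usual pattern, returning the n highest-weight feature names per topic
    # (this completion is an assumption, not the original author's code):
    top_components = {}
    for topic_idx, weights in enumerate(model.components_):
        top_idx = weights.argsort()[:-n - 1:-1]
        top_components[topic_idx] = [feature_names[i] for i in top_idx]
    return top_components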
path = dph.getDataPath('pressBiTriLemma.json')
df = pd.read_json(path)
dfBigramLemma = df[columnName]
df2 = pd.DataFrame(dfBigramLemma)
df2[columnName] = df2.apply(lambda row: ' '.join(map(str, row[columnName])), axis=1)

vectorizer = TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2))
xTrainTfidf = vectorizer.fit_transform(df2[columnName])

searchParams = {'n_components': [10], 'learning_decay': [.5]}

if True:
    model = LDA()
    model = GridSearchCV(model, searchParams)
    model.fit(xTrainTfidf)
    model = model.best_estimator_
    if False:
        dph.saveModel(model, 'ldaGrid' + columnName)
else:
    model = dph.loadModel('ldaGrid' + columnName)

# Print the perplexity score.
print("Model perplexity: ", model.perplexity(xTrainTfidf))

# Compute the values used by the next functions.
featureNames = vectorizer.get_feature_names()
weights = model.components_
                                                        start=2, limit=40, step=6)
limit = 40
start = 2
step = 6
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

"""***TRANSFORMATION OF THE TEXT USING LDA FROM BOW***"""
lda = LDA(n_components=25, n_jobs=-1)
array_lda_Bow = lda.fit_transform(X_Bow)
X_lda_Bow = pd.DataFrame(array_lda_Bow)
X_lda_Bow

"""***TRANSFORMATION OF THE TEXT USING LDA FROM TFIDF***"""
lda = LDA(n_components=25, n_jobs=-1)
array_lda_Tfid = lda.fit_transform(X_Tfid)
X_lda_Tfid = pd.DataFrame(array_lda_Tfid)
X_lda_Tfid

"""***CONCATENATION OF MODELS***"""
concat_lda = np.concatenate((array_tfid, array_lda_Tfid), axis=1)
    with open(os.path.join(PICKLE_DIR, 'clues_df.p'), 'rb') as f:
        clues = pickle.load(f)
except:
    exec(open('./process_puz.py').read())

try:
    with open(os.path.join(PICKLE_DIR, 'lda_fit.p'), 'rb') as f:
        pickle.load(f)
except:
    nclues = clues.shape[0]
    ntext = int(.1 * nclues)
    clue_samples = np.random.choice(range(nclues), ntext)

    count_vectorizer = CountVectorizer(stop_words='english')
    data = count_vectorizer.fit_transform(clues.clue_text.iloc[clue_samples])

    number_topics = 4
    lda = LDA(n_components=number_topics, n_jobs=-1)
    lda.fit(data)
    with open(os.path.join(PICKLE_DIR, 'lda_fit.p'), 'wb') as f:
        pickle.dump(lda, f)

LDAvis_prepared = sklearn_lda.prepare(lda, data, count_vectorizer)
with open(os.path.join(PICKLE_DIR, 'ldavis.p'), 'wb') as f:
    pickle.dump(LDAvis_prepared, f)
pyLDAvis.save_html(LDAvis_prepared,
                   './ldavis_prepared_' + str(number_topics) + '.html')
    if word != '' and not is_other(word):
        words_not_other.append(word)

# Convert a collection of words to a matrix of token counts.
print_status("Counting ngrams...")
# vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorized_train_data = vectorizer.fit_transform(X_train)
vectorized_dev_data = vectorizer.transform(words_not_other)

# Create and fit the LDA model.
print_status("Training LDA...")
number_topics = 2
lda_model = LDA(n_components=number_topics, max_iter=100, random_state=123)
lda_model.fit(vectorized_train_data)
lda = lda_model.transform(vectorized_dev_data)

# Decide which label belongs to each cluster.
cluster_0_label = ''
cluster_1_label = ''

# Get indexes of the words that best represent cluster 0.
cluster_0 = lda[:, 0]
top_n_words_c0_idx = (-cluster_0).argsort()[:10]

# Check which language these words belong to.
count_lang1 = 0
count_lang2 = 0
for i in top_n_words_c0_idx:
    word = words_not_other[i]