# Tail end of the gensim LdaMulticore call (earlier arguments are not shown in this excerpt).
                               chunksize=batch_size, iterations=max_e_steps, eval_every=eval_every)
gn_time = time.time() - start
log_prep_gensim_mc = lda_gensim_mc.log_perplexity(gensim_te_corpus)
perplexity_gensim_mc = np.exp(-1. * log_prep_gensim_mc)
print("gensim run time and perplexity: {}, {}".format(gn_time, perplexity_gensim_mc))
print("sklearn run time and perplexity: {}, {}".format(sk_time, sklearn_perplexity))

# Let's have a look at the topics
topic_words = dict()
gensim_topics = lda_gensim_mc.show_topics(formatted=False)


def sklearn_show_topics(model, feature_names, n_top_words):
    sk_topics = []
    for topic_idx, topic in enumerate(model.components_):
        tot_score = np.sum(topic)
        top_words = [(feature_names[i], topic[i] / tot_score)
                     for i in topic.argsort()[:-n_top_words - 1:-1]]
        sk_topics.append([topic_idx, top_words])
    return sk_topics


feature_names = vectorizer.get_feature_names()  # use get_feature_names_out() on scikit-learn >= 1.0
sklearn_topics = sklearn_show_topics(lda_sklearn, feature_names, 10)
topic_words['gensim'] = gensim_topics
import gensim
import json

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

input_dict_fname = '../outputs/20news_18828/output_text_preprocessed.json'

# Retrieve the preprocessed documents, then build the gensim dictionary & corpus
with open(input_dict_fname, "r") as f:
    json_word_list = json.loads(f.read())
json_word_list = list(json_word_list.values())
print(json_word_list[1])

dictionary = Dictionary(json_word_list)
corpus = [dictionary.doc2bow(x) for x in json_word_list]

newsdata_topics = LdaMulticore(corpus, id2word=dictionary, num_topics=10)
print(newsdata_topics.show_topics(num_words=5))
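# Optional sanity check (an assumed follow-up, not part of the original script): inspect the
# topic mixture the trained model infers for a single preprocessed document.
doc_topics = newsdata_topics.get_document_topics(corpus[1], minimum_probability=0.05)
print(doc_topics)  # list of (topic_id, probability) pairs, e.g. [(3, 0.62), (7, 0.21), ...]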
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=1200,
                  height=1200,
                  max_words=10,
                  colormap='tab10',
                  color_func=lambda *args, **kwargs: cols[i],
                  prefer_horizontal=1.0)

topics = lda_model.show_topics(formatted=False)

fig, axes = plt.subplots(2, 3, figsize=(16, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    topic_words = dict(topics[i][1])
    cloud.generate_from_frequencies(topic_words, max_font_size=300)
    plt.gca().imshow(cloud)
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16), y=1.05)
    plt.gca().axis('off')

plt.subplots_adjust(wspace=.3, hspace=.2)
plt.axis('off')
plt.margins(x=0, y=0)
# plt.tight_layout()
    # Appended inside a loop over the corpus documents (the loop header is not shown in this excerpt).
    tfidf_corpus.append(tfidf[doc])

tfidf_mat = matutils.corpus2dense(tfidf_corpus, num_terms=len(id2word.token2id))
tfidf_mat_transpose = tfidf_mat.transpose()
dfTFIDF = pd.DataFrame(
    data=tfidf_mat_transpose[0:, 0:],
    index=[i for i in range(tfidf_mat_transpose.shape[0])],
    columns=['' + str(i) for i in range(tfidf_mat_transpose.shape[1])])
dfTFIDF['id'] = ids.tolist()
ef.deleteIndex(credentials, "tfidf")
ef.saveTFIDF(credentials, dfTFIDF)

# Keyword weights
x = lda_model.show_topics(num_topics=args.number_of_topics, num_words=50, formatted=False)
keywordWeights = []
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
for tp in x:
    words = []
    weights = []
    for pair in tp[1]:
        words.append(pair[0])
        weights.append(int(pair[1] * 10000))
    keywordWeights.append(weights)

# Top topics per paragraph
df = pd.DataFrame()
df['referenceId'] = referenceIds
df['paragraph'] = raw_paragraphs
# Tail end of the LDA constructor call (earlier arguments are not shown in this excerpt).
                     chunksize=1000, batch=False, alpha='asymmetric', decay=0.5,
                     offset=64, eta=None, eval_every=0, iterations=100,
                     gamma_threshold=0.001, per_word_topics=True)

# get document topic distribution:
doc_topic_dist = get_corpus_topics(_texts, lda_model)
# print(lda_model.show_topics(num_words=20))
topic_terms = lda_model.show_topics(num_words=50)

# get top words for each topic:
topic_term_dict = {}
rel_terms = []
for topic_dist in topic_terms:
    topic_id = topic_dist[0]
    topic_term_dict[topic_id] = {}
    topic_terms_str = topic_dist[1]  # e.g. '0.015*"word" + 0.012*"other" + ...'
    for _split in topic_terms_str.split('+'):
        topic_term_prob = _split.split('*')[0]
        topic_term = str(_split.split('*')[1]).replace('"', '').strip()
        topic_term_dict[topic_id][topic_term] = float(topic_term_prob)
        # rel_terms.append(topic_term)
# print(topic_term_dict)

picked_sentences[key] = {}
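# Note: the string parsing above can be avoided entirely -- show_topics(formatted=False)
# already yields (word, probability) tuples. A minimal equivalent, assuming the same lda_model:
topic_term_dict_alt = {topic_id: {word: float(prob) for word, prob in term_probs}
                       for topic_id, term_probs in lda_model.show_topics(num_words=50,
                                                                          formatted=False)}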
# Imports assumed by this class (gensim 3.x-style API, where the Mallet wrapper still ships with gensim)
import os
from pprint import pprint

import matplotlib.pyplot as plt
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
from gensim import corpora
from gensim.models import CoherenceModel, LdaMulticore
from gensim.models.phrases import Phrases, Phraser
from gensim.models.wrappers import LdaMallet
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords


class GensimMalletTopicExtractor:

    def __init__(self, language='english', stopwords_extent=None):
        self.language2la = {
            'english': 'en',
            'french': 'fr',
            'spanish': 'es'
        }
        if language not in self.language2la:
            raise ValueError('Language must be "english", "french" or "spanish"')
        self.language = language
        self.stop_words = stopwords.words(self.language)
        # isinstance() check (the original `stopwords_extent is str` never matched actual values)
        if isinstance(stopwords_extent, (str, list)):
            self.stop_words.extend(stopwords_extent)
        self.df_topic_sents_keywords = None
        self.bigram = None
        self.bigram_phraser = None
        self.trigram = None
        self.trigram_phraser = None
        self.vis = None
        self.data = None
        self.data_words = None
        self.data_words_nostops = None
        self.data_words_bigrams = None
        self.data_words_trigrams = None
        self.nlp = None
        self.data_lemmatized = None
        self.id2word = None
        self.texts = None
        self.corpus = None
        self.mallet_path = None
        self.lda_model = None
        self.coherence_model_lda = None
        self.coherence_lda = None
        self.coherence_values = []
        self.model_list = []
        self.optimal_number_of_topics = None
        self.optimal_model = None
        self.optimal_topics = None

    @staticmethod
    def sent_to_words(sentences, remove_punctuation=True):
        for sentence in sentences:
            # deacc=True removes punctuation
            yield simple_preprocess(str(sentence), deacc=remove_punctuation)

    def remove_stopwords(self, texts):
        return [[word for word in simple_preprocess(str(doc))
                 if word not in self.stop_words] for doc in texts]

    def make_bigrams(self, texts):
        self.bigram = Phrases(self.data_words, min_count=5, threshold=100)
        self.bigram_phraser = Phraser(self.bigram)
        return [self.bigram_phraser[doc] for doc in texts]

    def make_trigrams(self, texts):
        tokens_ = self.bigram_phraser[texts]
        self.trigram = Phrases(tokens_, threshold=100)
        self.trigram_phraser = Phraser(self.trigram)
        return [self.trigram_phraser[self.bigram_phraser[doc]] for doc in texts]

    def lemmatization(self, texts, allowed_postags=None):
        """https://spacy.io/api/annotation"""
        if allowed_postags is None:
            allowed_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
        texts_out = []
        for sent in texts:
            doc = self.nlp(" ".join(sent))
            texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
        return texts_out

    def view_terms_frequency(self, text_id, first_words=20):
        # Human-readable format of one document's bag-of-words: (term, frequency) pairs
        list_ = [(self.id2word[id_], freq) for id_, freq in self.corpus[text_id][:first_words]]
        pprint(list_)

    def visualize_lda(self):
        # Visualize the topics
        # pyLDAvis.enable_notebook()
        self.vis = pyLDAvis.gensim.prepare(self.lda_model, self.corpus, self.id2word)
        print(self.vis)

    def instanciate_model(self, num_topics, passes, iterations,
                          enable_mallet, optimize_interval, topic_threshold,
                          show_topics_on_creation=False):
        if enable_mallet is True:
            # Download File: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
            os.environ.update({'MALLET_HOME': r'C:/mallet-2.0.8/'})
            self.mallet_path = 'C:\\mallet-2.0.8\\bin\\mallet'  # update this path
            self.lda_model = LdaMallet(self.mallet_path,
                                       corpus=self.corpus,
                                       num_topics=num_topics,
                                       id2word=self.id2word,
                                       iterations=iterations,
                                       optimize_interval=optimize_interval,
                                       topic_threshold=topic_threshold)
            print('Mallet LDA model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.show_topics(formatted=False))
        else:
            self.lda_model = LdaMulticore(corpus=self.corpus,
                                          id2word=self.id2word,
                                          num_topics=num_topics,
                                          random_state=100,
                                          chunksize=500,
                                          passes=passes,
                                          iterations=iterations,
                                          per_word_topics=True)
            print('LDA_MultiCore model built\n')
            if show_topics_on_creation is True:
                pprint(self.lda_model.print_topics())

    def extract_topics(self, data, num_topics, passes=10, iterations=500,
                       enable_mallet=True, optimize_interval=0, topic_threshold=0.0):
        self.data = data
        print('\nEXTRACTING ' + str(num_topics) + ' TOPICS')
        self.data_words = list(self.sent_to_words(self.data, True))
        # Remove Stop Words
        print('\nRemoving stopwords')
        self.data_words_nostops = self.remove_stopwords(self.data_words)
        # Form Bigrams
        print('Looking for bigrams')
        self.data_words_bigrams = self.make_bigrams(self.data_words_nostops)
        # Form Trigrams
        print('Looking for trigrams')
        self.data_words_trigrams = self.make_trigrams(self.data_words_nostops)
        # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
        # python3 -m spacy download en
        print('Loading Spacy with ' + self.language + ' dictionary')
        self.nlp = spacy.load(self.language2la[self.language], disable=['parser', 'ner'])
        # Do lemmatization keeping only noun, adj, vb, adv
        print('Lemmatizing')
        self.data_lemmatized = self.lemmatization(self.data_words_trigrams,
                                                  allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
        # Create Dictionary
        print('Creating dictionary')
        self.id2word = corpora.Dictionary(self.data_lemmatized)
        # Create Corpus
        print('Creating corpus')
        self.texts = self.data_lemmatized
        # Term Document Frequency
        print('Computing document frequency')
        self.corpus = [self.id2word.doc2bow(text) for text in self.texts]
        # Build LDA model
        print('\nEnable_mallet is', enable_mallet, '\n')
        self.instanciate_model(num_topics, passes, iterations,
                               enable_mallet, optimize_interval, topic_threshold,
                               show_topics_on_creation=True)
        # print(self.lda_model[self.corpus])
        # Compute Perplexity: a measure of how good the model is (lower is better)
        if hasattr(self.lda_model, 'log_perplexity'):
            print('\nPerplexity: ', self.lda_model.log_perplexity(self.corpus))
        # Compute Coherence Score
        print('\nComputing coherence model')
        self.coherence_model_lda = CoherenceModel(model=self.lda_model,
                                                  texts=self.data_lemmatized,
                                                  dictionary=self.id2word,
                                                  coherence='c_v')
        print('Getting coherence')
        self.coherence_lda = self.coherence_model_lda.get_coherence()
        print('\nCoherence Score: ', self.coherence_lda)
        if enable_mallet is False:
            self.visualize_lda()

    def view_optimal_topics(self, num_words=20):
        pprint(self.optimal_model.print_topics(num_words=num_words))

    def compute_coherence_values(self, limit, start=2, step=3, passes=10,
                                 iterations=500, enable_mallet=True,
                                 optimize_interval=0, topic_threshold=0.0):
        """
        Compute c_v coherence for various numbers of topics

        Parameters:
        ----------
        limit : Max num of topics

        Returns:
        -------
        model_list : List of LDA topic models
        coherence_values : Coherence values corresponding to the LDA model with
                           respective number of topics
        """
        for num_topics in range(start, limit, step):
            print('\n' + '*' * 10 + ' COMPUTING COHERENCE FOR ' + str(num_topics) + ' TOPICS ' + '*' * 10)
            self.instanciate_model(num_topics, passes, iterations,
                                   enable_mallet, optimize_interval, topic_threshold,
                                   show_topics_on_creation=False)
            self.model_list.append(self.lda_model)
            coherence_model = CoherenceModel(model=self.lda_model,
                                             texts=self.data_lemmatized,
                                             dictionary=self.id2word,
                                             coherence='c_v')
            self.coherence_values.append(coherence_model.get_coherence())
        # Show graph
        x = range(start, limit, step)
        plt.plot(x, self.coherence_values)
        plt.xlabel("Num Topics")
        plt.ylabel("Coherence score")
        plt.legend(["coherence_values"], loc='best')
        plt.show()
        # Print the coherence scores
        for m, cv in zip(x, self.coherence_values):
            print("Num Topics =", m, " has Coherence Value of", round(cv, 4))
        optimal_model_index = self.coherence_values.index(max(self.coherence_values))
        # Account for the step size between tested topic counts
        self.optimal_number_of_topics = start + optimal_model_index * step
        self.optimal_model = self.model_list[optimal_model_index]
        print('\nOptimal number of topics is ' + str(self.optimal_number_of_topics) +
              ' with coherence score : ' + str(self.coherence_values[optimal_model_index]))
        self.optimal_topics = self.optimal_model.show_topics(num_topics=self.optimal_number_of_topics,
                                                             num_words=20, formatted=False)
        self.view_optimal_topics()

    def format_topics_sentences(self, ldamodel=None):
        if ldamodel is None and self.optimal_model is not None:
            ldamodel = self.optimal_model
        elif ldamodel is None and self.lda_model is not None:
            ldamodel = self.lda_model
        # Init output
        sent_topics_df = pd.DataFrame()
        # Get main topic in each document
        for i, row in enumerate(ldamodel[self.corpus]):
            row = sorted(row, key=lambda x: (x[1]), reverse=True)
            # Get the Dominant topic, Perc Contribution and Keywords for each document
            for j, (topic_num, prop_topic) in enumerate(row):
                if j == 0:  # => dominant topic
                    wp = ldamodel.show_topic(topic_num)
                    topic_keywords = ", ".join([word for word, prop in wp])
                    sent_topics_df = sent_topics_df.append(
                        pd.Series([int(topic_num), round(prop_topic, 4), topic_keywords]),
                        ignore_index=True)
                else:
                    break
        sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
        # Add original text to the end of the output
        contents = pd.Series(self.data)
        sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
        return sent_topics_df

    def get_most_representative_documents(self):
        # Group top 5 sentences under each topic
        sent_topics_sorteddf_mallet = pd.DataFrame()
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Format
        df_dominant_topic = self.df_topic_sents_keywords.reset_index()
        df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
        sent_topics_outdf_grpd = self.df_topic_sents_keywords.groupby('Dominant_Topic')
        for i, grp in sent_topics_outdf_grpd:
            sent_topics_sorteddf_mallet = pd.concat(
                [sent_topics_sorteddf_mallet,
                 grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)],
                axis=0)
        # Reset Index
        sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
        # Format
        sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
        # Show
        sent_topics_sorteddf_mallet.head()
        for i in range(len(sent_topics_sorteddf_mallet)):
            print(i, sent_topics_sorteddf_mallet.loc[i, 'Text'])

    def get_topic_distribution(self):
        if self.df_topic_sents_keywords is None:
            self.df_topic_sents_keywords = self.format_topics_sentences()
        # Number of Documents for Each Topic
        topic_counts = self.df_topic_sents_keywords['Dominant_Topic'].value_counts()
        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts / topic_counts.sum(), 4)
        # Topic Number and Keywords
        topic_num_keywords = self.df_topic_sents_keywords[['Dominant_Topic', 'Topic_Keywords']]
        # Concatenate Column wise
        df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis=1)
        # Change Column names
        df_dominant_topics.columns = ['Dominant_Topic', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
        # Show
        print(df_dominant_topics)
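# Example usage -- a sketch under assumptions: `docs` is a list of raw text strings, the NLTK
# stopword lists and the spaCy 'en' model are installed, and Mallet is only required when
# enable_mallet=True; pass enable_mallet=False to fall back to gensim's LdaMulticore.
docs = ["The economy is working better than ever.",
        "The central bank raised interest rates again."]
extractor = GensimMalletTopicExtractor(language='english')
extractor.extract_topics(docs, num_topics=2, passes=5, iterations=100, enable_mallet=False)
extractor.get_topic_distribution()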
def run(self, args):
    # mlflow logs
    experiment_name = "dev-LessonsClustering"
    if args.environment == "production":
        experiment_name = "LessonsClustering"
    elif args.environment == "staging":
        experiment_name = "staging-LessonsClustering"
    mlflow.set_experiment(experiment_name)
    client = mlflow.tracking.MlflowClient()
    with mlflow.start_run():
        log_param("environment", args.environment)
        log_param("mode", args.mode)
        log_param("update_related_lessons", args.update_related_lessons)

        # Get lessons data from database
        df = ef.getLessons(self.credentials)

        # Pre Processing
        lessonsData = df[df['isLesson'] == True]
        # Keep only rows where 'summary' is not NaN (NaN != NaN)
        lessonsData = lessonsData[lessonsData['summary'] == lessonsData['summary']]
        raw_paragraphs = lessonsData['paragraph']
        urls = lessonsData['urlToFile']
        raw_sentences = raw_paragraphs
        ids = lessonsData['_id']
        sentences = [line.split(' ') for line in raw_sentences]
        stop_words = stopwords.words('english')
        stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'äô', 'äù', 'äì'])
        words_to_remove = ['iii', 'project']

        def remove_stopwords(texts):
            return [[word for word in simple_preprocess(str(doc))
                     if word not in stop_words] for doc in texts]

        def remove_words(texts):
            return [[word for word in simple_preprocess(str(doc))
                     if word not in words_to_remove] for doc in texts]

        def remove_word_length_2(texts):
            allSentences = []
            for doc in texts:
                newWords = []
                for word in doc:
                    if len(word) > 2:
                        newWords.append(word)
                allSentences.append(newWords)
            return allSentences

        def replace_adb_special_characters(texts):
            return [[word.replace('‚Äôs', "'s ").replace('O‚ÄôSmach', "0")
                         .replace('äù', "").replace('äô', "").replace('äì', "")
                     for word in doc] for doc in texts]

        def get_wordnet_pos(word):
            tag = nltk.pos_tag([word])[0][1][0].upper()
            tag_dict = {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
            return tag_dict.get(tag, wordnet.NOUN)

        sentences = replace_adb_special_characters(sentences)
        data_words_nostops = remove_stopwords(sentences)
        lemmatizer = WordNetLemmatizer()
        lemmatized_output = []
        for paragraph in data_words_nostops:
            lemmatized_output.append([lemmatizer.lemmatize(word, get_wordnet_pos(word))
                                      for word in paragraph])
        sentences = remove_words(lemmatized_output)
        sentences_no_length_2 = remove_word_length_2(sentences)
        sentences = sentences_no_length_2

        id2word = corpora.Dictionary(sentences)
        texts = sentences
        corpus = [id2word.doc2bow(text) for text in texts]

        def compute_coherence_values(corpus, dictionary, k, a, b):
            lda_model = LdaMulticore(corpus=corpus,
                                     id2word=id2word,
                                     num_topics=k,
                                     random_state=100,
                                     chunksize=100,
                                     passes=10,
                                     alpha=a,
                                     eta=b,
                                     per_word_topics=True)
            coherence_model_lda = CoherenceModel(model=lda_model,
                                                 texts=sentences,
                                                 dictionary=id2word,
                                                 coherence='c_v')
            return coherence_model_lda.get_coherence()

        # Fine Tuning
        if args.mode == "fine_tuning":
            grid = {}
            grid['Validation_Set'] = {}
            # Topics range
            min_topics = 2
            max_topics = args.max_number_of_topics
            step_size = 1
            topics_range = range(min_topics, max_topics + 1, step_size)
            # Alpha parameter
            alpha = list(np.arange(0.01, 1, 0.3))
            # alpha.append('symmetric')
            # alpha.append('asymmetric')
            # Beta parameter
            beta = list(np.arange(0.01, 1, 0.3))
            # beta.append('symmetric')
            # Validation sets
            # num_of_docs = len(corpus)
            corpus_sets = [
                # ClippedCorpus(corpus, int(num_of_docs*0.25)),
                # ClippedCorpus(corpus, int(num_of_docs*0.5)),
                # ClippedCorpus(corpus, int(num_of_docs*0.75)),
                corpus
            ]
            # corpus_title = [
            #     '25% Corpus',
            #     '50% Corpus',
            #     '75% Corpus',
            #     '100% Corpus'
            # ]
            model_results = {
                # 'Validation_Set': [],
                'Number Of Topics': [],
                'Alpha': [],
                'Beta': [],
                'Coherence': []
            }
            model_results_2 = {
                'Number Of Topics': [],
                'Average Coherence': []
            }
            maxCoherence = 0
            maxCoherenceK = 2
            maxCoherenceA = 0.01
            maxCoherenceB = 0.01
            for i in range(len(corpus_sets)):
                for k in topics_range:
                    for a in alpha:
                        for b in beta:
                            cv = compute_coherence_values(corpus=corpus_sets[i],
                                                          dictionary=id2word,
                                                          k=k, a=a, b=b)
                            if cv > maxCoherence:
                                maxCoherence = cv
                                maxCoherenceK = k
                                maxCoherenceA = a
                                maxCoherenceB = b
                            # model_results['Validation_Set'].append(corpus_title[i])
                            model_results['Number Of Topics'].append(k)
                            model_results['Alpha'].append(a)
                            model_results['Beta'].append(b)
                            model_results['Coherence'].append(cv)
                            customStep = int(str(k) +
                                             "{:.2f}".format(a).replace(".", "") +
                                             "{:.2f}".format(b).replace(".", ""))
                            log_metric("coherence", cv, step=customStep)
                    # Per-k summary (records the last coherence value for this topic count)
                    model_results_2['Number Of Topics'].append(k)
                    model_results_2['Average Coherence'].append(cv)
                    log_metric("average_coherence", cv, step=k)
            log_metric("max_coherence", maxCoherence)
            log_metric("number_of_topics_of_max_coherence", maxCoherenceK)
            log_metric("alpha_of_max_coherence", maxCoherenceA)
            log_metric("beta_of_max_coherence", maxCoherenceB)
            pd.DataFrame(model_results).to_csv(defaults.DATA_PATH + "fine-tuning.csv", index=False)
            pd.DataFrame(model_results_2).to_csv(defaults.DATA_PATH + "fine-tuning-2.csv", index=False)
            log_artifact(defaults.DATA_PATH + "fine-tuning.csv", "data/")
            log_artifact(defaults.DATA_PATH + "fine-tuning-2.csv", "data/")

        # Train LDA model
        elif args.mode == "train":
            log_metric("number_of_topics", args.number_of_topics)
            log_metric("alpha", args.alpha)
            log_metric("beta", args.beta)
            lda_model = LdaMulticore(corpus=corpus,
                                     id2word=id2word,
                                     num_topics=args.number_of_topics,
                                     random_state=200,
                                     chunksize=100,
                                     passes=10,
                                     alpha=args.alpha,
                                     eta=args.beta,
                                     per_word_topics=True)
            cv = compute_coherence_values(corpus=corpus, dictionary=id2word,
                                          k=args.number_of_topics, a=args.alpha, b=args.beta)
            log_metric("coherence", cv)
            lda_model.save(defaults.MODEL_PATH + "lda.model")
            log_artifact(defaults.MODEL_PATH + "lda.model", "models/")

        # Predict LDA model
        elif args.mode == "predict":
            log_param("run_id_model", args.run_id_model)
            number_of_topics = int(args.number_of_topics)
            # Fall back to the CLI-provided values; otherwise alpha/beta would be
            # undefined below when no run id is given.
            alpha = float(args.alpha)
            beta = float(args.beta)
            if not args.run_id_model == "":
                data = client.get_run(args.run_id_model).data
                number_of_topics = int(data.params['number_of_topics'])
                alpha = float(data.params['alpha'])
                beta = float(data.params['beta'])
            log_metric("number_of_topics", number_of_topics)
            log_metric("alpha", alpha)
            log_metric("beta", beta)
            cv = compute_coherence_values(corpus=corpus, dictionary=id2word,
                                          k=number_of_topics, a=alpha, b=beta)
            log_metric("coherence", cv)

            # Download and load the LDA model
            modelFilePath = defaults.MODEL_PATH + "lda.model"
            af.downloadLDAModel(args, modelFilePath)
            lda_model = LdaModel.load(modelFilePath)
            # lda_model.save(defaults.MODEL_PATH + "lda.model")
            # log_artifact(defaults.MODEL_PATH + "lda.model", "models/")

            # Keyword weights
            x = lda_model.show_topics(num_topics=number_of_topics, num_words=50, formatted=False)
            keywordWeights = []
            topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]
            for tp in x:
                words = []
                weights = []
                for pair in tp[1]:
                    words.append(pair[0])
                    weights.append(int(pair[1] * 10000))
                keywordWeights.append(weights)

            # Top topics per paragraph
            topicNumbers = []
            for c in range(len(corpus)):
                maxProbability = 0
                indexOfMax = 0
                topTopics = []
                topTopicProbabilities = []
                for topicNumber in lda_model.get_document_topics(corpus[c]):
                    topTopics.append(topicNumber[0])
                    topTopicProbabilities.append(topicNumber[1])
                topTopicsSorted = [x for _, x in sorted(zip(topTopicProbabilities, topTopics),
                                                        reverse=True)]
                topicNumbers.append(topTopicsSorted)
            lessonsData['newTopTopics'] = topicNumbers
            lessonsData['topTopics'] = topicNumbers

            # Most probable topic per paragraph
            topTopics = []
            for index, row in lessonsData.iterrows():
                if (row['topTopics']):
                    topTopics.append(row['topTopics'][0])
                else:
                    topTopics.append(-1)
            lessonsData['topic'] = topTopics

            # Frequencies of topic keywords and number of PCRs per topic
            topics = pd.DataFrame()
            topicKeywords = []
            allKeywords = []
            topicIds = []
            for topic, words in topics_words:
                allKeywords.append(words)
                topicIds.append(topic)
            topics['key'] = topicIds
            topics['keywords'] = allKeywords
            topics['oldFrequencies'] = [[0] * len(keywords) for keywords in allKeywords]
            topics['numberOfLessons'] = 0
            topics['PCRs'] = [[] for i in range(len(topics))]
            topics['numberOfPCRs'] = 0
            for sentenceTopicNumbers, sentenceURL in zip(topicNumbers, urls):
                for topicNumber in sentenceTopicNumbers:
                    topics.at[topicNumber, 'numberOfLessons'] = topics.at[topicNumber, 'numberOfLessons'] + 1
                    topics.at[topicNumber, 'PCRs'].append(sentenceURL)
            for index, row in topics.iterrows():
                topics.at[index, 'numberOfPCRs'] = len(set(topics.at[index, 'PCRs']))
            topics = topics.drop(columns=['PCRs'])

            # Frequencies of words per sentence per topic
            topics['oldFrequencies'] = [[0] * len(keywords) for keywords in allKeywords]
            for index, row in topics.iterrows():
                topicNumber = topics.at[index, 'key']
                topicKeywords = topics.at[index, 'keywords']
                topicKeywordsFrequencies = topics.at[index, 'oldFrequencies']
                for sentence, sentenceTopicNumbers in zip(sentences, topicNumbers):
                    for sentenceTopicNumber in sentenceTopicNumbers:
                        if topicNumber == sentenceTopicNumber:
                            for word in sentence:
                                if word in topicKeywords:
                                    indexOfWord = topicKeywords.index(word)
                                    topicKeywordsFrequencies[indexOfWord] = topicKeywordsFrequencies[indexOfWord] + 1
                topics.at[index, 'oldFrequencies'] = topicKeywordsFrequencies
            topics['frequencies'] = keywordWeights

            # Top word per topic
            topicTopWords = []
            for index, row in topics.iterrows():
                topicTopWords.append(row['keywords'][0])
            topics['topWord'] = topicTopWords

            # Adjacent topics
            # pyLDAvis.enable_notebook()
            vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word)
            topics['x'] = 1.0
            topics['y'] = 1.0
            for topic, x in zip(list(vis.topic_coordinates.index), list(vis.topic_coordinates.x)):
                topics.at[topic, 'x'] = float(x)
            for topic, y in zip(list(vis.topic_coordinates.index), list(vis.topic_coordinates.y)):
                topics.at[topic, 'y'] = float(y)

            import math

            def calculateDistance(x1, y1, x2, y2):
                dist = math.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)
                return dist

            distanceMatrix = []
            allDistances = []
            c1 = 0
            topicsX = topics['x'].tolist()
            topicsY = topics['y'].tolist()
            for tx1, ty1 in zip(topicsX, topicsY):
                distances = []
                for tx2, ty2 in zip(topicsX, topicsY):
                    distance = calculateDistance(tx1, ty1, tx2, ty2)
                    if not distance:
                        distance = 999
                    else:
                        allDistances.append(distance)
                    distances.append(distance)
                distanceMatrix.append(distances)
                c1 = c1 + 1
            percentile20 = np.percentile(allDistances, 20)
            numberOfAdjacent = 0
            numberOfNodes = len(distanceMatrix)
            allAdjacentTopics = []
            for distances in distanceMatrix:
                adjacentTopics = []
                for index, distance in zip(range(len(distances)), distances):
                    if distance <= percentile20:
                        adjacentTopics.append(index)
                allAdjacentTopics.append(adjacentTopics)
                numberOfAdjacent = numberOfAdjacent + len(adjacentTopics)
            numberOfAdjacent = numberOfAdjacent / 2
            pairs = []
            for index, adjacentTopicList in zip(range(len(allAdjacentTopics)), allAdjacentTopics):
                for adjacentTopic in adjacentTopicList:
                    pairs.append(sorted([index, adjacentTopic]))
            pairs.sort()
            dedupedPairs = list(pairs for pairs, _ in itertools.groupby(pairs))
            topWordPairs = []
            for pair in dedupedPairs:
                topWordPairs.append([topicTopWords[pair[0]], topicTopWords[pair[1]]])
            topics['adjacentTopics'] = allAdjacentTopics

            # Save topics data
            ef.deleteIndex(self.credentials, "topics")
            ef.saveTopics(self.credentials, topics)

            # Lesson strength
            maxLessonStrength = topics['numberOfPCRs'].sum()
            lessonStrengths = []
            for index, row in lessonsData.iterrows():
                topicNumbers = row['topTopics']
                lessonStrength = 0
                for topicNumber in topicNumbers:
                    lessonStrength = lessonStrength + topics.at[topicNumber, 'numberOfPCRs']
                lessonStrengths.append(lessonStrength / maxLessonStrength)
            lessonsData['lessonStrength'] = lessonStrengths

            # Save lessons data
            ef.updateSentences(self.credentials, lessonsData)
            mf.backupIndex(self.credentials, "sentences")
            mf.backupIndex(self.credentials, "topics")

        # Update related lessons
        # Get TFIDF model
        if args.update_related_lessons == "True":
            tfidf = TfidfModel(corpus, smartirs='ntc')
            tfidf_corpus = []
            for doc in corpus:
                tfidf_corpus.append(tfidf[doc])
            tfidf_mat = matutils.corpus2dense(tfidf_corpus, num_terms=len(id2word.token2id))
            tfidf_mat_transpose = tfidf_mat.transpose()
            tfidfDF = pd.DataFrame(
                data=tfidf_mat_transpose[0:, 0:],
                index=[i for i in range(tfidf_mat_transpose.shape[0])],
                columns=['' + str(i) for i in range(tfidf_mat_transpose.shape[1])])
            tfidfDF['id'] = ids.tolist()
            # Save related lessons
            cf.updateRelatedLessons(self.credentials, tfidfDF)
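# A sketch of the argparse wiring this run() method appears to expect. The argument names are
# inferred from the attributes accessed above (args.environment, args.mode, args.alpha, ...);
# the defaults are illustrative, not taken from the original project.
import argparse

parser = argparse.ArgumentParser(description="LDA topic clustering of lessons")
parser.add_argument("--environment", default="dev")              # dev / staging / production
parser.add_argument("--mode", default="train")                   # fine_tuning / train / predict
parser.add_argument("--update_related_lessons", default="False")
parser.add_argument("--number_of_topics", type=int, default=10)
parser.add_argument("--max_number_of_topics", type=int, default=20)
parser.add_argument("--alpha", type=float, default=0.01)
parser.add_argument("--beta", type=float, default=0.01)
parser.add_argument("--run_id_model", default="")
args = parser.parse_args()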
# Build another model using multicore LDA implementation and compare the coherence score

from gensim.models import LdaMulticore

ldamulticore = LdaMulticore(corpus=corpus,
                            num_topics=num_topics,
                            id2word=id2word,
                            workers=4,
                            eval_every=None,
                            passes=20,
                            batch=True,
                            per_word_topics=True)


# In[ ]:


# Display topics
from pprint import pprint
pprint(ldamulticore.show_topics(num_words=5, formatted=False))


# In[ ]:


# Compute Coherence Score for the multicore model
processed_data = pickle.load(open("processed_data_100_QAT.pkl", "rb"))
coherence_model_ldamulticore = CoherenceModel(model=ldamulticore,
                                              texts=processed_data,
                                              dictionary=id2word,
                                              coherence='c_v')
coherence_ldamulticore = coherence_model_ldamulticore.get_coherence()
print('Coherence Score: ', coherence_ldamulticore)


# In[ ]:


# Build another model using LDA implementation and compare the coherence score with the two previous models
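# The notebook is cut off where the plain LdaModel variant would start; a minimal sketch of that
# comparison, reusing the same corpus, id2word, and processed_data (the parameter choices here
# are illustrative, not from the original notebook).
from gensim.models import LdaModel

lda_singlecore = LdaModel(corpus=corpus,
                          num_topics=num_topics,
                          id2word=id2word,
                          passes=20,          # illustrative; roughly matches the multicore run above
                          eval_every=None)

coherence_model_lda = CoherenceModel(model=lda_singlecore,
                                     texts=processed_data,
                                     dictionary=id2word,
                                     coherence='c_v')
print('Coherence Score: ', coherence_model_lda.get_coherence())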
### Create BOW corpus ###
corpus = [dictionary.doc2bow(text) for text in text_list]
print("--- Corpus made: %s minutes ---" % round(((time.time() - start_time) / 60), 2))

start_lda_time = time.time()

#################################
######### Train LDA ############
#################################
lda_model = LdaMulticore(corpus, num_topics=4, id2word=dictionary, passes=150, workers=3)
final_topics = lda_model.show_topics()
print("--- LDA trained : %s minutes ---" % round(((time.time() - start_lda_time) / 60), 2))

#################################
##### Display WordCloud #########
#################################
curr_topic = 0
wc = WordCloud(background_color="black", max_words=2000, max_font_size=40,
               width=120, height=120, random_state=42)
for line in final_topics:
    line = line[1]
    scores = [float(x.split("*")[0]) for x in line.split(" + ")]
    words = [x.split("*")[1] for x in line.split(" + ")]
    freqs = []
    for word, score in zip(words, scores):
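        # (assumed continuation -- the original snippet is cut off at this point)
        # Collect (word, weight) pairs for this topic; WordCloud consumes a word->weight dict.
        freqs.append((word.strip().strip('"'), score))

    # Render one cloud per topic (the plotting calls assume matplotlib.pyplot is imported as plt).
    wc.generate_from_frequencies(dict(freqs))
    plt.figure()
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title("Topic #" + str(curr_topic))
    plt.show()
    curr_topic += 1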