def get_lda_vis(clustering_pipeline):
    """Generates a topic-term 2D visualization using pyLDAvis.

    Parameters
    ----------
    clustering_pipeline : class reference
        The current modeling pipeline
    """
    with st.spinner(
            "Loading visualization... Once ready, save the generated HTML file shown below."
    ):
        ldavis = clustering_pipeline.generate_ldavis()
        # Temporary fix for a pyLDAvis validation error:
        # https://stackoverflow.com/questions/47998685/pyldavis-validation-error-on-trying-to-visualize-topics
        # Not able to comment out line 375 of _prepare
        # (_input_validate(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency))
        # in the dockerized version.
        if ldavis == "This visualization is currently not available.":
            st.warning(ldavis)
            return

        st.markdown(
            "Set view to widescreen or open HTML in new tab for the best experience."
        )
        ldavis_html = pyLDAvis.prepared_data_to_html(ldavis)
        # some strings <-> bytes conversions necessary here
        b64 = base64.b64encode(ldavis_html.encode()).decode()
        href = f'<a href="data:text/html;base64,{b64}">Download HTML File</a> \
            (right-click and save as <some_name>.html)'
        st.markdown(href, unsafe_allow_html=True)
        iframe = f'<iframe width="100%" height="900" src="data:text/html;base64,{b64}">The “iframe” tag is not supported by your browser.</iframe>'
        st.write(iframe, unsafe_allow_html=True)
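# A simpler alternative to the base64 iframe above, assuming a Streamlit version
# that ships streamlit.components.v1 (the same helper appears in the lda_model and
# hdp_model snippets below). A minimal sketch, not the original author's code:
import streamlit.components.v1 as components

html = pyLDAvis.prepared_data_to_html(ldavis)
components.html(html, width=1280, height=900, scrolling=True)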
def pipeline(self, model, corpus, dictionary, K):
    viz = pyLDAvis.gensim.prepare(model, corpus, dictionary)
    html = pyLDAvis.prepared_data_to_html(viz, template_type="general")
    self.viz = viz
    self.html = html
    self.K = K
    self.save_html()
def get_topic_by_lda(dictionary_list, number_topics=5, ldavis_url=None, ldavis_css_url=None):
    dictionary = corpora.Dictionary(dictionary_list)
    dictionary.filter_extremes(no_below=0, no_above=1.0)
    corpus = [dictionary.doc2bow(text) for text in dictionary_list]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics,
                                               id2word=dictionary, passes=20)
    coherence_model_object = CoherenceModel(model=ldamodel, corpus=corpus,
                                            texts=dictionary_list,
                                            dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model_object.get_coherence()
    topic_list = ldamodel.show_topics(num_topics=number_topics, num_words=30, formatted=False)
    data_prepared_object = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, n_jobs=1)
    formatted_html = pyLDAvis.prepared_data_to_html(
        data_prepared_object, ldavis_url=ldavis_url, ldavis_css_url=ldavis_css_url)
    return formatted_html, topic_list, coherence_score
def lda_model(data, corpus, dictionary, num_topics):
    lda = models.LdaModel(corpus, id2word=dictionary, num_topics=num_topics)
    # Visualize topics with pyLDAvis
    lda_data = gensimvis.prepare(lda, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(lda_data)
    components.v1.html(html_string, width=1280, height=1024)
    # Visualize documents with t-SNE
    visualize_topics(data, corpus, lda, num_topics)
def __render_model(self, model, corpus, dict, ntopics):
    data = pyLDAvis.gensim.prepare(model, corpus, dict)
    div_id = "pyldavis"
    html = pyLDAvis.prepared_data_to_html(data, template_type="simple", visid=div_id)
    found = '!function(LDAvis){' + re.search(
        r"\!function\(LDAvis\)\{(.+?)\}\(LDAvis\)\;", html,
        re.MULTILINE | re.DOTALL).group(1) + '}(LDAvis);'
    # print("Found->", found)
    return found
def topicmodel_forproyect(id_proyect):
    df_comments = get_data(id_proyect)
    # list_mask = np.unique(df_comments.project_id)
    # mask = df_comments["project_id"] == id_proyect
    # df2 = pd.read_excel("datos_congresista_virtual.xlsx", sheet_name="clasificaciones")
    num_topics = 5
    df2 = df_comments.body
    df2 = df2.str.lower()
    pattern = r"@([A-Za-z0-9_]+)"
    df2 = df2.str.replace(pattern, '')
    elements = np.array(df2.tolist())
    tokenizer = RegexpTokenizer(r'\w+')
    es_stop = get_stop_words('es')
    p_stemmer = PorterStemmer()
    texts = []
    print(str(id_proyect))
    for i in elements:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in es_stop]
        # stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stopped_tokens)
        # texts.append(stemmed_tokens)
        print(i)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    # ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=num_topics, id2word=dictionary, distributed=True, passes=20)
    try:
        ldamodel = gensim.models.ldamulticore.LdaMulticore(
            corpus, num_topics=num_topics, id2word=dictionary, passes=20)
    except ValueError:
        return "Empty collection. Parameters appear to be missing or entered incorrectly."
    import pyLDAvis.gensim
    import pyLDAvis
    vis_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.display(vis_data)
    return pyLDAvis.prepared_data_to_html(vis_data)
def corp_eval(dictionary, tokens, corpus, q_count, num_of_topics):
    """Evaluate the corpus and produce the gensim visualization."""
    i = len(tokens)
    lda = gensim.models.ldamodel.LdaModel(corpus, id2word=dictionary,
                                          num_topics=num_of_topics, passes=1,
                                          alpha='symmetric', eta=None)
    corpus = [dictionary.doc2bow(token) for token in tokens]
    logging.debug(dictionary.token2id)
    logging.debug(viewitems(dictionary.dfs))
    print(Fore.GREEN + "Producing LDA analysis for question: ", q_count, Style.RESET_ALL)
    print(lda)
    vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
    print(Fore.YELLOW + "These are the current topics: " + Style.RESET_ALL)
    print(lda.print_topics(i))
    print(Fore.CYAN + "Opening up visualization in a new tab in the browser...",
          Style.RESET_ALL)
    # Write the visualization HTML to a file instead of using the pyLDAvis show
    # function, because show starts a server that can display only one file at a time.
    vis_html_text = pyLDAvis.prepared_data_to_html(vis)
    vis_html_file_name = "vis" + str(q_count) + ".html"
    with open(vis_html_file_name, "w") as vis_html_file:
        vis_html_file.write(vis_html_text)
    # Get the path to the edurate_gensim.py module, which is in the same directory
    # as the HTML file. This path is used to build the file path to the HTML
    # that is to be displayed.
    MODULE_NAME = "edurate_gensim.py"
    PATH_TO_MODULE = inspect.stack()[0][1]
    # Remove the module name from the path so that the path only reaches the
    # directory where the HTML file is located.
    PATH_TO_HTML = PATH_TO_MODULE[:-len(MODULE_NAME)]
    webbrowser.open("file:///" + PATH_TO_HTML + vis_html_file_name, new=2)
    logging.info("Gensim visualization has been displayed.")
    return dictionary.dfs
def pyldavis_run(lda_model_path, document_term_matrix_path, vectorizer_path):
    '''
    Computes the pyLDAvis visualisation of the LDA model.

    Parameters
    ----------
    lda_model_path : str
        Path of the pickle object (serialised python object) of the LDA model.
        This is created in the lda_tsne_model2.py module.
    document_term_matrix_path : str
        Path of the pickle object (serialised python object) of the document-term
        matrix, which is created using the CountVectorizer in the lda_tsne_model2.py module.
    vectorizer_path : str
        Path of the pickle object (serialised python object) of the vectorizer used
        to create the document-term matrix. This is usually the CountVectorizer in
        the lda_tsne_model2.py module.

    Returns
    ----------
    Embedded HTML pyLDAvis visualisation of the LDA model.
    '''
    t0 = time.time()
    # Load the pickle objects from the path parameters.
    lda_model = pickle.load(open(lda_model_path, "rb"))
    document_term_matrix = pickle.load(open(document_term_matrix_path, "rb"))
    cvectorizer = pickle.load(open(vectorizer_path, "rb"))
    # Prepare the pyLDAvis visualisation. There is a choice of dimensionality
    # reduction methods here; TSNE is chosen as it is consistent with the previous
    # analysis in the lda_tsne_model2.py module and has been shown to yield better
    # results than the other available methods.
    prepared_data = prepare(lda_model, document_term_matrix, cvectorizer,
                            mds='tsne', plot_opts={'xlab': '', 'ylab': ''})
    html = pyLDAvis.prepared_data_to_html(prepared_data)
    t1 = time.time()
    print("time for pyldavis: " + str(t1 - t0), file=sys.stdout)
    return html
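# A minimal usage sketch for pyldavis_run; the pickle file names below are
# hypothetical stand-ins for the artifacts that lda_tsne_model2.py would produce.
html = pyldavis_run("lda_model.pkl", "dtm.pkl", "count_vectorizer.pkl")
with open("lda_vis.html", "w") as f:
    f.write(html)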
def get_topic_by_lda(dictionary_list, number_topics=5, ldavis_url=None, ldavis_css_url=None):
    dictionary = corpora.Dictionary(dictionary_list)
    dictionary.filter_extremes(no_below=0, no_above=1.0)
    corpus = [dictionary.doc2bow(text) for text in dictionary_list]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=number_topics,
                                               id2word=dictionary, passes=20)
    data_prepared_object = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, n_jobs=1)
    formatted_html = pyLDAvis.prepared_data_to_html(
        data_prepared_object, ldavis_url=ldavis_url, ldavis_css_url=ldavis_css_url)
    return formatted_html
def ldavis(self, input_files, param, tool_id):
    data_to_return = {"data": {}}
    ok_to_process = False

    # Check the tool needs
    # -----
    if "d-model-corpus" in input_files and "d-dictionary-corpus" in input_files and "d-gensimldamodel" in input_files:
        ok_to_process = len(input_files["d-model-corpus"]) and len(
            input_files["d-dictionary-corpus"]) and len(
                input_files["d-gensimldamodel"])
    if not ok_to_process:
        res_err = {"data": {}}
        res_err["data"]["error"] = "Input data missing!"
        return res_err

    corpus = []
    for file_k in input_files["d-model-corpus"]:
        for d in input_files["d-model-corpus"][file_k]:
            corpus.append(d["value"])
    dictionary = None
    for file_k in input_files["d-dictionary-corpus"]:
        dictionary = input_files["d-dictionary-corpus"][file_k]
    ldamodel = None
    for file_k in input_files["d-gensimldamodel"]:
        ldamodel = input_files["d-gensimldamodel"][file_k]

    # Params
    # -----
    # NO PARAMS

    vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary, sort_topics=False)
    html_str = pyLDAvis.prepared_data_to_html(vis)
    data_to_return["data"]["d-ldavis-html"] = {"ldavis": html_str}
    return data_to_return
def main():
    hyperparameters = get_hyperparameters()
    if len(sys.argv) > 1:
        args = vars(utils.parse_args())
        args = {k: v for k, v in args.items() if v is not None}
        hyperparameters.update(args)
    wandb.init(project="bom-topic-modelling", config=hyperparameters)

    lm, corpus, dictionary = train(**hyperparameters)
    lm.save(os.path.join(wandb.run.dir, 'lda.model'))

    # topic difference heatmap
    mdiff, _ = lm.diff(lm, distance='jaccard', num_words=50)
    fig = px.imshow(mdiff, origin='lower', color_continuous_scale='RdBu_r')
    wandb.log({"topic_diff": fig})

    # pyLDAvis
    vis = pyLDAvis.gensim.prepare(lm, corpus, dictionary)
    html = pyLDAvis.prepared_data_to_html(vis)
    wandb.log({"pyLDAvis": wandb.Html(html, inject=False)})
def topic_modelling(data):
    abstracts = []
    for abstract in data:
        # Remove punctuation
        abstract = re.sub('[,\.!?]', '', abstract)
        # Remove numbers
        abstract = re.sub('[0-9]', '', abstract)
        # Convert the abstracts to lowercase
        abstract = abstract.lower()
        abstracts.append(abstract)

    # Split abstracts into snippets of roughly 256 characters,
    # breaking at sentence boundaries where possible
    snippets = []
    for abstract in abstracts:
        if abstract != "abstract not available":
            length = len(abstract)
            index = 0
            n = 256
            while index < length:
                i = abstract.rfind(". ", index, index + n)
                if i == -1 or i == index:
                    i = index + n
                text = abstract[index:i + 2]
                index = i + 2
                snippets.append(text)

    # Creating LDA
    # number_topics = 5
    tf_vectorizer = CountVectorizer(stop_words='english')
    tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
    dtm_tfidf = tfidf_vectorizer.fit_transform(snippets)
    lda_tfidf = LDA(random_state=0)
    lda_tfidf.fit(dtm_tfidf)

    # Visualizing LDA
    data = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer, mds='mmds')
    html = pyLDAvis.prepared_data_to_html(data, template_type="simple")
    return html
def show_vis(vis):
    # Write the visualization HTML to a file instead of using the pyLDAvis show
    # function, because show starts a server that can display only one file at a time.
    print(Fore.CYAN + "Opening up visualization in a new tab in the browser...",
          Style.RESET_ALL)
    vis_html_text = pyLDAvis.prepared_data_to_html(vis)
    vis_html_file_name = defaults.GENSIM_OUTPUT_FILENAME
    with open(vis_html_file_name, "a") as vis_html_file:
        vis_html_file.write(vis_html_text)
    # Get the path to the refl_gensim.py module, which is in the same directory
    # as the HTML file. This path is used to build the file path to the HTML
    # that is to be displayed.
    MODULE_NAME = "refl_gensim.py"
    PATH_TO_MODULE = inspect.stack()[0][1]
    # Remove the module name from the path so that the path only reaches the
    # directory where the HTML file is located.
    PATH_TO_HTML = PATH_TO_MODULE[:-len(MODULE_NAME)]
    webbrowser.open("file:///" + PATH_TO_HTML + "e/" + vis_html_file_name, new=2)
    logging.info("Gensim visualization has been displayed.")
    return
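# Note: when no browser hand-off is needed, pyLDAvis.save_html collapses the
# write-to-file step above into one call; it avoids pyLDAvis.show for the same
# single-file-server reason the comments mention. A minimal sketch:
pyLDAvis.save_html(vis, "vis.html")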
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10,
         max_iter=20, time_slice=None, coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)

    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of the time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))

    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=id, time=t, topn=num_topic_word)
                   for id in range(num_topic)] for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic]
                   for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains the topic column name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus,
                                  coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus,
                                  texts=tokenized_doc, coherence='c_v').get_coherence()
                   for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab,
                                term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
def _gsdmm(table, input_col, topic_name='topic', K=10, alpha=0.1, beta=0.1,
           max_iter=50, num_topic_words=3):
    docs = np.array(table[input_col])
    docs_set = [set(doc) for doc in docs]
    docs_preprocessed = [list(doc_set) for doc_set in docs_set]
    vocab_set = list(set.union(*docs_set))
    vocab_size = len(vocab_set)

    # initialize and train a GSDMM model
    mgp = gsdmm_rwalk.MovieGroupProcess(K=K, alpha=alpha, beta=beta, n_iters=max_iter)
    topics = mgp.fit(docs_preprocessed, vocab_size)

    # generate topic table
    topic_word_count = mgp.cluster_word_distribution
    topic_words_raw = [[ind, _count_to_ratio_raw(word_count)]
                       for ind, word_count in enumerate(topic_word_count)
                       if word_count]
    topic_words = [[item[0]] + _gen_table(item[1], num_topic_words)
                   for item in topic_words_raw]

    # reset topic ids
    nonempty_topic_indices = [item[0] for item in topic_words]
    reset_topic_ind = {old_ind: (new_ind + 1)
                       for new_ind, old_ind in enumerate(nonempty_topic_indices)}
    topics = [reset_topic_ind[old_ind] for old_ind in topics]
    topic_words = [[reset_topic_ind[old_item[0]]] + old_item[1:]
                   for old_item in topic_words]

    # generate output dataframes
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Existing table contains the topic column name. Please choose another name."
        }])
    out_table[topic_name] = topics
    columns = ['index', 'vocabularies_weights', 'vocabularies', 'weights']
    topic_table = pd.DataFrame(topic_words, columns=columns)
    topic_table['weights'] = topic_table['weights'].apply(pd.to_numeric)

    # pyLDAvis
    if len(topic_words) == 1:
        html_result = None
    else:
        topic_words_dicts = [item[1] for item in topic_words_raw]
        topic_term_dists = [[topic_words_dict.get(word, 0) for word in vocab_set]
                            for topic_words_dict in topic_words_dicts]
        num_docs = len(topics)
        num_topics = len(topic_words_raw)
        doc_topic_dists = np.zeros((num_docs, num_topics))
        for doc_id, topic_id in enumerate(topics):
            doc_topic_dists[doc_id][topic_id - 1] = 1.0
        doc_lengths = [len(doc) for doc in docs_preprocessed]
        vocab_count = functools.reduce(
            lambda dict_1, dict_2: {word: dict_1.get(word, 0) + dict_2.get(word, 0)
                                    for word in set(dict_1).union(dict_2)},
            topic_word_count)
        term_frequency = [vocab_count.get(word) for word in vocab_set]
        prepared_data = pyLDAvis.prepare(topic_term_dists, doc_topic_dists,
                                         doc_lengths, vocab_set, term_frequency)
        html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'K': K,
              'Alpha': alpha,
              'Beta': beta,
              'Maximum number of iterations': max_iter,
              'Number of words for each topic': num_topic_words}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## GSDMM Result
    | ### Summary
    |
    """))
    if html_result is not None:
        rb.addHTML(html_result)
        rb.addMD(strip_margin("""
        |
        """))
    rb.addMD(strip_margin("""
    | ### Final Number of Topics
    | {num_topics}
    |
    | ### Parameters
    | {params}
    """.format(num_topics=len(topic_words_raw), params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['gsdmm_model'] = mgp
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
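# The snippet above uses pyLDAvis's model-agnostic entry point: any model can be
# visualized by passing raw distributions directly to pyLDAvis.prepare. A minimal
# sketch with invented toy numbers, purely to show the expected shapes (each row
# of the two distribution matrices must sum to 1):
import pyLDAvis

toy = pyLDAvis.prepare(
    topic_term_dists=[[0.6, 0.2, 0.1, 0.1], [0.1, 0.1, 0.2, 0.6]],  # 2 topics x 4 terms
    doc_topic_dists=[[0.9, 0.1], [0.2, 0.8], [0.5, 0.5]],           # 3 docs x 2 topics
    doc_lengths=[10, 8, 12],
    vocab=['apple', 'banana', 'carrot', 'daikon'],
    term_frequency=[12, 7, 5, 6],
    R=4)  # show all 4 terms in the barcharts, since the toy vocabulary is tiny
html = pyLDAvis.prepared_data_to_html(toy)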
def visualise_lda(self, lda_model, corpus, dct):
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, dct)
    lda_html = pyLDAvis.prepared_data_to_html(vis)
    # data_path = "/Users/ankitanand/Box/UB/Fall 2019/IR/Proj1/cooked/lda.html"
    # lda_html = pyLDAvis.save_html(vis, data_path)
    return lda_html
def buildsklearnselectedworks(so: SearchObject, bagsofsentences: list):
    """
    see: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    see also: https://nlpforhackers.io/topic-modeling/

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency
        strictly higher than the given threshold (corpus-specific stop words).
    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency
        strictly lower than the given threshold. This value is also called cut-off
        in the literature.

    see: https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

    max_df is used for removing terms that appear too frequently, also known as
    "corpus-specific stop words". For example:
        max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        max_df = 25 means "ignore terms that appear in more than 25 documents".
    The default max_df is 1.0, which means "ignore terms that appear in more than
    100% of the documents". Thus, the default setting does not ignore any terms.

    min_df is used for removing terms that appear too infrequently. For example:
        min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
        min_df = 5 means "ignore terms that appear in less than 5 documents".
    The default min_df is 1, which means "ignore terms that appear in less than
    1 document". Thus, the default setting does not ignore any terms.

    notes:
        a maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.

    on the general issue of graphing see also:
        https://speakerdeck.com/bmabey/visualizing-topic-models
        https://de.dariah.eu/tatom/topic_model_visualization.html

    on the axes:
        https://stats.stackexchange.com/questions/222/what-are-principal-component-scores
    """

    activepoll = so.poll
    vv = so.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    activepoll.statusis('Running the LDA vectorizer')
    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                    min_df=settings['minfreq'],
                                    max_features=settings['maxfeatures'])
    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    ldamodel = LatentDirichletAllocation(n_components=settings['components'],
                                         max_iter=settings['iterations'],
                                         learning_method='online',
                                         learning_offset=50.,
                                         random_state=0)
    ldamodel.fit(ldavectorized)

    visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
    # pyLDAvis.save_html(visualisation, 'ldavis.html')
    ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
    storevectorindatabase(so, ldavishtmlandjs)

    return ldavishtmlandjs
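# A quick demonstration of the max_df/min_df behaviour described in the docstring
# above, on a tiny invented corpus:
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the dog sat", "the cat ran", "the dog ran"]
# 'the' appears in 100% of the documents, so max_df=0.75 drops it as a
# corpus-specific stop word; min_df=2 keeps only terms found in >= 2 documents.
vec = CountVectorizer(max_df=0.75, min_df=2)
vec.fit(docs)
print(sorted(vec.vocabulary_))  # ['cat', 'dog', 'ran', 'sat']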
def defectsClustering(username, password, repo):
    g = Github(username, password)
    user = g.get_user()
    repository = g.get_repo(repo)
    repoName = repo
    Issues = repository.get_issues()
    commitIssues = []
    try:
        from collections.abc import Callable  # noqa
    except ImportError:
        from collections import Callable  # noqa
    print("I am here")

    # Import dataset
    df_issues = pd.read_csv('D:/CDAP/g-Codex/dataset.csv')
    # print(df.target_names.unique())
    print(df_issues)
    # df.head()
    # df_issues = pd.DataFrame(commitIssues)

    # Get all to lowercase
    # df_issues = df_issues.apply(lambda x: x.lower())

    # Remove punctuation
    df_issues.Issue = df_issues.Issue.apply(
        lambda x: x.translate(string.punctuation))
    # print("step 1")

    # Convert to list
    data = df_issues.Issue.values.tolist()

    # Sentences to words
    def sent_to_words(sentences):
        for sentence in sentences:
            yield (gensim.utils.simple_preprocess(str(sentence), deacc=True))

    # Remove emails
    data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]

    data_words = list(sent_to_words(data))

    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(
        data_words, min_count=5, threshold=100)  # higher threshold, fewer phrases
    trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    # See trigram example
    # print(trigram_mod[bigram_mod[data_words[0]]])
    print("step 2")

    # Define functions for stopwords, bigrams, trigrams and lemmatization
    def remove_stopwords(texts):
        return [[word for word in simple_preprocess(str(doc))
                 if word not in stop_words] for doc in texts]

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    # def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    #     """https://spacy.io/api/annotation"""
    #     texts_out = []
    #     for sent in texts:
    #         doc = nlp(" ".join(sent))
    #         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    #     return texts_out

    # Remove stop words
    data_words_nostops = remove_stopwords(df_issues.Issue)
    # Form bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Create dictionary
    id2word = corpora.Dictionary(data_words_bigrams)
    # Create corpus
    texts = data_words_bigrams
    # Term-document frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    # View
    id2word[0]
    # Human-readable format of corpus (term-frequency)
    [[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

    # Testing for four topics
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=4,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_bigrams,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Print the keywords in the topics
    pprint(lda_model.print_topics())
    doc_lda = lda_model[corpus]
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))

    # Compute perplexity: a measure of how good the model is; lower is better.
    print('\nPerplexity: ', lda_model.log_perplexity(corpus))

    # Compute coherence score
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=data_words_bigrams,
                                         dictionary=id2word,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score for five topics: ', coherence_lda)

    # Visualize the topics
    # pyLDAvis.enable_notebook()
    vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    # pyLDAvis.show(vis, '192.168.8.100', port=8888, n_retries=5, local=False,
    #               open_browser=True, http_server=None)
    # pyLDAvis.save_html(vis, 'kush.html')
    print('hi')
    ad = pyLDAvis.prepared_data_to_html(vis, template_type="general")
    print(ad)
    return ad
def model_page():
    vis_html = pyLDAvis.prepared_data_to_html(vis)
    return render_template('model.html', vis_html=vis_html)
def get_lda():
    tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # create sample documents
    doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
    doc_e = "Health professionals say that brocolli is good for your health."

    doc_set2 = [i for i in range(1, 10)]
    print(doc_set2)

    # compile sample documents into a list
    doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]

    # list for tokenized documents in loop
    texts = []

    # loop through document list
    for i in doc_set:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        tagged_tokens = nltk.pos_tag(tokens)
        print(tagged_tokens)

        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if i not in en_stop]

        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

        # add tokens to list
        texts.append(stemmed_tokens)

        print(tokens)
        print(stemmed_tokens)
        print("--------------------------------")

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    lda = gensim.models.ldamodel.LdaModel(corpus, num_topics=2,
                                          id2word=dictionary, passes=20)
    print(lda.show_topics())

    import matplotlib
    matplotlib.use('qt5agg')
    import pyLDAvis.gensim as gensimvis
    import pyLDAvis

    vis_data = gensimvis.prepare(lda, corpus, dictionary)
    x = pyLDAvis.prepared_data_to_html(vis_data)
    # print(x)
    return x
def hdp_model(corpus, dictionary):
    hdp = models.HdpModel(corpus, id2word=dictionary)
    hdp_data = gensimvis.prepare(hdp, corpus, dictionary)
    html_string = pyLDAvis.prepared_data_to_html(hdp_data)
    components.v1.html(html_string, width=1280, height=1024)
# generate TF-IDF, LDA model
from gensim import models

tfidf_model = models.TfidfModel(corpus)
tfidf = tfidf_model[corpus]

print("\n", "=========== TF-IDF ============")
# print first 10 elements of first document's tf-idf vector
print("\n", tfidf.corpus[0][:10])
# print top 10 elements of first document's tf-idf vector
print("\n", sorted(tfidf.corpus[0], key=lambda x: x[1], reverse=True)[:10])
# print token of most frequent element
# print("\n", dictionary.get(13))

n_topics = 5
lda = models.ldamodel.LdaModel(tfidf, num_topics=n_topics, id2word=dictionary, passes=1)

print("\n", "=========== lda.show_topics() ============")
# print(lda.show_topics())
print(lda.print_topics(num_topics=n_topics, num_words=10))

import matplotlib
matplotlib.use('qt5agg')
import pyLDAvis.gensim as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(lda, corpus, dictionary)
x = pyLDAvis.prepared_data_to_html(vis_data)
print(x)
def ldatopicgraphing(sentencetuples, workssearched, searchobject, headwordstops=True):
    """
    a sentence tuple looks like:
        ('gr2397w001_ln_42', 'ποίῳ δὴ τούτων ἄξιον τὸν κόϲμον φθείρεϲθαι φάναι')

    see: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

    see also: https://nlpforhackers.io/topic-modeling/

    CountVectorizer:
    max_df : float in range [0.0, 1.0] or int, default=1.0
        When building the vocabulary ignore terms that have a document frequency
        strictly higher than the given threshold (corpus-specific stop words).
    min_df : float in range [0.0, 1.0] or int, default=1
        When building the vocabulary ignore terms that have a document frequency
        strictly lower than the given threshold. This value is also called cut-off
        in the literature.

    see: https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

    max_df is used for removing terms that appear too frequently, also known as
    "corpus-specific stop words". For example:
        max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
        max_df = 25 means "ignore terms that appear in more than 25 documents".
    The default max_df is 1.0, which means "ignore terms that appear in more than
    100% of the documents". Thus, the default setting does not ignore any terms.

    min_df is used for removing terms that appear too infrequently. For example:
        min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
        min_df = 5 means "ignore terms that appear in less than 5 documents".
    The default min_df is 1, which means "ignore terms that appear in less than
    1 document". Thus, the default setting does not ignore any terms.

    notes:
        a maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.

    on the general issue of graphing see also:
        https://speakerdeck.com/bmabey/visualizing-topic-models
        https://de.dariah.eu/tatom/topic_model_visualization.html

    on the axes:
        https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

    :param sentencetuples:
    :param activepoll:
    :return:
    """

    if headwordstops:
        stops = mostcommonwordsviaheadwords()
    else:
        stops = mostcommoninflectedforms()

    sentencetuples = [(a, removestopwords(b, stops)) for a, b in sentencetuples]

    activepoll = searchobject.poll
    vv = searchobject.vectorvalues
    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    # not easy to store/fetch since you need both ldavectorizer and ldamodel,
    # so we just store the actual graph...
    ldavishtmlandjs = checkforstoredvector(searchobject, 'lda')

    if not ldavishtmlandjs:
        sentencetuples = [s for s in sentencetuples
                          if len(s[1].strip().split(' ')) > settings['mustbelongerthan']]
        sentences = [s[1] for s in sentencetuples]
        sentencesaslists = [s.split(' ') for s in sentences]
        allwordsinorder = [item for sublist in sentencesaslists for item in sublist if item]

        activepoll.statusis('Finding all headwords')
        morphdict = getrequiredmorphobjects(set(allwordsinorder), furtherdeabbreviate=True)
        morphdict = convertmophdicttodict(morphdict)

        bagsofwordlists = buildwordbags(searchobject, morphdict, sentencesaslists)
        bagsofsentences = [' '.join(b) for b in bagsofwordlists]
        # print('bagsofsentences[:3]', bagsofsentences[3:])

        activepoll.statusis('Running the LDA vectorizer')
        # Use tf (raw term count) features for LDA.
        ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                        min_df=settings['minfreq'],
                                        max_features=settings['maxfeatures'])
        ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

        ldamodel = LatentDirichletAllocation(n_components=settings['components'],
                                             max_iter=settings['iterations'],
                                             learning_method='online',
                                             learning_offset=50.,
                                             random_state=0)
        ldamodel.fit(ldavectorized)

        visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
        # pyLDAvis.save_html(visualisation, 'ldavis.html')
        ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
        storevectorindatabase(searchobject, 'lda', ldavishtmlandjs)

    jsonoutput = ldatopicsgenerateoutput(ldavishtmlandjs, searchobject)

    return jsonoutput
year = 2016
topic_num = 10

argc = len(sys.argv)
if argc > 1:
    year = int(sys.argv[1])
if argc > 2:
    topic_num = int(sys.argv[2])
if argc > 3:
    conference = sys.argv[3]

relpath = conference + str(year)
print(conference, year, topic_num)

fname = relpath + '/papers'
outfname = relpath + 'papers'

dictionary = gensim.corpora.Dictionary.load(fname + '.dict')
corpus = gensim.corpora.MmCorpus(fname + '.mm')
lda = gensim.models.ldamodel.LdaModel.load(fname + '_%d.model' % topic_num)

pdata = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
p = pyLDAvis.prepared_data_to_html(pdata)

with open(outfname + "_%d.html" % topic_num, "w") as fp:
    print("<h1> %s %d</h1>" % (conference.upper(), year), file=fp)
    print("topic num=%d" % topic_num, file=fp)
    print(p, file=fp)
def display_with_header(data, header):
    hdata = pyLDAvis.prepared_data_to_html(data)
    hheader = '<h1>%s</h1>' % (header)
    display(HTML(hheader + hdata))
def main(filename):
    global data_vectorized
    global lda_output
    global plot_df

    df = pd.read_csv(UPLOAD_FOLDER + '/' + filename)  # CHANGE THIS
    df = df.sample(frac=0.2, replace=False, random_state=1)

    N_NGRAM_RANGE = 2  # CHANGE HERE
    my_additional_stop_words = pd.read_csv(
        r'C:\Users\noel.alexander\Documents\Fullstack\Topic Modelling\Stopwords\custom_stopwords.csv'
    ).values.flatten().tolist()  # CHANGE THIS
    stop_words = text.ENGLISH_STOP_WORDS.union(my_additional_stop_words)

    data = df.content.values.tolist()
    # Remove emails
    data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
    # Remove new line characters
    data = [re.sub(r'\s+', ' ', sent) for sent in data]
    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]

    data_words = list(sent_to_words(data))

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    # Do lemmatization, keeping only nouns, adjectives, verbs and adverbs
    data_lemmatized = lemmatization(n=nlp, texts=data_words,
                                    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    vectorizer = CountVectorizer(
        analyzer='word',
        min_df=0.05,  # ignore terms that appear in less than 5% of the documents
        stop_words=stop_words,  # remove stop words
        lowercase=True,  # convert all words to lowercase
        token_pattern='[a-zA-Z0-9]{3,}',  # num chars > 3
        ngram_range=(1, N_NGRAM_RANGE))
    data_vectorized = vectorizer.fit_transform(data_lemmatized)

    space = {
        'n_topics': hp.quniform("n_topics", 6, 10, 1),  # search n_topics from 6-10
        'learning_decay': hp.uniform('learning_decay', 0.5, 0.9),  # search learning_decay from 0.5-0.9
    }
    trials = Trials()
    best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=25, trials=trials)

    LEARNING_DECAY = best['learning_decay']
    N_TOPICS = best['n_topics']

    print('starting lda')
    # Build the LDA model
    lda_model = LatentDirichletAllocation(
        n_components=int(N_TOPICS),  # number of topics
        learning_decay=LEARNING_DECAY,  # controls the learning rate in the online learning method
        max_iter=10,  # max learning iterations
        learning_method='online',  # use mini-batches of training data
        batch_size=128,  # n docs in each learning iter
        n_jobs=-1,  # use all available CPUs
    )
    lda_output = lda_model.fit_transform(data_vectorized)
    lda_output = lda_model.transform(data_vectorized)

    # column names
    topicnames = ["Topic" + str(i) for i in range(lda_model.n_components)]
    # index names
    docnames = ["Doc" + str(i) for i in range(len(data))]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames, index=docnames)

    # Get the dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    # Apply style
    df_document_topics = df_document_topic.head(15).style.applymap(
        color_green).applymap(make_bold)

    df_topic_distribution = df_document_topic['dominant_topic'].value_counts(
    ).reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    df_topic_distribution['Percent of Total'] = round(
        df_topic_distribution['Num Documents'] /
        np.sum(df_topic_distribution['Num Documents'].values), 2)

    topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=15)

    # Topic-keywords dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i)
                                 for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i)
                               for i in range(df_topic_keywords.shape[0])]

    # pyLDAvis.enable_notebook()
    panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')

    # Assign each document its most probable topic. This must happen before
    # plot_df is built, since plot_df uses these labels.
    labels = []
    for doc in lda_output:
        labels.append(np.argmax(doc))
    labels = np.array(labels)

    topics_dic = {}
    for i in range(int(N_TOPICS)):
        topics_dic[i] = 'topic ' + str(i)
    plot_df = pd.DataFrame({'topics': labels})
    plot_df['topics'] = plot_df['topics'].map(topics_dic)

    embedding = umap.UMAP(n_neighbors=100, min_dist=0.9).fit_transform(lda_output)
    plot_df['axis_1'] = embedding[:, 0]
    plot_df['axis_2'] = embedding[:, 1]

    html = pyLDAvis.prepared_data_to_html(panel)
    return html
def model_function():
    # ***********************************************************************
    # Defining the different news categories
    categories = {
        'Sports': ['football', 'ball', 'team', 'play', 'win', 'season', 'fan',
                   'run', 'scoore', 'athletics', 'spectator', 'competition',
                   'tennis', 'yard', 'game', 'fun', 'cricket', 'stadium',
                   'uefa', 'concacaf', 'player', 'game', 'referee'],
        'Medical': ['patient', 'study', 'slave', 'food', 'eat', 'pain',
                    'treatment', 'syndrome', 'therapy', 'medicine', 'health',
                    'doctor', 'diagnosis', 'clinical', 'biomedical'],
        'World News': ['israel', 'war', 'kill', 'soldier', 'attack', 'war',
                       'government', 'racism', 'internet', 'newpaper',
                       'journalism', 'telephone', 'earth', 'country',
                       'conflict', 'civil', 'military', 'peace', 'war',
                       'hurt', 'army'],
        'Religion': ['god', 'evidence', 'christian', 'believe', 'reason',
                     'faith', 'exist', 'bible', 'religion', 'judaism', 'cult',
                     'belief', 'theology', 'church', 'symbol',
                     'homosexuality', 'hell'],
        'Lifestyle': ['trending', 'fashion', 'entertainment', 'society',
                      'person', 'mode', 'lifestyles', 'casual', 'healthy',
                      'chic', 'cosmopolitan', 'popular', 'social',
                      'fashionable', 'celebrity', 'carpet', 'red', 'body',
                      'dress', 'business', 'workplace', 'fun', 'holiday',
                      'buy', 'living', 'hobbies', 'hipster'],
        'Culture': ['education', 'knowledge', 'learn', 'learning', 'literacy',
                    'urbanity', 'class', 'civility', 'ignorance',
                    'civilization', 'life', 'values', 'legacy', 'tradition',
                    'society', 'philosophy', 'religion', 'nationalism', 'art',
                    'music', 'ritual', 'concept', 'humanism', 'classical'],
        'Politics': ['government', 'diplomatic', 'law', 'political',
                     'politics', 'governance', 'republic', 'state', 'police',
                     'monarchy', 'democratic', 'federation', 'city',
                     'company', 'country', 'latin', 'uk', 'usa'],
        'Technology': ['videogame', 'xbox', 'play', 'station', 'video',
                       'smartphone', 'nintendo', 'shooter', 'mobile', 'sony',
                       'gaming', 'electronics', 'engineering', 'science',
                       'robot', 'robotics', 'internet', 'computer',
                       'industry', 'automation', 'technological', 'energy',
                       'device', 'devices', 'application', 'app',
                       'technology'],
        'Entertainment': ['television', 'film', 'movie', 'animation',
                          'comedy', 'cinema', 'media', 'show', 'circus',
                          'dance', 'concert', 'online', 'radio', 'party',
                          'ceremony', 'tourist'],
        'Food': ['nutrition', 'rice', 'nutrient', 'beef', 'meat', 'cook',
                 'cooking', 'seafood', 'cereal', 'fat', 'soup', 'pasta',
                 'butter', 'agriculture', 'meal', 'milk', 'animals',
                 'chicken', 'plant', 'energy', 'vegetarian', 'protein',
                 'vitamin', 'nutriment', 'aliment', 'fruit', 'vegetable',
                 'restaurant', 'restaurants', 'eat', 'kitchen', 'pizza',
                 'taste'],
    }

    # ***********************************************************************
    # For eliminating unnecessary words
    add_stop_words = ['said', 'would', 'one', 'even', 'really', 'could', 'also']
    stop_words = stopwords.words('english')
    [stop_words.append(i) for i in add_stop_words]
    stop_words_set = set(stop_words)

    # ***********************************************************************
    # Preparing the texts
    # ***********************************************************************
    # Pulling Mongo data
    myclient = MongoClient("mongodb://{}:5003/".format(mongo_server))
    mydb = myclient["mydatabase"]
    mycol = mydb["prueba"]
    res = mycol.find({}, {"Text": 1, "Title": 1, "Link": 1, "Time": 1})
    res_data_frame = pd.DataFrame(list(res))
    textos = res_data_frame["Text"]
    names = res_data_frame["Title"]
    urls = res_data_frame["Link"]
    time = res_data_frame["Time"]

    texts = []
    documents = []
    for t in textos:
        string = ''.join(t.splitlines())
        string = string.lower()
        word_tokens = word_tokenize(string)
        filtered_sentence = [w for w in word_tokens if w not in stop_words_set]
        documents.append(" ".join(filtered_sentence))
        texts.append(filtered_sentence)

    # ***********************************************************************
    # Model training and graph representation
    tokenized_list = [simple_preprocess(doc) for doc in documents]
    mydict = corpora.Dictionary()
    mycorpus = [mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list]
    word_counts = [[(mydict[id], count) for id, count in line] for line in mycorpus]

    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=mycorpus,
        id2word=corpora.Dictionary(tokenized_list),
        num_topics=len(categories),
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=30,
        alpha='auto',
        per_word_topics=True)

    vis = pyLDAvis.gensim.prepare(lda_model, mycorpus,
                                  corpora.Dictionary(tokenized_list), n_jobs=2)
    graphhtml = pyLDAvis.prepared_data_to_html(vis)

    table_data = []
    for topic in lda_model.print_topics():
        topic_data = list(topic)
        cate = get_cat(topic, categories)
        table_data.append(topic_data)
        topic_data.append(cate)

    # ********************************************************************
    # DataFrame that tells the categories, their IDs
    # and the words that create them
    df_categories = pd.DataFrame(table_data, columns=['ID', 'Words', 'Categorie'])

    # ********************************************************************
    get_document_topics = lda_model.get_document_topics(mycorpus)
    news_classification = []
    for n in range(len(get_document_topics)):
        for i in range(len(table_data)):
            if get_document_topics[n][0][0] == table_data[i][0]:
                news_classification.append([
                    get_document_topics[n][0][1], table_data[i][2],
                    names[n], urls[n], time[n]
                ])

    # ********************************************************************
    # DataFrame that tells the category for each article and its URL
    df_classification = pd.DataFrame(
        news_classification,
        columns=['Belonging', 'Classification', 'Title', 'Link', 'Time'])
    df_classification = df_classification.sort_values(by="Time", ascending=True)
    df_classification["Time"] = df_classification["Time"].apply(lambda x: x.ctime())

    return [df_categories, df_classification, graphhtml]
def _lda4(table, input_col, topic_name='topic', num_voca=1000, num_topic=5,
          num_topic_word=10, max_iter=20, learning_method='online',
          learning_offset=10., random_state=None):
    # generate model
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95, min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    log_likelihood = lda_model.score(term_count)
    perplexity = lda_model.perplexity(term_count)

    # create topic table
    vocab_weights_list = []
    vocab_list = []
    weights_list = []
    topic_term_prob = normalize(lda_model.components_, norm='l1')
    for vector in topic_term_prob:
        pairs = []
        for term_idx, value in enumerate(vector):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        vocab_weights = []
        vocab = []
        weights = []
        for pair in pairs[:num_topic_word]:
            vocab_weights.append("{}: {}".format(pair[1], pair[0]))
            vocab.append(pair[1])
            weights.append(pair[0])
        vocab_weights_list.append(vocab_weights)
        vocab_list.append(vocab)
        weights_list.append(weights)
    topic_table = pd.DataFrame({'vocabularies_weights': vocab_weights_list,
                                'vocabularies': vocab_list,
                                'weights': weights_list})
    topic_table['index'] = [idx + 1 for idx in topic_table.index]
    topic_table = topic_table[['index', 'vocabularies_weights', 'vocabularies', 'weights']]

    # create output table
    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    topic_dist_name = topic_name + '_distribution'
    if topic_name in table.columns or topic_dist_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100': "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [doc_topic[i].argmax() + 1 for i in range(len(corpus))]
    out_table[topic_dist_name] = doc_topic.tolist()

    # pyLDAvis
    prepared_data = ldavis.prepare(lda_model, term_count, tf_vectorizer)
    html_result = pyLDAvis.prepared_data_to_html(prepared_data)

    # generate report
    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Learning method': learning_method,
              'Learning offset': learning_offset,
              'Seed': random_state}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Latent Dirichlet Allocation Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    |
    | ### Log Likelihood
    | {log_likelihood}
    |
    | ### Perplexity
    | {perplexity}
    |
    | ### Parameters
    | {params}
    """.format(log_likelihood=log_likelihood, perplexity=perplexity,
               params=dict2MD(params))))

    # create model
    model = _model_dict('lda_model')
    model['params'] = params
    model['lda_model'] = lda_model
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
nmf = NMF(n_components=nbr_topics, random_state=1, alpha=.1, l1_ratio=.5,
          init='nndsvd').fit(tfidf)
nmf_topics = nmf.transform(tfidf)

lda = LatentDirichletAllocation(n_components=nbr_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0).fit(tf)
lda_topics = lda.transform(tf)

# Plot pretty LDA output
lda_vis_data = pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)
lda_vis_data_html = pyLDAvis.prepared_data_to_html(lda_vis_data)
pyLDAvis.show(lda_vis_data)

# Assume the top 10 words will be used per topic
top_n = 10
total_rows = nbr_topics * top_n
topic = []
topic_top_ten = []
topic_top_ten_scores = []
for tid, t in enumerate(lda.components_):
    topic.append([tid + 1] * top_n)
    topic_top_ten.append(
        [tf_feature_names[i] for i in t.argsort()[:-top_n - 1:-1]])
    topic_top_ten_scores.append(t[t.argsort()[:-top_n - 1:-1]])

top_words = np.concatenate([
    np.array(topic).reshape(total_rows, 1),
def visualize_lda_model(self, lda_model, corpus, id2word):
    data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    html = pyLDAvis.prepared_data_to_html(data)
    return html
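# Compatibility note: many snippets here import the gensim bridge as
# pyLDAvis.gensim, which is the pre-3.2 module path. On pyLDAvis >= 3.2 the
# module was renamed, so the equivalent of the method above would look like
# this sketch:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

vis = gensimvis.prepare(lda_model, corpus, id2word)
html = pyLDAvis.prepared_data_to_html(vis)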
def model_params(clicks, topics, iterations, tags, gender, rank, rel, years):
    # Lists all triggered callbacks
    changed_id = [p['prop_id'] for p in dash.callback_context.triggered][0]
    # Only runs the model training if the button has been clicked
    if 'button' in changed_id:
        # Non-filtered data
        data = df
        # Filters the data based on the user's choices
        if len(tags) != len(pos_tags):
            data = tm.filter_by_tag(df, tags)
        if gender != 'A':
            data = tm.filter_by_sex(data, gender)
        if len(rank) != len(rank_set):
            data = tm.filter_by_rank(data, rank)
        if len(rel) != len(rel_set):
            data = tm.filter_by_rel(data, rel)
        if years[0] is not min(years_set) or years[1] is not max(years_set):
            data = tm.filter_by_time(data, years)

        # Data preprocessing for the LDA model
        corpus, dictionary, docs, strings = tm.prepare_data(data)
        # Creates the LDA topic model
        model, top_topics = tm.train_lda(corpus, dictionary, topics, iterations)
        dominant_topics = tm.letter_topics(model, corpus, strings)
        letters_for_topics = tm.get_most_representative(dominant_topics)
        letters_per_topic = tm.letters_per_topic(dominant_topics)

        # Loop that creates a dataframe from the LDA top topics list
        i = 1
        topic_dict = {}
        for topic in top_topics:
            entries = []
            for t in topic[0]:
                score = round(float(t[0]), 3)
                tmp = t[1] + ', ' + str(score)
                entries.append(tmp)
            topic_dict['Topic {}'.format(i)] = entries
            i += 1
        dataframe = pd.DataFrame(topic_dict)

        cols = [{"name": i, "id": i} for i in dataframe.columns]
        cols2 = [{"name": i, "id": i} for i in letters_for_topics.columns[1:]]
        cols3 = [{"name": i, "id": i} for i in letters_per_topic.columns[1:]]

        # Creates the pyLDAvis visualisation of the LDA model
        vis_data = pyLDAvis.gensim.prepare(model, corpus, dictionary)
        html_vis = pyLDAvis.prepared_data_to_html(vis_data, template_type='general')

        return dataframe.to_dict('records'), cols, letters_for_topics.to_dict(
            'records'), cols2, letters_per_topic.to_dict(
                'records'), cols3, html_vis
    else:
        return no_update, no_update, no_update, no_update, no_update, no_update, no_update