def fit(self, df_original, topics):
    """Fit an HDP-derived LDA model and build per-document topic features.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Input documents; must contain a 'Name' column used to label rows.
    topics : int
        Upper bound T on the number of topics passed to the HDP model.

    Returns
    -------
    pandas.DataFrame
        One row per document with columns 0..topics-1 holding topic
        probabilities, a leading 'Name' column, sorted by 'Name'.

    Side effects: stores the dictionary, corpus, and trained model on
    ``self.dictionary`` / ``self.corpus`` / ``self.model``.
    """
    # Build the gensim dictionary and bag-of-words corpus.
    self.dictionary = self._create_dictionary(df_original)
    self.corpus = self._create_corpus(df_original)

    # Train HDP, then collapse it to the equivalent (truncated) LDA model.
    hdp = HdpModel(self.corpus, id2word=self.dictionary, T=topics)
    self.model = hdp.suggested_lda_model()

    # Dense topic-probability vector per document. Iterate the corpus
    # directly instead of indexing with range(len(...)).
    feature_vecs = []
    for bow in self.corpus:
        topic_vec = [0] * topics
        # minimum_probability=0.0 asks for every topic, but gensim may
        # still omit some, so fill by topic id rather than position.
        for topic_id, prob in self.model.get_document_topics(
                bow, minimum_probability=0.0):
            topic_vec[topic_id] = prob
        feature_vecs.append(topic_vec)

    # Column labels come from `topics` directly: identical to the old
    # len(feature_vecs[0]) (each vector is [0]*topics) but no longer
    # raises IndexError when the corpus is empty.
    df_lda_reduced = pd.DataFrame(feature_vecs, columns=list(range(topics)))
    df_lda_reduced.insert(0, 'Name', list(df_original['Name'].values), False)
    return df_lda_reduced.sort_values(by=['Name'])
def test(rat, res, nt):
    """Run iterative HDP/LDA topic modelling until every document fits.

    Each run trains an HDP model on the current corpus, tags documents
    with their topic strengths, and re-queues documents whose best topic
    affinity is below the resistance threshold. Topic terms and
    topic->document relations are written to per-configuration CSV files.

    Parameters
    ----------
    rat : float
        Documents-to-topics ratio used to size the topic count of
        follow-up runs.
    res : float
        Resistance threshold: minimum best-topic strength for a document
        to be considered covered.
    nt : int
        Initial number of topics.

    Uses module-level state: ``origCorpus``, ``dictionary``, ``data``,
    ``docTokens``, ``docIDs``, and the ``results`` csv writer — presumably
    initialised by the surrounding script; verify against the caller.
    """
    corpus = origCorpus.copy()  # reading a global needs no `global` statement
    id_topic_ratio = rat
    resistance = res
    numTops = nt

    topicPath = "data\\topics_init" + str(nt) + "_rat" + str(rat) + "_res_" + str(res) + ".csv"
    relationPath = "data\\relations_init" + str(nt) + "_rat" + str(rat) + "_res_" + str(res) + ".csv"

    # Context managers guarantee both files are closed even if a run
    # raises (the original leaked both handles on any exception).
    with open(topicPath, 'w') as topicFile, open(relationPath, 'w') as relationFile:
        topicOut = csv.writer(topicFile, lineterminator='\n')
        topicOut.writerow(["", "run", "topic", "terms", "p"])
        relationOut = csv.writer(relationFile, lineterminator='\n')
        relationOut.writerow(["run", "topic", "no IDs", "ID/strength"])

        run = 1
        totalTopics = 0
        averageCoherence = 0
        badIDs = docIDs
        done = False
        while not done:
            print("Run #" + str(run))
            topic2doc = dict()
            oldIDs = badIDs.copy()
            badIDs = list()
            totalTopics += numTops

            # Train HDP and collapse it to the suggested LDA model.
            hdp = HdpModel(corpus, dictionary, T=numTops)
            lda_model = hdp.suggested_lda_model()
            coherenceModel = CoherenceModel(model=lda_model, texts=data,
                                            dictionary=dictionary,
                                            coherence='c_v')
            coherence = coherenceModel.get_coherence()
            # Running topic-count-weighted average of coherence over all runs.
            averageCoherence = ((totalTopics - numTops) * averageCoherence
                                + numTops * coherence) / totalTopics

            # Tag documents: record every (doc, strength) pair per topic;
            # documents whose best affinity is below the threshold are
            # collected for the next run.
            for ID in oldIDs:
                vec = dictionary.doc2bow(docTokens[ID])
                bestRel = 0
                for topic_id, strength in lda_model[vec]:
                    bestRel = max(bestRel, strength)
                    topic2doc.setdefault(topic_id, []).append((ID, strength))
                if bestRel < resistance:
                    badIDs.append(ID)

            # Write the top-10 terms per topic directly through topicOut.
            # BUG FIX: the original appended via pandas.to_csv(mode='a')
            # while topicFile was still open with buffered writes pending,
            # so the pandas rows could land on disk *before* the header and
            # earlier rows. Writing through the same csv writer keeps the
            # file ordered; the leading per-run 0-based counter reproduces
            # the DataFrame index column the old output had.
            row_index = 0
            for t in range(lda_model.num_topics):
                for word, p in lda_model.show_topic(t, topn=10):
                    topicOut.writerow([row_index, run, t, word, p])
                    row_index += 1

            # Print relations, strongest documents first.
            for topic in topic2doc:
                relationOut.writerow([run, topic, len(topic2doc[topic])]
                                     + sorted(topic2doc[topic],
                                              key=operator.itemgetter(1),
                                              reverse=True))

            if len(badIDs) == 0:
                done = True
                print("Done!")
            else:
                # Rebuild the corpus from only the poorly-fitted documents
                # and size the next run's topic count from the ratio.
                print("Remaining: " + str(len(badIDs)))
                corpus = [dictionary.doc2bow(docTokens[docID]) for docID in badIDs]
                numTops = math.ceil(len(badIDs) * id_topic_ratio)
                run += 1

        results.writerow([nt, rat, res, averageCoherence, totalTopics])