Beispiel #1
0
    def fit(self, df_original, topics):
        """Train an HDP-derived LDA model and return per-document topic weights.

        The dictionary and corpus are built from ``df_original`` through the
        class helpers and stored on the instance, together with the LDA model
        suggested by the trained ``HdpModel``.

        Args:
            df_original: Source frame. It is handed to ``_create_dictionary``
                and ``_create_corpus`` and must provide a ``'Name'`` column.
            topics: Passed to ``HdpModel`` as ``T``; also the length of every
                per-document topic vector.

        Returns:
            A DataFrame with a leading ``'Name'`` column followed by one
            integer-labelled column per topic, sorted by ``'Name'``.
        """
        self.dictionary = self._create_dictionary(df_original)
        self.corpus = self._create_corpus(df_original)

        # Fit HDP, then keep only the LDA model it suggests.
        hdp_model = HdpModel(self.corpus, id2word=self.dictionary, T=topics)
        self.model = hdp_model.suggested_lda_model()

        # One dense weight vector per document; topics the model does not
        # report for a document keep the initial 0.
        rows = []
        for doc_pos in range(len(self.corpus)):
            doc_topics = self.model.get_document_topics(
                self.corpus[doc_pos], minimum_probability=0.0)
            weights = [0] * topics
            for topic_id, probability in doc_topics:
                weights[topic_id] = probability
            rows.append(weights)

        column_labels = list(range(len(rows[0])))
        reduced = pd.DataFrame(rows, columns=column_labels)
        names = list(df_original['Name'].values)
        reduced.insert(0, 'Name', names, False)
        return reduced.sort_values(by=['Name'])
Beispiel #2
0
def test(rat, res, nt):
    """Run repeated HDP/LDA passes until every document clears the threshold.

    Each pass trains an ``HdpModel`` on the current corpus, takes its
    suggested LDA model and scores every not-yet-accepted document against
    it. Documents whose best topic probability is below ``res`` are collected
    and become the corpus of the next pass. The loop stops when none remain.

    The function reads the module-level names ``origCorpus``, ``docIDs``,
    ``docTokens``, ``dictionary`` and ``data`` and writes a summary row to the
    module-level ``results`` CSV writer.

    Args:
        rat: Ratio used to size later passes; the next pass uses
            ``ceil(remaining_documents * rat)`` topics.
        res: Minimum best-topic probability a document needs to be accepted.
        nt: Number of topics (``T``) for the first pass.

    Side effects:
        Creates a topic CSV and a relation CSV whose paths are derived from
        the three arguments, prints progress, and appends
        ``[nt, rat, res, averageCoherence, totalTopics]`` to ``results``.

    Note:
        There is no cap on the number of passes. If some document never
        reaches ``res``, the loop does not terminate.
    """
    corpus = origCorpus.copy()

    suffix = str(nt) + "_rat" + str(rat) + "_res_" + str(res) + ".csv"
    topicPath = "data\\topics_init" + suffix
    relationPath = "data\\relations_init" + suffix

    # Write the header and close the file immediately. The per-run rows are
    # appended to this path by pandas through its own handle; keeping a
    # second buffered handle open would flush the header at offset 0 on
    # close and overwrite the start of the appended data.
    with open(topicPath, 'w') as topicFile:
        topicOut = csv.writer(topicFile, lineterminator='\n')
        topicOut.writerow(["", "run", "topic", "terms", "p"])

    numTops = nt
    run = 1
    totalTopics = 0
    averageCoherence = 0
    badIDs = docIDs

    with open(relationPath, 'w') as relationFile:
        relationOut = csv.writer(relationFile, lineterminator='\n')
        relationOut.writerow(["run", "topic", "no IDs", "ID/strength"])

        while True:
            print("Run #" + str(run))

            topic2doc = dict()

            # Documents to score in this pass; badIDs is rebuilt from scratch
            # (rebinding, so the global docIDs list is never mutated).
            oldIDs = badIDs.copy()
            badIDs = list()

            totalTopics += numTops

            # Perform HDP and derive the LDA model used for tagging.
            hdp = HdpModel(corpus, dictionary, T=numTops)
            lda_model = hdp.suggested_lda_model()

            coherenceModel = CoherenceModel(
                model=lda_model, texts=data, dictionary=dictionary,
                coherence='c_v')
            coherence = coherenceModel.get_coherence()
            # Running average of coherence, weighted by topics per pass.
            averageCoherence = (
                (totalTopics - numTops) * averageCoherence
                + numTops * coherence) / totalTopics

            # Tag documents and build topic -> [(ID, strength)] relations.
            for ID in oldIDs:
                vec = dictionary.doc2bow(docTokens[ID])

                bestRel = 0
                for pair in lda_model[vec]:
                    bestRel = max(bestRel, pair[1])
                    topic2doc.setdefault(pair[0], []).append((ID, pair[1]))

                # Collect documents that no topic explains well enough.
                if bestRel < res:
                    badIDs.append(ID)

            # Append this pass's top terms to the topic file.
            top_words_per_topic = []
            for t in range(lda_model.num_topics):
                top_words_per_topic.extend(
                    [(run, t) + x for x in lda_model.show_topic(t, topn=10)])

            pd.DataFrame(
                top_words_per_topic,
                columns=['Run', 'Topic', 'Word', 'P'],
            ).to_csv(topicPath, mode='a', header=False)

            # Write relations, strongest document first.
            for topic in topic2doc:
                relationOut.writerow(
                    [run, topic, len(topic2doc[topic])]
                    + sorted(topic2doc[topic], key=operator.itemgetter(1),
                             reverse=True))

            if not badIDs:
                print("Done!")
                break

            # Not done: rebuild the corpus from the remaining documents.
            print("Remaining: " + str(len(badIDs)))
            corpus = [dictionary.doc2bow(docTokens[docID]) for docID in badIDs]
            numTops = math.ceil(len(badIDs) * rat)
            run += 1

    results.writerow([nt, rat, res, averageCoherence, totalTopics])