Beispiel #1
0
    def train_lda(self, cache_path):
        print(cache_path)
        trainBatchIter = BatchIterBert(self.trainDataIter,
                                       filling_last_batch=False,
                                       postProcessor=batchPostProcessor,
                                       batch_size=1)
        bow_list = []
        for item in trainBatchIter:
            bow = item[1].squeeze().detach().numpy().tolist()
            bow_list.append(self.bow_2_gensim(bow))
        print(len(bow_list))
        #print(self.dictProcess.common_dictionary.id2token)
        lda = LdaModel(np.array(bow_list),
                       num_topics=50,
                       passes=200,
                       chunksize=len(bow_list),
                       id2word=self.dictProcess.common_dictionary)
        #print(lda.show_topic(1, topn=10))
        output_topic_line = ''
        for topic_id in range(50):
            current_topic_list = []
            current_topic = lda.show_topic(topic_id, topn=10)
            for topic_tuple in current_topic:
                current_topic_list.append(topic_tuple[0])
            output_topic_line += ' '.join(current_topic_list) + '\n'
            #print(current_topic_list)

        topic_file = os.path.join(cache_path, 'ldatopic.txt')
        with open(topic_file, 'w') as fo:
            fo.write(output_topic_line)

        testBatchIter = BatchIterBert(self.testDataIter,
                                      filling_last_batch=False,
                                      postProcessor=batchPostProcessor,
                                      batch_size=1)

        test_bow_list = []
        word_count = 0
        for item in testBatchIter:
            bow = item[1].squeeze().detach().numpy().tolist()
            word_count += sum(bow)
            test_bow_list.append(self.bow_2_gensim(bow))

        print(word_count)
        ppl = lda.log_perplexity(test_bow_list, len(test_bow_list))
        print(ppl)
        bound = lda.bound(test_bow_list)
        print(bound / word_count)
        print(np.exp2(-bound / word_count))