Example #1
    def fit_transform(self, dataset: Dataset, name: str) -> TopicModel:
        # https://radimrehurek.com/gensim/models/ldamulticore.html#module-gensim.models.ldamulticore
        model = LdaMulticore(
            corpus=dataset.get_gensim_corpus(),
            id2word=dataset.get_gensim_vocab(),
            num_topics=self.n,
            random_state=get_seed(),
            **self.kwargs,
        )
        # To get back DT matrix https://github.com/bmabey/pyLDAvis/blob/master/pyLDAvis/gensim_models.py
        topic_word_matrix = model.get_topics()
        doc_topic_matrix = model.inference(dataset.get_gensim_corpus())[0]

        self.model = model
        return TopicModel.from_array(name, topic_word_matrix, doc_topic_matrix)
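Dataset, TopicModel and get_seed() are project-specific wrappers; below is a minimal, self-contained sketch of the same gensim calls on a toy corpus (the toy documents and num_topics=2 are illustrative, not from the original example):

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

docs = [["topic", "model", "corpus"], ["bag", "of", "words", "corpus"], ["topic", "words", "bag"]]
id2word = Dictionary(docs)
corpus = [id2word.doc2bow(doc) for doc in docs]

model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=2, random_state=0)
topic_word_matrix = model.get_topics()         # shape (num_topics, vocab_size), rows sum to 1
doc_topic_matrix = model.inference(corpus)[0]  # unnormalised gamma, shape (num_docs, num_topics)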
Example #2
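# Assumed imports for this excerpt (not shown in the original source):
#   from gensim.corpora import Dictionary
#   from gensim.models import CoherenceModel, LdaModel as LDA   # the LDA alias is an assumption
#   from sklearn.metrics.pairwise import linear_kernel
#   import warnings, pyLDAvis, pyLDAvis.gensim
# config, create_n_grams, add_n_grams and create_vector_topics are project-level helpers.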
def create_LDA_model(coursesList):
    warnings.filterwarnings('ignore')
    text_clean = [doc.split(' ') for doc in coursesList['description']]
    bigrams, trigrams = create_n_grams(text_clean)
    text_clean = add_n_grams(text_clean, bigrams, trigrams)

    id2word = Dictionary(text_clean)
    id2word.filter_extremes(no_below=5, no_above=0.45)
    corpus = [id2word.doc2bow(text) for text in text_clean]

    num_topics = config.num_lda_topic
    lda_model = LDA(corpus=corpus,
                    id2word=id2word,
                    num_topics=num_topics,
                    random_state=42,
                    alpha='asymmetric',
                    passes=25)
    lda_model.save("./best_model.lda")
    coherence_model_c_v = CoherenceModel(model=lda_model,
                                         texts=text_clean,
                                         dictionary=id2word,
                                         coherence='c_v')
    c_v = coherence_model_c_v.get_coherence()
    term_topic_mat = lda_model.get_topics()
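    # The loop below averages pairwise linear_kernel scores (raw dot products) between
    # topic-term rows; get_topics() rows are probability vectors, so this matches true
    # cosine similarity only if the rows were additionally L2-normalised.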
    aver_cosine_similarities = 0
    for i in range(0, (num_topics - 1)):
        cosine_similarities = linear_kernel(term_topic_mat[i].reshape(1, -1),
                                            term_topic_mat[i + 1:]).flatten()
        aver_cosine_similarities += sum(cosine_similarities)
    if num_topics != 1:
        aver_cosine_similarities = aver_cosine_similarities / (
            num_topics * (num_topics - 1) / 2)
    print(c_v)
    print(aver_cosine_similarities)

    create_vector_topics(lda_model, corpus, id2word, coursesList)

    visual_data = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
    pyLDAvis.save_html(visual_data, 'topics.html')
    return lda_model, id2word, bigrams, trigrams
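A minimal usage sketch reusing the returned model and dictionary to score an unseen description (the sample text below is illustrative, not from the original source); note that newer pyLDAvis releases expose the visualisation helper as pyLDAvis.gensim_models rather than pyLDAvis.gensim:

lda_model, id2word, bigrams, trigrams = create_LDA_model(coursesList)
bow = id2word.doc2bow("introduction to machine learning".split(' '))
print(lda_model.get_document_topics(bow))  # [(topic_id, probability), ...]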
    print("Now Extracting Gibbs Signatures")
    ldamallet = gensim.models.wrappers.LdaMallet(mallet_path,
                                                 corpus=bagOfMutations,
                                                 num_topics=num_sigs,
                                                 id2word=idToChannel,
                                                 iterations=100,
                                                 topic_threshold=0.0)
    #hdpmodel = HdpModel(bagOfMutations, idToChannel, K=20, T=48)

    # ldamodel is assumed to be a gensim LdaModel fitted earlier in the original
    # function (not shown in this excerpt)
    pickle.dump(ldamodel,
                open(output_path + project + '_lda_model.pickle', 'wb'))
    pickle.dump(ldamallet,
                open(output_path + project + '_mallet_model.pickle', 'wb'))
    #pickle.dump(hdpmodel, output_path + project + '_hdp_model.pickle')

    bayes_signatures = pd.DataFrame(ldamodel.get_topics().transpose())
    columns = []
    for i in range(bayes_signatures.shape[1]):
        columns.append("Signature " + alpha_dict[i])
    bayes_signatures.columns = columns
    channels = []
    for c in bayes_signatures.index:
        channels.append(ldamodel.id2word[c])

    if len(channels) != 48:  # one channel had 0 counts for all samples in dataset
        channel_set = set(channels)
        for c in classification:
            if c not in channel_set:
                channels.append(c)
                bayes_signatures.loc[len(bayes_signatures)] = 0
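A short sketch of how the pickled models could be reloaded later, mirroring the pickle.dump calls above (output_path and project are the same variables as in the excerpt):

import pickle

with open(output_path + project + '_lda_model.pickle', 'rb') as f:
    ldamodel = pickle.load(f)
with open(output_path + project + '_mallet_model.pickle', 'rb') as f:
    ldamallet = pickle.load(f)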