Example #1
def nmf_optimum_coherence(corpus, start, end, step):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(
        corpus=corpus)
    topic_numbers = []
    coherence_values = []

    # the TF-IDF matrix does not depend on the topic count, so build it once
    vectorizer = TfidfVectorizer()
    A = vectorizer.fit_transform(cleaned_data)
    feature_names = vectorizer.get_feature_names_out()  # terms (get_feature_names() on scikit-learn < 1.0)

    for num_topics in range(start, end + 1, step):
        nmf_model = sk_NMF(n_components=num_topics, init='nndsvd')
        W = nmf_model.fit_transform(A)  # document-topic distribution
        H = nmf_model.components_  # topic-word distribution
        topic_number = len(H)
        word_count = 20
        word_distributions = []
        for topic in range(topic_number):
            top_indices = np.argsort(H[topic, :])[::-1]
            doc_list = []
            for term_index in top_indices[0:word_count]:
                doc_list.append(
                    [feature_names[term_index], H[topic, term_index]])
            word_distributions.append(doc_list)
        nmf_topics = [[word[0] for word in topic]
                      for topic in word_distributions]
        coherence = CoherenceModel(topics=nmf_topics,
                                   texts=data_tokens,
                                   dictionary=id2word).get_coherence()

        topic_numbers.append(num_topics)
        coherence_values.append(coherence)
    fig = go.Figure(data=go.Scatter(x=topic_numbers, y=coherence_values))
    return fig
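Every example on this page calls preprocess.preprocess(corpus), which is not part of the excerpt. Judging by how its return values are used (joined strings for TfidfVectorizer, token lists for CoherenceModel, a gensim Dictionary, and a bag-of-words corpus), a minimal sketch could look like the following; the exact tokenisation and cleaning steps are assumptions.

from gensim import corpora
from gensim.utils import simple_preprocess


def preprocess(corpus):
    # corpus: list of raw document strings
    data_tokens = [simple_preprocess(doc, deacc=True) for doc in corpus]
    cleaned_data = [" ".join(tokens) for tokens in data_tokens]
    id2word = corpora.Dictionary(data_tokens)
    bow_corpus = [id2word.doc2bow(tokens) for tokens in data_tokens]
    return cleaned_data, data_tokens, id2word, bow_corpus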
Example #2
def LSA(corpus, n_topic):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(
        corpus=corpus)
    doc_number = len(data_tokens)
    lsi_model = LsiModel(corpus=corpus, num_topics=n_topic, id2word=id2word)
    coherence_v = coherence.coherence_value(model=lsi_model,
                                            tokens=data_tokens,
                                            dictionary=id2word)
    word_distributions = distributions.word_distribution(model=lsi_model,
                                                         n_topic=n_topic)
    topic_distributions = distributions.lsi_topic_distribution(
        doc_number=doc_number, model=lsi_model, corpus=corpus)
    doc_dist = distributions.lsi_doc_distribution(n_topic=n_topic,
                                                  doc_number=doc_number,
                                                  model=lsi_model,
                                                  corpus=corpus)
    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence_v),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": data_tokens
    }

    return output
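coherence.coherence_value is another helper the listing does not include. Since it receives a fitted gensim model, the token lists and the dictionary, it is presumably a thin wrapper around gensim's CoherenceModel; a plausible sketch is shown below, where the 'c_v' measure is an assumption.

from gensim.models import CoherenceModel


def coherence_value(model, tokens, dictionary):
    # model: a fitted gensim topic model (LsiModel, LdaModel, HdpModel, ...)
    cm = CoherenceModel(model=model, texts=tokens,
                        dictionary=dictionary, coherence='c_v')
    return cm.get_coherence()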
def LDA(corpus, n_topic):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    doc_number = len(data_tokens)
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=n_topic,
                         random_state=100,
                         update_every=1,
                         passes=10,
                         alpha='auto',
                         per_word_topics=True,
                         minimum_probability=1e-8)
    coherence_v = coherence.coherence_value(model=lda_model, tokens=data_tokens, dictionary=id2word)
    word_distributions = distributions.word_distribution(model=lda_model, n_topic=n_topic)
    topic_distributions = distributions.lda_topic_distribution(doc_number=doc_number, model=lda_model, corpus=corpus)
    doc_dist = distributions.lda_doc_distribution(n_topic=n_topic, doc_number=doc_number, model=lda_model,
                                                  corpus=corpus)

    output = {"filecount": doc_number,
              "coherence_value": float(coherence_v),
              "word_distributions": word_distributions,
              "topic_distributions": topic_distributions,
              "doc_dist": doc_dist,
              "data_tokens": data_tokens
              }

    return output
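The distributions module is likewise external to this listing. From the way its results are consumed (lists of [word, weight] pairs per topic and lists of [topic, probability] pairs per document), plausible sketches of word_distribution and lda_topic_distribution follow; the topn=20 cut-off mirrors the word_count used in the NMF examples and is an assumption.

def word_distribution(model, n_topic, topn=20):
    # show_topic works for both LdaModel and LsiModel
    return [[[word, float(weight)] for word, weight in model.show_topic(t, topn=topn)]
            for t in range(n_topic)]


def lda_topic_distribution(doc_number, model, corpus):
    # per-document list of [topic_id, probability] pairs
    return [[[topic, float(prob)]
             for topic, prob in model.get_document_topics(corpus[d], minimum_probability=0)]
            for d in range(doc_number)]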
def w2v_kmeans(corpus, n_clusters):
    doc_vectors, cleaned_data_tokens = doc_vector_generator(corpus)

    kmeans_model = KMeans(n_clusters=n_clusters, init='k-means++', n_init=40)
    kmeans_model.fit(doc_vectors)
    labels = kmeans_model.labels_.tolist()
    doc_number = len(labels)

    # group the raw documents by the cluster they were assigned to
    nested_corpus = [[] for _ in range(n_clusters)]
    for i in range(doc_number):
        nested_corpus[labels[i]].append(corpus[i])

    # cluster id -> list of document indices in that cluster
    doc_dist = {}
    document_dists = np.array(labels)
    for cluster in range(n_clusters):
        doc_dist.update(
            {cluster: np.where(document_dists == cluster)[0].tolist()})

    # hard assignment: every document belongs to its cluster with weight 1.0
    topic_distributions = []
    for i in range(doc_number):
        topic_distributions.append([[labels[i], 1.0]])

    # describe each cluster's vocabulary by fitting a single-topic LDA on its documents
    word_distributions = []
    for cluster_number in range(len(nested_corpus)):
        cleaned_data, data_tokens, id2word, corpus3 = preprocess.preprocess(
            corpus=nested_corpus[cluster_number])
        n_topic = 1
        lda_model = LdaModel(
            corpus=corpus3,
            id2word=id2word,
            num_topics=n_topic,
            random_state=100,
            update_every=1,
            # chunksize=50,
            passes=10,
            alpha='auto',
            per_word_topics=True,
            minimum_probability=1e-8)
        word_distributions.append(
            distributions.word_distribution(model=lda_model,
                                            n_topic=n_topic)[0])
    silhouette_score = metrics.silhouette_score(doc_vectors,
                                                labels,
                                                metric='cosine')
    output = {
        "filecount": doc_number,
        "silhouette_score": float(silhouette_score),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": cleaned_data_tokens,
        "labels": labels,
        "doc_vectors": [doc_vec.tolist() for doc_vec in doc_vectors]
    }
    return output
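w2v_kmeans relies on doc_vector_generator(corpus), which again is not shown in the excerpt. The name and the downstream KMeans/silhouette usage suggest it turns each document into a single dense vector, most likely by averaging Word2Vec word vectors; a minimal sketch under that assumption:

import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess


def doc_vector_generator(corpus, vector_size=100):
    cleaned_data_tokens = [simple_preprocess(doc) for doc in corpus]
    w2v = Word2Vec(sentences=cleaned_data_tokens,
                   vector_size=vector_size, min_count=1, workers=4)
    doc_vectors = [np.mean([w2v.wv[token] for token in tokens], axis=0)
                   if tokens else np.zeros(vector_size)
                   for tokens in cleaned_data_tokens]
    return doc_vectors, cleaned_data_tokens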
Example #5
def NMF(corpus, n_topic):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(
        corpus=corpus)

    vectorizer = TfidfVectorizer()
    A = vectorizer.fit_transform(cleaned_data)
    nmf_model = sk_NMF(n_components=n_topic, init='nndsvd')
    W = nmf_model.fit_transform(A)  # document topic distribution
    H = nmf_model.components_  # topic word distribution
    feature_names = vectorizer.get_feature_names_out()  # terms (get_feature_names() on scikit-learn < 1.0)
    doc_number = len(W)
    topic_number = len(H)
    word_count = 20

    word_distributions = []
    for topic in range(topic_number):
        top_indices = np.argsort(H[topic, :])[::-1]
        doc_list = []
        for term_index in top_indices[0:word_count]:
            doc_list.append([feature_names[term_index], H[topic, term_index]])
        word_distributions.append(doc_list)

    topic_distributions = []
    for document in range(doc_number):
        topic_distributions.append([[topic, W[document][topic]]
                                    for topic in range(len(W[document]))])

    # topic id -> indices of documents whose highest-weight topic it is
    doc_dist = {}
    for i in range(topic_number):
        doc_dist.update({i: []})

    for i in range(doc_number):
        doc_dist[topic_distance.get_topic_dist_max(
            topic_distributions[i])[0]].append(i)

    nmf_topics = [[word[0] for word in topic] for topic in word_distributions]
    coherence = CoherenceModel(topics=nmf_topics,
                               texts=data_tokens,
                               dictionary=id2word).get_coherence()

    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "data_tokens": data_tokens
    }

    return output
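topic_distance.get_topic_dist_max is used here to pick a document's dominant topic. Its implementation is not shown; from the call site it takes a list of [topic, weight] pairs and returns the pair with the largest weight, which could be as simple as:

def get_topic_dist_max(topic_distribution):
    # topic_distribution: list of [topic_id, weight] pairs for one document
    return max(topic_distribution, key=lambda pair: pair[1])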
Example #6
def lsa_optimum_coherence(corpus, start, end, step):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(
        corpus=corpus)
    topic_numbers = []
    coherence_values = []

    for num_topics in range(start, end + 1, step):
        lsi_model = LsiModel(corpus=corpus,
                             num_topics=num_topics,
                             id2word=id2word)
        coh = coherence.coherence_value(model=lsi_model,
                                        tokens=data_tokens,
                                        dictionary=id2word)
        topic_numbers.append(num_topics)
        coherence_values.append(coh)
    fig = go.Figure(data=go.Scatter(x=topic_numbers, y=coherence_values))
    return fig
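The *_optimum_coherence functions (NMF above, LSA here, and LDA just below) all return a Plotly figure of coherence against topic count, so choosing the number of topics amounts to calling one of them and inspecting the curve. A usage sketch with a toy corpus (the document strings are placeholders):

docs = ["the cat sat on the mat",
        "dogs and cats are common pets",
        "stock markets fell sharply today",
        "investors worry about interest rates"]

fig = lsa_optimum_coherence(docs, start=2, end=10, step=2)
fig.show()  # look for the topic count where coherence peaks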
def lda_optimum_coherence(corpus, start, end, step):
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(corpus=corpus)
    topic_numbers = []
    coherence_values = []

    for num_topics in range(start, end + 1, step):
        lda = LdaModel(corpus=corpus,
                       id2word=id2word,
                       num_topics=num_topics,
                       random_state=100,
                       update_every=1,
                       passes=10,
                       alpha='auto',
                       per_word_topics=True,
                       minimum_probability=1e-8)
        coh = coherence.coherence_value(model=lda, tokens=data_tokens, dictionary=id2word)
        topic_numbers.append(num_topics)
        coherence_values.append(coh)
    fig = go.Figure(data=go.Scatter(x=topic_numbers, y=coherence_values))
    return fig
def HDP(corpus):
    n_topic = 150  # upper bound on the topic count; matches gensim HdpModel's default truncation level T
    cleaned_data, data_tokens, id2word, corpus = preprocess.preprocess(
        corpus=corpus)
    doc_number = len(data_tokens)
    hdp_model = HdpModel(corpus=corpus, id2word=id2word)
    coherence_v = coherence.coherence_value(model=hdp_model,
                                            tokens=data_tokens,
                                            dictionary=id2word)

    topic_distributions = distributions.lsi_topic_distribution(
        doc_number=doc_number, model=hdp_model, corpus=corpus)
    # remap the raw HDP topic ids to consecutive indices (0, 1, 2, ...);
    # enumerate avoids the repeated .index() lookups, which were slow and
    # picked the wrong entry whenever two documents had identical distributions
    topics_n = []
    for first_index, document in enumerate(topic_distributions):
        for second_index, topic_dist in enumerate(document):
            if topic_dist[0] not in topics_n:
                topics_n.append(topic_dist[0])
            topic_distributions[first_index][second_index][0] = topics_n.index(
                topic_dist[0])
    word_distributions = distributions.hdp_word_distribution(model=hdp_model,
                                                             topics_n=topics_n)
    doc_dist = distributions.hdp_doc_distribution(n_topic=n_topic,
                                                  topics_n=topics_n,
                                                  doc_number=doc_number,
                                                  model=hdp_model,
                                                  corpus=corpus)

    output = {
        "filecount": doc_number,
        "coherence_value": float(coherence_v),
        "word_distributions": word_distributions,
        "topic_distributions": topic_distributions,
        "doc_dist": doc_dist,
        "topics_n": topics_n,
        "data_tokens": data_tokens
    }

    return output
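hdp_word_distribution is the HDP counterpart of the word_distribution helper sketched earlier: it presumably reports only the topics that actually occur in topics_n, keeping them in the same remapped order. Under that assumption it could be written as:

def hdp_word_distribution(model, topics_n, topn=20):
    # topics_n holds the original HdpModel topic ids in remapped order
    return [[[word, float(weight)] for word, weight in model.show_topic(topic_id, topn=topn)]
            for topic_id in topics_n]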