Example #1
    def compute_values(self, kmin, kmax, kstep):
        # Vectorize the documents into a term-document count matrix.
        vec = CountVectorizer()
        X = vec.fit_transform(self.docs)
        
        # get vocabulary and biterms from docs
        vocab = np.array(vec.get_feature_names())
        biterms = vec_to_biterms(X)

        # create a BTM and pass the biterms to train it
        btm = oBTM(num_topics=20, V=vocab)
        topics = btm.fit_transform(biterms, iterations=100)
        topic_summuary(btm.phi_wz.T, X, vocab, 10)
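
For reference, this snippet assumes the following module-level imports (a minimal sketch; the enclosing class that provides self.docs is not shown):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from biterm.cbtm import oBTM
from biterm.utility import vec_to_biterms, topic_summuary  # 'summuary' is the library's own spelling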
        
Example #2
def biterm_topic_model_topic_extraction():
    """
    Function performs topic extraction on Tweets using the Biterm Topic Model (BTM).

    :return: None.
    """
    # BTM, like LDA, requires raw term counts because it is a probabilistic graphical model.
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=1000, stop_words='english')
    tf = tf_vectorizer.fit_transform(slo_feature_series)
    tf_feature_names = tf_vectorizer.get_feature_names()

    log.info(f"\n.fit_transform - Learn the vocabulary dictionary and return term-document matrix.")
    log.info(f"{tf}\n")
    log.info(f"\n.get_feature_names - Array mapping from feature integer indices to feature name")
    log.info(f"{tf_feature_names}\n")

    # Convert corpus of documents (vectorized text) to numpy array.
    tf_array = tf.toarray()

    # Convert dictionary of words (vocabulary) to numpy array.
    tf_feature_names = np.array(tf_vectorizer.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(tf_array)

    # create btm
    btm = oBTM(num_topics=20, V=tf_feature_names)

    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100):  # process the per-document biterm lists in chunks of 100
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)
    time.sleep(3)

    # print("\n\n Visualize Topics ..")
    # vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(tf_array, axis=1), tf_feature_names, np.sum(tf_array, axis=0))
    # pyLDAvis.save_html(vis, './vis/online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, tf_array, tf_feature_names, 10)

    print("\n\n Texts & Topics ..")
    for i in range(1, 10):
        print("{} (topic: {})".format(slo_feature_series[i], topics[i].argmax()))
Example #3
if __name__ == "__main__":

    texts = open('./data/reuters.titles').read().splitlines()

    # vectorize texts
    vec = CountVectorizer(stop_words='english')
    X = vec.fit_transform(texts).toarray()

    # get vocabulary
    vocab = np.array(vec.get_feature_names())

    # get biterms
    biterms = vec_to_biterms(X)

    # create btm
    btm = oBTM(num_topics=20, V=vocab)

    print("\n\n Train Online BTM ..")
    for i in range(0, len(biterms), 100):  # process the per-document biterm lists in chunks of 100
        biterms_chunk = biterms[i:i + 100]
        btm.fit(biterms_chunk, iterations=50)
    topics = btm.transform(biterms)

    print("\n\n Visualize Topics ..")
    vis = pyLDAvis.prepare(btm.phi_wz.T, topics, np.count_nonzero(X, axis=1),
                           vocab, np.sum(X, axis=0))
    pyLDAvis.save_html(vis, './vis/online_btm.html')

    print("\n\n Topic coherence ..")
    topic_summuary(btm.phi_wz.T, X, vocab, 10)
Example #4
df_cl.query('cluster == 1')

# %%

# Biterm topic model

# get biterms
from biterm.utility import vec_to_biterms
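# count_vect and doc_term_matrix are assumed from earlier notebook cells (not shown here).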

vocab = np.array(count_vect.get_feature_names())
biterms = vec_to_biterms(doc_term_matrix[:1000, :])
# %%

from biterm.cbtm import oBTM

btm = oBTM(num_topics=3, V=vocab)
topics = btm.fit_transform(biterms, iterations=100)
# %%
topics.shape
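# %%
# topics is an (n_documents, num_topics) array of per-document topic
# proportions. A minimal sketch of reading off the dominant topic per
# document (assumes the `topics` array from the cell above):
dominant_topic = topics.argmax(axis=1)  # index of the strongest topic in each row
print(dominant_topic[:10])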
# %%

# Find subjects of sentences

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
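# %%
# The loop above prints every token's attributes. To actually extract the
# subjects the section title promises, filter on spaCy's dependency labels
# (a minimal sketch reusing the `doc` parsed above):
subjects = [token.text for token in doc if token.dep_ in ("nsubj", "nsubjpass")]
print(subjects)  # -> ['Apple'] for the example sentence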