Example #1
import glob
import os
import pickle

import guidedlda
import numpy as np
import tqdm
from nltk.stem import WordNetLemmatizer
from scipy.sparse import coo_matrix
from termcolor import colored


def main(folder):
    word2idx = pickle.load(open(os.path.join(folder, "word_idx.p"), "rb"))
    print(word2idx)
    # Load seed topics
    seed_topics_dic, topics = seed_topics(word2idx)
    
    idx_to_word = {v: k for k, v in word2idx.items()}
    # Set up the model
    print("Starting training...")
    lda = guidedlda.GuidedLDA(n_topics=len(topics), n_iter=100, random_state=7, refresh=20)
    
    
    ## Concat data
    row, col, data = np.array(()), np.array(()), np.array(())
    
    matrix_data_list = glob.glob(os.path.join(folder, "matrix_data_*.p"))
    np.random.shuffle(matrix_data_list)
    for doc in tqdm.tqdm(matrix_data_list):
        print("Loading", doc)
        res = pickle.load(open(doc, "rb"))
        row = np.append(row, np.int32(res["I"]))
        col = np.append(col, np.int32(res["J"]))
        data = np.append(data, np.int32(res["data"]))
    # Build the sparse document-term matrix once all counts are concatenated
    X = coo_matrix((np.int32(data), (np.int32(row), np.int32(col))))

    lda.fit(X, seed_topics=seed_topics_dic, seed_confidence=0)
    
    print("Training done")
    def print_top_words(model, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            message = "Topic #{} - {}: ".format(topic_idx, topics[topic_idx])
            message += " ".join([idx_to_word[i]
                                 for i in topic.argsort()[:-n_top_words - 1:-1]])
            print(message)
    
    def print_sentence_and_topic(sentence, topic):
        print(colored("Sentence:", "blue"), colored(sentence, "green"))
        print(colored("Topic:   ", "blue"), colored(topic, "red"))
    
    print_top_words(lda, 20)
    np.save(open(os.path.join(folder, "guided_components.npy"), "wb"), lda.components_)
    
    ## Test for input sentences
    stemmer = WordNetLemmatizer() 
    while True:
        sentence = input()
        list_words = [w.lower() for w in sentence.split()]
        np_array = np.zeros([1, len(word2idx.keys())])
        for word in list_words:
            stemmed_word = stemmer.lemmatize(word)
            if stemmed_word in word2idx:
                print(stemmed_word)
                np_array[0, word2idx[stemmed_word]] += 1
        topic_dist = lda.transform(np.int32(np_array))
        print_sentence_and_topic(sentence, topics[np.argmax(topic_dist)])
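The snippet calls a seed_topics helper that is not shown. A minimal sketch of what it could return, assuming hypothetical seed-word lists (guidedlda expects a dict mapping word index to topic id, plus the list of topic names used above):

def seed_topics(word2idx):
    # Hypothetical seed words per topic; the real lists are not part of the snippet
    topic_seeds = {
        "economy": ["market", "tax", "trade"],
        "health": ["hospital", "disease", "vaccine"],
    }
    topics = list(topic_seeds)
    seed_topics_dic = {}
    for topic_id, topic_name in enumerate(topics):
        for word in topic_seeds[topic_name]:
            if word in word2idx:
                seed_topics_dic[word2idx[word]] = topic_id
    return seed_topics_dic, topics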
Example #2
def run_lda_sklearn(self, n_topics):
    n_top_words = 12
    # n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21
    lda = LatentDirichletAllocation(n_components=n_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    lda.fit(self.tf)
    print("\nTopics in LDA model:")
    tf_feature_names = self.tf_vectorizer.get_feature_names()
    self.print_top_words(lda, tf_feature_names, n_top_words)
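The method relies on a print_top_words helper on the same class that is not shown here. A minimal sketch of what it might look like, following the usual scikit-learn topic-printing idiom (only the call signature appears above; the body is an assumption):

def print_top_words(self, model, feature_names, n_top_words):
    # For each topic, print the n_top_words terms with the largest weights
    for topic_idx, topic in enumerate(model.components_):
        top_terms = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        print("Topic #%d: %s" % (topic_idx, " ".join(top_terms)))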
Example #3
import warnings

warnings.simplefilter("ignore", DeprecationWarning)

# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA

import lda
from sklearn.feature_extraction.text import CountVectorizer

# Fit and transform the processed titles
count_vectorizer = CountVectorizer(
    max_df=0.99, min_df=3, ngram_range=(1, 1),
    stop_words=new_stop_word)
count_data = count_vectorizer.fit_transform(full_text_list_processed)

# Visualise the 30 most common words
plot_30_most_common_ngrams(count_data, count_vectorizer)


# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join(
            [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


# Tweak the two parameters below
number_topics = 20
number_words = 10

# Create and fit the LDA model
lda = lda.LDA(n_topics=number_topics)
lda.fit(count_data)

# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
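After fitting, the lda package also exposes the per-document topic distribution as doc_topic_. A minimal sketch of printing the dominant topic for the first few documents, reusing the lda model fitted above:

import numpy as np

doc_topic = lda.doc_topic_  # shape: (n_documents, number_topics)
for doc_idx in range(min(5, doc_topic.shape[0])):
    dominant = int(np.argmax(doc_topic[doc_idx]))
    print("Document #%d -> topic %d (p=%.2f)"
          % (doc_idx, dominant, doc_topic[doc_idx, dominant]))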
Example #4
# # Part 5: Topic Modeling - Latent Dirichlet Allocation

# In[28]:


from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=5, learning_method='online')
# LDA expects count-like input, so the tf-idf weights are scaled up and truncated to integers
tfidf_matrix_lda = (tfidf_matrix * 100)
tfidf_matrix_lda = tfidf_matrix_lda.astype(int)


# In[29]:


lda.fit(tfidf_matrix_lda)


# In[30]:


# 5 groups, 44 selected words
topic_word = lda.components_
print(topic_word.shape)


# In[31]:


n_top_words = 10
topic_keywords_list = []
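The keyword-extraction loop itself is cut off in this snippet. A minimal sketch of how topic_keywords_list could be filled, assuming a fitted vectorizer named tfidf_vectorizer whose vocabulary produced tfidf_matrix (the vectorizer name is hypothetical; it does not appear above):

feature_names = tfidf_vectorizer.get_feature_names_out()  # hypothetical vectorizer; get_feature_names() on older scikit-learn
for topic_idx, topic in enumerate(topic_word):
    top_indices = topic.argsort()[:-n_top_words - 1:-1]
    topic_keywords_list.append([feature_names[i] for i in top_indices])
print(topic_keywords_list)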
Example #5
X = np.array([[process(document,word) for word in all_words] 
											for document in corpus])
bar.finish()
'''
print("Extracting tf features for LDA...")
n_features = 1000
n_topics = 50 
n_top_words = 20 
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                stop_words='english')

tf = tf_vectorizer.fit_transform(corpus)
# n_topics was renamed to n_components in scikit-learn 0.19 and removed in 0.21
lda = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
lda.fit(tf)
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

'''
rcParams['text.usetex'] = True
fig = plt.figure()
ax = fig.add_subplot(111)
ax.imshow(lda.components_[:,:n_top_words],interpolation='nearest',aspect='auto',cmap=plt.cm.bone_r)
artist.adjust_spines(ax)
ax.set_xticks(xrange(n_top_words))
ax.set_xticklabels(map(artist.format,tf_feature_names[:n_top_words]),rotation='vertical')
ax.set_ylabel(artist.format('Topic'))
plt.tight_layout()
plt.show()
'''
# preprocessor=None, stop_words=None, token_pattern=r"(?u)\b\w+\b", ngram_range=(1,1), max_features=None)
# tf = vectorizer.fit_transform(clean_content)
n_features = 1000
tf_vectorizer = TfidfVectorizer(strip_accents='unicode',
                                max_features=n_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(clean_content)

# Define the number of topics
n_topics = 1
lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=50,
                                learning_method='online',
                                learning_offset=50,
                                random_state=0)
lda.fit(tf)
# n_topics = 5
# model = lda.LDA(n_topics = n_topics,n_iter = 500,random_state = 1)
# model.fit(tf)
'''
# Topic-word distribution
topic_word = model.topic_word_
print("type(topic_word): {}".format(type(topic_word)))
print("shape: {}".format(topic_word.shape))
print(clean_content[:3])
print(topic_word[:, :3])

for n in range(5):
    sum_pr = sum(topic_word[n, :])
    print("topic: {} sum: {}".format(n, sum_pr))
Example #7
def fit(self, params=solve_shared.Params(), callback=None):
    """Fits a model to this Corpus. params is a Params object from solve-shared.
    callback, if provided, should take two numbers: the first is the number of
    iterations done, the second the number of iterations that need to be done;
    it is used to report progress. Note that it will probably not be called for
    every iteration, for reasons of efficiency."""
    lda.fit(self, params, callback)
Example #8
                                           '一种','位于','之一','天空','没有','很多','有点','什么','五个',
                                           '特别','微博','链接','全文','展开','网页','自己','今天','现在','视频'],
                                max_df=0.99,
                                min_df=0.002)  # drop words that occur in too many or too few documents

tf = tf_vectorizer.fit_transform(corpus)

print(tf.shape)
print(tf)

# -------------------------  Step 3: LDA analysis  ------------------------
# Set the number of topics
n_topics = 1

lda = lda.LDA(n_topics=n_topics, n_iter=1500, random_state=1)
lda.fit(tf.A.astype(np.int32))

# Topic-word distribution (model.topic_word_)
print(lda.components_)
# One row per topic, one column per keyword
print(lda.components_.shape)

# Topic-keyword distribution
def print_top_words(model, tf_feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):  # lda.components_ is equivalent to model.topic_word_
        print('Topic #%d:' % topic_idx)
        print(' '.join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print("")

# With the function defined, print the top 20 keywords of each topic for now
n_top_words = 20                                       
cur = con.cursor()
cur.execute("select * from headlines")
results = cur.fetchall()

#tf-idf the articles
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform([*map(lambda x: x['text'], results)])

for item in X[0]:
    print(item)

#print(vectorizer.get_feature_names())

svd = TruncatedSVD(n_components=100, n_iter=100)
lda = LatentDirichletAllocation(n_components=10)
L = lda.fit(X)
S = svd.fit(X)
#normalizer = Normalizer(copy=False)
#lsa = make_pipeline(svd, normalizer)
#X = lsa.fit_transform(X)

terms = vectorizer.get_feature_names()
for i, comp in enumerate(S.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:20]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")
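The loop above prints the SVD concepts only. A minimal sketch of the analogous printout for the fitted LDA model, reusing the L estimator and terms defined above:

for i, topic in enumerate(L.components_):
    # Pair each term with its weight in this topic and keep the 20 strongest
    top_terms = sorted(zip(terms, topic), key=lambda x: x[1], reverse=True)[:20]
    print("Topic %d:" % i)
    for term, weight in top_terms:
        print(term)
    print(" ")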

#jaccard similarity on vector
Example #10
def fit(self, params=solve_shared.Params(), callback=None):
    """Fits a model to this Corpus. params is a Params object from solve-shared.
    callback, if provided, should take two numbers: the first is the number of
    iterations done, the second the number of iterations that need to be done;
    it is used to report progress. Note that it will probably not be called for
    every iteration, for reasons of efficiency."""
    lda.fit(self, params, callback)
Example #11
matrix_data_list = glob.glob("ECJ_gendered/matrix_data_*.p")
np.random.shuffle(matrix_data_list)
for doc in tqdm.tqdm(matrix_data_list):
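    # sklearn path: reset the buffers per document and call partial_fit on each batch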
    if MODEL == "sklearn":
        row, col, data = np.array(()), np.array(()), np.array(())
    print("Partial fitting", doc)
    res = pickle.load(open(doc, "rb"))
    row = np.append(row, np.int32(res["I"]))
    col = np.append(col, np.int32(res["J"]))
    data = np.append(data, np.int32(res["data"]))
    X = coo_matrix((np.int32(data), (np.int32(row), np.int32(col))))
    if MODEL == "sklearn":
        lda.partial_fit(X)
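# Non-sklearn path: X has accumulated every document, so fit once after the loop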
if MODEL != "sklearn":
    lda.fit(X)
#    break

print("Training done")
def print_top_words(model, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([idx_to_word[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)

def print_sentence_and_topic(sentence, topic):
    print(colored("Sentence:", "blue"), colored(sentence, "green"))
    print(colored("Topic:   ", "blue"), colored(topic, "red"))

print_top_words(lda, 20)