Example #1
from gensim import corpora, models, similarities, utils

# Build the dictionary
dictionary = corpora.Dictionary(train_set)
# Filter out extremely low-frequency noise words
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=None)
# Save the dictionary and the corpus; the corpus is converted to bag-of-words form for later use
dictionary.save(output + "all.dic")
corpus = [dictionary.doc2bow(text) for text in train_set]
saveObject(output + "all.cps", corpus)
# Store the original document data
saveObject(output + "all.info", docinfos)

# Build the TF-IDF model
# Train the TF-IDF model on the raw bag-of-words corpus
tfidfModel = models.TfidfModel(corpus)
# Generate TF-IDF vectors with the model
tfidfVectors = tfidfModel[corpus]
# Save the tfidfModel
tfidfModel.save(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity(tfidfVectors)
indexTfidf.save(output + "allTFIDF.idx")

# LDA model
lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=30)
lda.save(output + "allLDA50Topic.mdl")
corpus_lda = lda[tfidfVectors]
indexLDA = similarities.MatrixSimilarity(corpus_lda)
indexLDA.save(output + "allLDA50Topic.idx")
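
# A hedged usage sketch (not part of the original example): reload the artifacts saved above
# and score one unseen document against the TF-IDF similarity index. `new_doc_tokens` is a
# hypothetical, already-tokenized document.
dictionary = corpora.Dictionary.load(output + "all.dic")
tfidfModel = models.TfidfModel.load(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity.load(output + "allTFIDF.idx")

new_doc_tokens = ["example", "tokens"]        # hypothetical input
new_bow = dictionary.doc2bow(new_doc_tokens)  # bag-of-words vector
new_tfidf = tfidfModel[new_bow]               # TF-IDF weighting
sims = indexTfidf[new_tfidf]                  # cosine similarity against every training document
print(sorted(enumerate(sims), key=lambda x: -x[1])[:5])  # five closest documents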
Example #2
#pprint(texts)

dictionary = corpora.Dictionary(texts)
#dictionary.save('/tmp/deerwester.dict')  # store the dictionary, for future reference
#print(dictionary)

corpus = [dictionary.doc2bow(text) for text in texts]
print(corpus)
'''
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
for doc in corpus_tfidf:
    print(doc)
'''

lda = models.LdaModel(corpus, id2word=dictionary, num_topics=50)
corpus_lsi = lda[corpus]
lda.print_topics(50)

for doc in corpus_lsi:
    print("********************DOCUMENTS*****************", doc)

# pyLDAvis cannot consume the transformed corpus directly; a hedged fix is to use its gensim
# helper (pyLDAvis.gensim.prepare in older releases, pyLDAvis.gensim_models.prepare in 3.x).
prepared = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(prepared)

#for i in range(0, lda.num_topics-1):
#    print lda.print_topic(i)

#for doc in corpus_lsi: # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print(doc)
'''
def train_Various_LDA_Models(corpus_NounAdj, id2word_nounAdj, listOfTwitsAndUniqueWords, runRegardless,
                             baseSavePath="/content/drive/MyDrive/Harvard HW/Course 2 - Final Project/omertest/",
                             overrideTrainSettings=True):
    if type(overrideTrainSettings) == dict:  # If empty, use combination below
        print('overrideTrainSettings, will use ONLY set values of 4 Topics + 10 Passes')
        numTopics = overrideTrainSettings['numTopics']
        numOfPasses = overrideTrainSettings['numOfPasses']
    else:
        numTopics = [8]
        numOfPasses = [2]
    print('Will test on %s topics, %s passes' % (numTopics, numOfPasses))

    outputSavePath = os.path.join(baseSavePath, "LDA_Topic_Model_Output.csv")
    runRegardless = True

    if not os.path.exists(outputSavePath) or runRegardless:
        if not os.path.exists(baseSavePath):
            os.makedirs(baseSavePath)

        ldaResultOutput = {}
        runCount = 0
        for top in numTopics:
            for passN in numOfPasses:
                runCount+=1

                ldaModelTitle = '\nLDA_%s_Topics_%s_Passes - RunCount: %s' % (top, passN,runCount)
                print(ldaModelTitle)
                # Start the clock:
                start_time = time()

                ldaResultOutput[ldaModelTitle] = {'TopicNum': top, 'PassNum': passN}

                lda_nounAdj = gensin_models.LdaModel(corpus=corpus_NounAdj, num_topics=top, passes=passN,
                                                     id2word=id2word_nounAdj,iterations=100)

                print("Created gensin_models.LdaModel")
                # perplexity
                Perplexity = lda_nounAdj.log_perplexity(corpus_NounAdj)

                # coherence score
                coherence_model = CoherenceModel(model=lda_nounAdj, texts=listOfTwitsAndUniqueWords.values(),
                                            dictionary=id2word_nounAdj, coherence='c_v',processes=1) ## processes must be 1 or else freeze issues

                print("Created CoherenceModel")
                try:
                    coherence = coherence_model.get_coherence()
                except Exception as e:
                    print("Exception when running coherence_model.get_coherence():")
                    print(e)
                    coherence=0

                ldaResultOutput[ldaModelTitle]['TopicNum'] = top
                ldaResultOutput[ldaModelTitle]['PassNum'] = passN
                ldaResultOutput[ldaModelTitle]['Perplexity'] = Perplexity
                ldaResultOutput[ldaModelTitle]['Coherence'] = coherence

                print('Num of topics: %s | Num of passes: %s | Perplexity: %s | Coherence Score: %s' % (
                top, passN, Perplexity, coherence))
                timeInSeconds = time() - start_time
                print('Finished training %s in %s seconds.\n' % (ldaModelTitle, int(timeInSeconds)))

        ldaResultOutput_df = pd.DataFrame(ldaResultOutput).T.sort_values(by=['Perplexity'], ascending=True)

        ldaResultOutput_df.to_csv(outputSavePath)

    return ldaResultOutput_df
 def get_lda(self, num_topics=100):
     docs_corpus = [self.docs_dict.doc2bow(doc) for doc in self.docs]
     model_lda = models.LdaModel(docs_corpus, num_topics, id2word=self.docs_dict)
     docs_lda  = model_lda[docs_corpus]
     docs_vecs   = np.vstack([sparse2full(c, len(self.docs_dict)) for c in docs_lda])
     return docs_vecs
tokenized_text = [tokenize_only(text) for text in preprocess]

#remove stop words
texts = [[word for word in text if word not in stopwords] for text in tokenized_text]

dictionary = corpora.Dictionary(texts)

#remove extremes (similar to the min/max df step used when creating the tf-idf matrix)
dictionary.filter_extremes(no_below=1, no_above=0.8)

#convert the dictionary to a bag of words corpus for reference
corpus = [dictionary.doc2bow(text) for text in texts]
          
lda = models.LdaModel(corpus, num_topics=num_clusters, 
                            id2word=dictionary, 
                            update_every=5, 
                            chunksize=10000, 
                            passes=100)


topics_matrix = lda.show_topics(formatted=False, num_words=20)

for entry in topics_matrix:
    index = entry[0]
    words = entry[1]
    words.sort(key = lambda x:x[1],reverse=True)
    word = [x[0] for x in words]
    print(index,word[:5])
#    
#0 ['research', 'studi', 'water', 'use', 'found']
#1 ['wast', 'energi', 'use', 'recycl', 'solar']
'''
    sep sets the delimiter: csv uses the half-width comma by default, and that character happens to be in the stopword list, which breaks the read;
    the workaround is to manually set a delimiter that never appears in the data
'''
stop = [' ', ''] + list(stop[0])

baddata[1] = baddata[0].apply(lambda s: s.split(" "))
baddata[2] = baddata[1].apply(lambda x: [i for i in x if i not in stop])

gooddata[1] = gooddata[0].apply(lambda s: s.split(" "))
gooddata[2] = gooddata[1].apply(lambda x: [i for i in x if i not in stop])
'''
    Topic analysis of negative comments
'''
bad_dict = corpora.Dictionary(baddata[2])
bad_corpus = [bad_dict.doc2bow(i) for i in baddata[2]]
bad_lda = models.LdaModel(bad_corpus, num_topics=3, id2word=bad_dict)
for i in range(3):
    print(bad_lda.print_topic(i))
    bad_lda.print_topic(i)

print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
'''
    Topic analysis of positive comments
'''
good_dict = corpora.Dictionary(gooddata[2])
good_corpus = [good_dict.doc2bow(i) for i in gooddata[2]]
good_lda = models.LdaModel(good_corpus, num_topics=3, id2word=good_dict)
for i in range(3):
    print(good_lda.print_topic(i))
    good_lda.print_topic(i)
Example #7
    stem_text
]


def preprocessing(corpus):
    for document in corpus:
        doc = strip_numeric(document)
        doc = remove_stopwords(doc)
        doc = strip_short(doc, 3)
        #doc = stem_text(doc)
        doc = strip_punctuation(doc)
        doc = strip_tags(doc)  # assign the result so the tag stripping is not a no-op
        yield gensim.utils.tokenize(doc, lower=True)


texts = preprocessing(corpus)
dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=1, keep_n=25000)

doc_term_matrix = [
    dictionary.doc2bow(tokens) for tokens in preprocessing(corpus)
]
tfidf = models.TfidfModel(doc_term_matrix)
corpus_tfidf = tfidf[doc_term_matrix]

lda = models.LdaModel(corpus_tfidf, num_topics=10, id2word=dictionary)
topics = lda.print_topics(num_words=25)
for i in topics:
    print(i[0])
    print(i[1])
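
# A hedged follow-up sketch: topic distribution of the first preprocessed document under the
# LDA model trained above.
first_doc_tfidf = tfidf[doc_term_matrix[0]]
print(lda.get_document_topics(first_doc_tfidf))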
Example #8
import numpy as np
from gensim import corpora, models, similarities
from db2text import get_tweets
from db2text import clean_txt

tweets = get_tweets()
clean_tweets = []
for tweet in tweets:
    clean_tweet = clean_txt(tweet)
    clean_tweets.append(clean_tweet)

# Build a gensim dictionary from the texts
dictionary = corpora.Dictionary(clean_tweets)
dic_keys = list(dictionary.keys())
print(dic_keys[0])
# Convert each text into a bag-of-words vector for reference
corpus = [dictionary.doc2bow(tweet) for tweet in clean_tweets]
lda = models.LdaModel(corpus,
                      num_topics=10,
                      id2word=dictionary,
                      update_every=5,
                      chunksize=10000,
                      passes=100)

lda.show_topics()
topics_matrix = lda.show_topics(formatted=False, num_words=20)
topics_matrix = np.array(topics_matrix)

topic_words = topics_matrix[:, :, 1]
for i in topic_words:
    print([str(word) for word in i])
Example #9
def train_lda_model_gensim(corpus, total_topics=5): 
    norm_tokenized_corpus = normalize_corpus(corpus, lemmatize= False, tokenize=True) #normalize
    dictionary = corpora.Dictionary(norm_tokenized_corpus)          #create a dictionary for your corpus
    corpus_bow = [dictionary.doc2bow(text) for text in norm_tokenized_corpus] #create bag of words
    lda = models.LdaModel(corpus_bow, id2word = dictionary, iterations=1000, num_topics=total_topics) #define model
    return lda
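
# A hedged usage sketch, assuming `normalize_corpus` from the surrounding project is importable;
# `raw_documents` is a hypothetical list of plain-text strings.
raw_documents = [
    "the economy grew faster than expected this quarter",
    "the team won the championship after a late goal",
]
lda_model = train_lda_model_gensim(raw_documents, total_topics=2)
for topic_id, topic in lda_model.print_topics(num_topics=2, num_words=5):
    print(topic_id, topic)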
Example #10
time_taken = time2 - time1
print(time_taken)

df_doc_term.tail()
#%%
# TOPIC MODELLING - COUNT VECTORIZER
# 1. LDA, Gensim
time1 = time.time()
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(sparse_matrix)
print(corpus)
# Map matrix rows to words (tokens)
id2word = dict((v, k) for k, v in word_vectorizer.vocabulary_.items())
len(id2word)
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)
# 10 most important words for each of the 3 topics
lda.print_topics()
# Transform the docs from word space to topic space
lda_corpus = lda[corpus]
# Store the doc topic vectors in a list for review
lda_docs = [doc for doc in lda_corpus]
# Find the document vectors in the topic space for the first 10 documents
lda_docs[0:10]
time2 = time.time()
time_taken = time2 - time1
print(time_taken)

#%%
lda.print_topics()
Example #11
RANDOM_STATE = 1

# Database and other resources
DATABASE_PATH = config['paths']['database']
LDA_PATH = config['paths']['lda']
DICTIONARY_PATH = config['paths']['dictionary']
CORPUS_PATH = config['paths']['corpus']

# Execution
content = Content(DATABASE_PATH)
dictionary = corpora.Dictionary(content)
# Remove words that appear in fewer than 5 documents or in more than 80% of the documents
dictionary.filter_extremes(no_below=5, no_above=0.8)
corpus = [dictionary.doc2bow(text) for text in content]

# LDA Model
lda = models.LdaModel(corpus,
                      id2word=dictionary,
                      random_state=RANDOM_STATE,
                      num_topics=NUM_TOPICS,
                      passes=NUM_PASSES)

# Save resources
lda.save(LDA_PATH)
with open(DICTIONARY_PATH, 'wb') as fp:
    pickle.dump(dictionary, fp)
with open(CORPUS_PATH, 'wb') as fp:
    pickle.dump(corpus, fp)
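
# A hedged sketch of the matching load step: reload the saved LDA model and the pickled
# dictionary/corpus for later inference, reusing the path constants defined above.
lda = models.LdaModel.load(LDA_PATH)
with open(DICTIONARY_PATH, 'rb') as fp:
    dictionary = pickle.load(fp)
with open(CORPUS_PATH, 'rb') as fp:
    corpus = pickle.load(fp)
print(lda.get_document_topics(corpus[0]))  # topic distribution of the first document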
Example #12
def main():
    corpus = {}
    with open('corpus_data/preprocessedf_corpus.json') as corpus:
        corpus = json.loads(corpus.read().encode('utf-8'))

    corpus_2 = defaultdict(str)
    for artist, songlist in corpus.items():
        for song in songlist:
            lyrics = song['lyrics'].strip('\\')
            corpus_2[artist] += lyrics
    features = {}
    with open('corpus_data/artist_features.json') as features:
        features = json.loads(features.read())

    finalcorpus = []

    for artist, lyrics in corpus_2.items():
        d = {}
        d['artist'] = artist
        d['lyrics'] = lyrics
        d['pos'] = features[artist]['pos_counts']
        finalcorpus.append(d)

    df = pd.DataFrame(finalcorpus)

    # nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    """TOPIC MODELING HOPEFULLY"""
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    STOPWORDS = stopwords.words('english')
    PROFANITY = set()
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        for el in infile:
            PROFANITY.add(el)

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    for index, row in df.iterrows():
        row['lyrics'] = clean_text(row['lyrics'], sys.argv[1])
    from gensim import models, corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.test.utils import common_corpus, common_texts, get_tmpfile

    all_lyrics = []
    all_artists = []
    for index, row in df.iterrows():
        all_lyrics.append(row['lyrics'])
        all_artists.append(row['artist'])

    #common_dictionary = Dictionary(common_texts)
    #common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    #lda_model = models.LdaModel(common_corpus, num_topics=10)
    dictionary = corpora.Dictionary(all_lyrics)
    corpus = [dictionary.doc2bow(text) for text in all_lyrics]

    NUM_TOPICS = 25
    lda_model = models.LdaModel(corpus=corpus,
                                num_topics=25,
                                id2word=dictionary,
                                passes=20)

    topics = lda_model.print_topics(num_words=4)
    print('LDA Topics')
    for topic in topics:
        print(topic)

    lsi_model = models.LsiModel(corpus=corpus,
                                num_topics=NUM_TOPICS,
                                id2word=dictionary)
    topics = lsi_model.print_topics(num_words=4)
    print('LSI TOPICS')
    for topic in topics:
        print(topic)

    from gensim import similarities

    text = ""
    with open(sys.argv[2]) as inf:
        inf = inf.read()
        text = inf

    bow = dictionary.doc2bow(clean_text(text, sys.argv[1]))
    lda_index = similarities.MatrixSimilarity(lda_model[corpus])
    lsi_index = similarities.MatrixSimilarity(lsi_model[corpus])
    # Let's perform some queries
    similarities = lda_index[lda_model[bow]]
    # Sort the similarities
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

    similaritiesLSI = lsi_index[lsi_model[bow]]

    similaritiesLSI = sorted(enumerate(similaritiesLSI),
                             key=lambda item: -item[1])

    # Top most similar documents:
    #print(similarities[:10])
    # [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]

    # Let's see what's the most similar document
    document_id, similarity = similarities[0]
    document_id2, similarityLSI = similaritiesLSI[0]

    # print(all_lyrics[document_id][:1000])
    print("LDA : TOP 5 Similar ARTISTS")
    for el in similarities[:5]:
        print(all_artists[el[0]])

    print('')
    print('LSI : Top 5 Similar Artists')
    for el in similaritiesLSI[:5]:
        print(all_artists[el[0]])
Example #13
import jieba, os
from gensim import corpora, models, similarities

train_set = []

walk = os.walk('C:\\Users\\Sun Yutian\\Desktop\\test')
for root, dirs, files in walk:
    for name in files:
        f = open(os.path.join(root, name), 'r')
        raw = f.read()
        word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
        train_set.append(word_list)

dic = corpora.Dictionary(train_set)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_lda = lda[corpus_tfidf]

f = open('dat', 'r')
raw = f.read()
word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
vec_bow = dic.doc2bow(word_list)
vec_lda = lda[vec_bow]

index = similarities.MatrixSimilarity(lda[corpus])
sims = index[vec_lda]
print(list(enumerate(sims)))
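
# A hedged follow-up sketch: rank the training documents by similarity to the query instead of
# printing the raw (index, score) pairs.
ranked = sorted(enumerate(sims), key=lambda item: -item[1])
for doc_id, score in ranked[:5]:
    print(doc_id, score)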
Example #14
def LDA_post(infile, outfile, topic = 14):
    docs = []
    # f = open(infile, 'r')
    # line = f.readline()
    # while line:
    #   docs.append(line.lower().split('\t')[1])
    #   line = f.readline()
    # f.close()

    with open(infile, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter = ',', quotechar = '"')
        header = next(spamreader)
        for row in spamreader:
            docs.append(row[1])

    texts = []
    widgets = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets = widgets)
    for doc in pbar((doc for doc in docs)):
        texts.append([word for word in wordProcBase.tokenize_tweet(doc) if word not in stopwords.words('english')])
        # doc = wordProcBase.tokenize5(doc.decode('utf-8'))
        # texts.append([word for word in doc if word not in stopwords.words('english')])
    pbar.finish()

    pprint.pprint(texts)
    return

    # create a Gensim dictionary form the texts
    dictionary = corpora.Dictionary(texts)

    # remove extremes
    dictionary.filter_extremes(no_below = 1, no_above = 0.85)

    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(text) for text in texts]

    print ('Applying LDA...')
    lda = models.LdaModel(corpus, num_topics = topic, id2word = dictionary, update_every = 1, chunksize = 10000, passes = 100, minimum_probability = 0.001)

    topics = lda.show_topics(num_topics = topic, num_words = 5)

    # pprint.pprint(lda.print_topics(num_topics = topic)) 

    # pprint.pprint(topics)

    print ('Writing results into file...')
    # Write the results to a file
    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')

        top_prob = lda.get_document_topics(corpus) #a list of (topic_id, topic_probability) 2-tuples
        index = 1
        for prob in top_prob:
            string = [0 for i in range(topic)]
            prob = sorted(prob, key = operator.itemgetter(0), reverse = False)
            for i, p in prob:
                string[i] = p
            spamwriter.writerow(string)
            index += 1

    return

    '''
    # reading unseen data
    '''
    print ('Reading unseen data...')
    unseen = _MAIN_DIR_ + "/Data/VA_Proc/emtion_tweets/survey/google_survey_data.csv"
    docs = []
    with open(unseen, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter = ',', quotechar = '"')
        for row in spamreader:
            docs.append(row[1])
    texts = []
    for doc in docs:
        texts.append([word for word in wordProcBase.tokenize3(doc.decode('utf-8')) if word not in stopwords.words('english')])

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below = 1, no_above = 0.85)
    corpus = [dictionary.doc2bow(text) for text in texts]

    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')

        top_prob = lda.get_document_topics(corpus)
        index = 1
        for prob in top_prob:
            string = [index]
            for i in range(0, len(prob)):
                string.append(prob[i][1])
            spamwriter.writerow(string)
            index += 1
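
# A minimal invocation sketch for LDA_post; the file names are placeholders. Note that, as
# written, the function returns right after pprint(texts), so the LDA training and CSV export
# further down only run if that early return is removed.
LDA_post('tweets.csv', 'doc_topics.csv', topic=14)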
Example #15
    caselist,idlist = getDataFromMongo(col1)
    print("out mongo")
    print("calist lenth: %d" % len(caselist))

    print('build dictionary')
    dictionary = corpora.Dictionary(caselist)
    #dictionary.save('lda.dct')
    dict_len = len(dictionary)
    # transform the whole texts to sparse vector
    corpus = [dictionary.doc2bow(case) for case in caselist]
    print(len(corpus))

    print('build lda')
    num_topics = 6
    # create a transformation, from tf-idf model to lda model
    lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary,
          alpha=0.01, eta=0.01, minimum_probability=0.001, update_every = 1, chunksize = 100, passes = 1)
    print('out lda')
    #lda.save('lda.model')

    doc_topics = lda.get_document_topics(corpus)

    dislist = getDis(doc_topics, num_topics)

    print(len(idlist), len(dislist))
    write2mongo(col2, idlist, dislist)


    # num_show_term = 10   # how many words to show for each topic
    # for topic_id in range(num_topics):
    #     logging.info('Words and probabilities for topic %d:\t' % topic_id)
    #     term_distribute_all = lda.get_topic_terms(topicid=topic_id)
def construct_lda_sim_graph(corpus, args):
    """
    compute lda vector similarity between paragraphs
    :param corpus:
    :param args:
    :return:
    """
    sim_graph = []
    raw_corpus = [' '.join(para) for para in corpus]

    # create English stop words list
    stoplist = set(stopwords.words('english'))
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # Lowercase each document, split it by white space and filter out stopwords
    texts = [[word for word in para.lower().split() if word not in stoplist]
             for para in raw_corpus]
    # Create a set of frequent words
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # stem each word
    processed_corpus = [[p_stemmer.stem(token) for token in text]
                        for text in texts]

    dictionary = corpora.Dictionary(processed_corpus)

    if len(dictionary) < args.num_topics:
        return None

    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    # train the model
    if args.find_opt_num:
        lda = get_optimal_ldamodel_by_coherence_values(corpus=bow_corpus,
                                                       texts=processed_corpus,
                                                       dictionary=dictionary)
    else:
        lda = models.LdaModel(corpus=bow_corpus,
                              num_topics=args.num_topics,
                              id2word=dictionary,
                              alpha='auto',
                              eta='auto',
                              eval_every=None,
                              minimum_probability=0.0)
    # LdaMulticore(bow_corpus, id2word=dictionary, num_topics=args.num_topics, eta='auto',
    # eval_every=None, minimum_probability=0.0)

    corpus_lda = lda[
        bow_corpus]  # create a double wrapper over the original corpus: bow->lda
    index = similarities.MatrixSimilarity(corpus_lda,
                                          num_features=len(dictionary))

    print("corpus_lda[0]: %s" % str(corpus_lda[0]))

    total = 0.
    count_large = 0.
    for i in range(len(corpus_lda)):
        sim = index[corpus_lda[i]]

        assert len(sim) == len(corpus_lda), "the lda sim is not correct!"
        sim_graph.append(sim)

        for s in sim:
            total += 1
            if s > args.sim_threshold:
                count_large += 1

    print("sim_graph[0]: %s" % str(sim_graph[0]))
    return sim_graph, count_large, total
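
# A hedged usage sketch, assuming the NLTK stopword list and PorterStemmer used inside the
# function are available; `args` mimics the expected argparse-style namespace and the
# paragraphs are hypothetical token lists.
from types import SimpleNamespace

paragraphs = [
    ["stocks", "rallied", "after", "the", "earnings", "report"],
    ["the", "central", "bank", "kept", "interest", "rates", "unchanged"],
    ["investors", "worried", "about", "inflation", "and", "interest", "rates"],
]
args = SimpleNamespace(num_topics=2, find_opt_num=False, sim_threshold=0.5)
result = construct_lda_sim_graph(paragraphs, args)
if result is not None:  # None is returned when the vocabulary is smaller than num_topics
    sim_graph, count_large, total = result
    print(len(sim_graph), count_large, total)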
Example #17
    print(sentences)

    ############################################################################################################################
    print('***********************')
    ################################ Print the topic words of this text (the list of all sentences) ##################
    film_dict = corpora.Dictionary(sentences)

    i = 0
    for w in film_dict.values():
        i += 1
        print(i, w)

    film_corpus = [film_dict.doc2bow(i) for i in sentences]
    print(film_corpus)
    #
    film_lda = models.LdaModel(film_corpus, num_topics=3, id2word=film_dict)
    for i in range(3):
        print(film_lda.print_topic(i))  # print each topic

########################################################################################

    #################################### Use a Word2Vec model to compute word vectors and measure word relatedness #############################################################################
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    #sentences = word2vec.Text8Corpus(u"E:\word.txt",encoding= 'utf-8')
    w = []
    for i in range(100):
        model = gensim.models.Word2Vec(sentences,
                                       size=200)  # train the skip-gram model; window defaults to 5

        print(model.similarity(u'美国', u'经济'))  # similarity/relatedness of the two words ("United States", "economy")
def lda(export_perplexity=False):
    np.set_printoptions(linewidth=300)
    data = pd.read_csv('QQ_chat_result.csv', header=0, encoding='utf-8')
    texts = []
    for info in data['Info']:
        texts.append(info.split(' '))
    M = len(texts)
    print('Number of documents: %d' % M)
    # pprint(texts)

    print('Building the dictionary --')
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)
    print('Computing document vectors --')
    corpus = [dictionary.doc2bow(text) for text in texts]
    print('Computing document TF-IDF --')
    t_start = time.time()
    corpus_tfidf = models.TfidfModel(corpus)[corpus]
    print('Document TF-IDF built in %.3f seconds' % (time.time() - t_start))
    print('Fitting the LDA model --')
    num_topics = 20
    t_start = time.time()
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=num_topics,
                          id2word=dictionary,
                          alpha=0.001,
                          eta=0.02,
                          minimum_probability=0,
                          update_every=1,
                          chunksize=1000,
                          passes=20)
    print('LDA model finished; training time\t%.3f seconds' % (time.time() - t_start))
    if export_perplexity:
        export_perplexity1(corpus_tfidf, dictionary, corpus)
        # export_perplexity2(corpus_tfidf, dictionary, corpus)
    # # Topics of all documents
    # doc_topic = [a for a in lda[corpus_tfidf]]
    # print 'Document-Topic:\n'
    # pprint(doc_topic)

    num_show_term = 7  # how many words to show per topic
    print('Word distribution of each topic:')
    for topic_id in range(num_topics):
        print('Topic #%d:\t' % topic_id, end=' ')
        term_distribute_all = lda.get_topic_terms(topicid=topic_id)
        term_distribute = term_distribute_all[:num_show_term]
        term_distribute = np.array(term_distribute)
        term_id = term_distribute[:, 0].astype(int)
        for t in term_id:
            print(dictionary.id2token[t], end=' ')
        print('\nProbabilities:\t', term_distribute[:, 1])

    # Print the topics of 10 randomly chosen documents
    np.set_printoptions(linewidth=200, suppress=True)
    num_show_topic = 10  # how many top topics to show per document
    print('Topic distribution of 10 users:')
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of every document
    idx = np.arange(M)
    np.random.shuffle(idx)
    idx = idx[:10]
    for i in idx:
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        # print topic_distribute
        topic_idx = topic_distribute.argsort()[:-num_show_topic - 1:-1]
        print(('User #%d, top %d topics:' % (i, num_show_topic)), topic_idx)
        print(topic_distribute[topic_idx])
    # Plot the topics of these 10 documents
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    mpl.rcParams['axes.unicode_minus'] = False
    plt.figure(figsize=(8, 7), facecolor='w')
    for i, k in enumerate(idx):
        ax = plt.subplot(5, 2, i + 1)
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        ax.stem(topic_distribute, linefmt='g-', markerfmt='ro')
        ax.set_xlim(-1, num_topics + 1)
        ax.set_ylim(0, 1)
        ax.set_ylabel("概率")
        ax.set_title("用户 {}".format(k))
        plt.grid(b=True, axis='both', ls=':', color='#606060')
    plt.xlabel("主题", fontsize=13)
    plt.suptitle('用户的主题分布', fontsize=15)
    plt.tight_layout(1, rect=(0, 0, 1, 0.95))
    plt.show()

    # Compute the strength of each topic
    print('\nStrength of each topic:\n')
    topic_all = np.zeros(num_topics)
    doc_topics = lda.get_document_topics(corpus_tfidf)  # topic distribution of every document
    for i in np.arange(M):  # iterate over all documents
        topic = np.array(doc_topics[i])
        topic_distribute = np.array(topic[:, 1])
        topic_all += topic_distribute
    topic_all /= M  # average
    idx = topic_all.argsort()
    topic_sort = topic_all[idx]
    print(topic_sort)
    plt.figure(facecolor='w')
    plt.stem(topic_sort, linefmt='g-', markerfmt='ro')
    plt.xticks(np.arange(idx.size), idx)
    plt.xlabel("主题", fontsize=13)
    plt.ylabel("主题出现概率", fontsize=13)
    plt.title('主题强度', fontsize=15)
    plt.grid(b=True, axis='both', ls=':', color='#606060')
    plt.show()
def cal_sim_by_lda(tf_idf, dictionary, corpus):
    print('Computing document similarities with the LDA model......')
    lda = models.LdaModel(tf_idf, id2word=dictionary, num_topics=5)
    corpus_lda = lda[corpus]
    index = similarities.MatrixSimilarity(corpus_lda)
    return index
Example #20
    seg = seg_file(contents[0], stopwords)
    # Build the dictionary
    dictionary = corpora.Dictionary(seg)
    V = len(dictionary)
    print(V)

    # Build the document term-frequency matrix
    text = [dictionary.doc2bow(text, allow_update=True) for text in seg]
    # print(text[0])  # sparse representation

    # Compute the TF-IDF matrix
    text_tfidf = models.TfidfModel(text)[text]

    # Build the LDA model and print the first ten topics
    lda = models.LdaModel(text_tfidf,
                          id2word=dictionary,
                          num_topics=200,
                          iterations=100)

    # Show the topics
    for k, v in lda.print_topics(num_topics=10):
        print(k, v)
    # Topics of all documents
    doc_topic = lda.get_document_topics(text_tfidf)
    print(len(doc_topic))
    for dt in doc_topic:
        print(dt)
        d = dict(dt)
        ret = sorted(d.items(), key=lambda x: x[1], reverse=True)[0]
        print(ret[0])
        for k, v in lda.print_topics(num_topics=200):
            if k == ret[0]:
import numpy as np
import os
import xlwt
file = open('C:\\Users\\Administrator\\Desktop\\original_lda\\ddata.txt', 'r')
#courses =[];
#for line in file:
#    courses.append(line.strip().split(' '))  # split each line on spaces into a list of tokens
courses = [line.strip().split() for line in file]
dic = corpora.Dictionary(courses)  # assign an id to each word
corpus = [dic.doc2bow(text) for text in courses]  # turn each document into a sparse bag-of-words vector
corpus_tfidf = models.TfidfModel(corpus)[corpus]
numoftopics = 10
lda = models.LdaModel(corpus_tfidf,
                      id2word=dic,
                      alpha=0.01,
                      eta=0.05,
                      iterations=2000,
                      num_topics=numoftopics,
                      minimum_probability=0.0001)
# file "j" stores the topic-word distributions
f = open(
    'C:\\Users\\Administrator\\Desktop\\original_lda\\ml_period5topic25a0.01j.txt',
    'w')
for topic_id in range(numoftopics):
    f.write('*Topic:' + str(topic_id))
    f.write(str(lda.show_topic(topic_id)))
    f.write('\n')
# file "f" stores the topic distribution of each document
fenbu = open(
    'C:\\Users\\Administrator\\Desktop\\original_lda\\ml_period4qtopic20a0.01f.txt',
    'w')
Example #22
from gensim import corpora, models

d = [['想', '买辆', '汽车'], ['我', '买辆', '汽车', '汽车', '喜欢']]
dictionary = corpora.Dictionary(d)
dictionary.save('./gensim.dict')
corpora.Dictionary.load('./gensim.dict')
print(dictionary)

corpus = [dictionary.doc2bow(text) for text in d]  # bag of words
print(corpus)

lda = models.LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
print(lda.print_topics(2))
print(lda[corpus[0]])
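
# A hedged follow-up sketch: score an unseen document with the toy model trained above,
# reusing tokens that already exist in the dictionary.
new_doc = ['我', '喜欢', '汽车']
new_bow = dictionary.doc2bow(new_doc)
print(lda.get_document_topics(new_bow))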
Example #23
def gensim_preprocess(train,
                      test,
                      model_type='lsi',
                      num_topics=500,
                      use_own_tfidf=False,
                      force_compute=False,
                      report_progress=False,
                      data_dir='data/',
                      **tfidf_params):
    """Use topic modeling to create a dense matrix representation of the input text.

    Notes
    -----
    In many cases I create artifacts (corpora, tfidf representations etc.) for:
        1. Training set
        2. Test set
        3. Their concatenation

    The concatenation is used to create the models, i.e compute embeddings since the labels are not needed in this unsupervised stage.
    The first two are used for the training and evaluation/submission stages accordingly.

    Parameters
    ----------
    :param train: The training set as a pd.Dataframe including the free text column "comment_text".
    :param test: The test set as a pd.Dataframe including the free text column "comment_text".
    :param model_type: Dimensionality reduction model to be used, can be 'lsi', 'lda' for now (more might be added later).
    :param num_topics: Number of columns (features) in the output matrices.
    :use_own_tfidf: If true, our own version of tfidf will be used with **tfidf_params passed to it
    :force_compute: If True we will not even try to load but instead compute everything. Set it if you want to try
                    different parameters.
    :report_progress: If True, progress will be reported when each computationally expensive step is starting.
    :data_dir: Path to the base data directory. Used to call this method from anywhere.
               For example a notebook would provide `data_dir='../data'`
    :**tfidf_params: Key-Value parameters passed to our own `tf_idf` implementation.
                     Only used if `use_own_tfidf` is set to True.

    Returns
    -------
    :return: (train, test) datasets as 2D np.ndarrays of shape (num_comments, `num_topics`)
    """

    # Folder where gensim models and data will be saved to and loaded from.
    gensim_dir = data_dir + 'gensim/'

    def progress(msg):
        """Helper to conditionally print progress messages to std:out."""
        if report_progress:
            print(msg)

    if force_compute:
        progress(
            "This is gonna take a while mate, grab a coffee/beer. Actually you might wanna take a walk as well. Or a nap :D"
        )

    train_text = train["comment_text"].tolist()
    test_text = test["comment_text"].tolist()

    # Tokenize
    def safe_tokenize(comment):
        """Wrap `nltk.word_tokenize` but also handle corrupted input."""
        try:
            return nltk.word_tokenize(comment)
        except TypeError:
            return ["UNKNOWN"]

    progress("Tokenizing text, this will take a while...")
    train_texts = [safe_tokenize(comment) for comment in train_text]
    test_texts = [safe_tokenize(comment) for comment in test_text]

    dictionary = corpora.Dictionary(train_texts + test_texts)

    # Lets create the TF-IDF representation needed for the dimensionality reduction models.
    if use_own_tfidf:
        # Untested yet but I hope it works. I mean, why wouldn't it right?
        progress("Using our own version of TF-IDF, this will take a while...")
        train_tfidf, test_tfidf, whole_tfidf = tf_idf(train, test,
                                                      **tfidf_params)

    else:
        # Use gensims TFIDF model - Tested while under the influence of 10 beers.
        # I code well when drunk though so no worries.

        # Read or create the corpus
        try:
            # Hack to redirect to the exception handler - yes I know its bad but I like it mmmkay?
            if force_compute:
                raise FileNotFoundError
            train_corpus = corpora.MmCorpus(gensim_dir + 'training_corpus.mm')
            test_corpus = corpora.MmCorpus(gensim_dir + 'test_corpus.mm')
            whole_corpus = corpora.MmCorpus(gensim_dir + 'whole_corpus.mm')
        except FileNotFoundError:
            progress("Creating the gensim corpora, this will take a while...")
            train_corpus = [
                dictionary.doc2bow(comment) for comment in train_texts
            ]
            test_corpus = [
                dictionary.doc2bow(comment) for comment in test_texts
            ]
            whole_corpus = [
                dictionary.doc2bow(comment)
                for comment in train_texts + test_texts
            ]
            corpora.MmCorpus.serialize(gensim_dir + 'training_corpus.mm',
                                       train_corpus)
            corpora.MmCorpus.serialize(gensim_dir + 'test_corpus.mm',
                                       test_corpus)
            corpora.MmCorpus.serialize(gensim_dir + 'whole_corpus.mm',
                                       whole_corpus)

        progress(
            "Using gensim's implementation of TF-IDF, this will take a while..."
        )
        tfidf_model = models.TfidfModel(whole_corpus)
        train_tfidf = tfidf_model[train_corpus]
        test_tfidf = tfidf_model[test_corpus]
        whole_tfidf = tfidf_model[train_corpus + test_corpus]

    # Feed the TF-IDF representation to the dimensionality reduction model - this is slow so try to load it first.
    if model_type == 'lsi':
        try:
            # Hack to redirect to the exception handler - yes I know its bad but I like it mmmkay?
            if force_compute:
                raise FileNotFoundError
            model = models.LsiModel.load(gensim_dir + 'lsi.model')
        except FileNotFoundError:
            progress("Creating the LSI model, this will take a while...")
            model = models.LsiModel(whole_tfidf,
                                    id2word=dictionary,
                                    num_topics=num_topics)
            model.save(gensim_dir + 'lsi.model')

    elif model_type == 'lda':
        try:
            # Hack to redirect to the exception handler - yes I know its bad but I like it mmmkay?
            if force_compute:
                raise FileNotFoundError
            model = models.LdaModel.load(gensim_dir + 'lda.model')
        except FileNotFoundError:
            progress("Creating the LDA model, this will take a while...")
            model = models.LdaModel(whole_tfidf,
                                    id2word=dictionary,
                                    num_topics=num_topics)
            model.save(gensim_dir + 'lda.model')

    else:
        raise ValueError(
            "Only 'lda' and 'lsi' models are supported, you passed {}".format(
                model_type))

    train = model[train_tfidf]
    test = model[test_tfidf]

    # Transform into a 2D array format.
    print("Reformatting output to a 2D array, this will take a while...")
    values = np.vectorize(lambda x: x[1])
    return values(np.array(train)), values(np.array(test))
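
# A hedged usage sketch, assuming `train_df` and `test_df` are pandas DataFrames that contain
# the free-text column "comment_text" described in the docstring.
X_train, X_test = gensim_preprocess(train_df,
                                    test_df,
                                    model_type='lda',
                                    num_topics=100,
                                    report_progress=True)
# Per the docstring, both are dense 2D arrays of shape (num_comments, num_topics).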
Example #24
hclda = AgglomerativeClustering( n_clusters=5, affinity = 'euclidean', linkage = 'ward')
y_hclda = hclda.fit_predict(Xlda_cluster)



'''pyLDAvis'''
texts = [[word for word in document.lower().split() if word not in stop_words]
         for document in corpus]
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
texts = [[word for word in text if word not in tokens_once]
          for text in texts]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10,passes=10)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda,corpus,dictionary)
vis


'''PCA plot'''

#kmeansbowPCA
from sklearn.decomposition import PCA
bow_pca = PCA(n_components=2)
BowComponents = bow_pca.fit_transform(X_bow)
BowDf = pd.DataFrame(data = BowComponents, columns = ['bow component 1', 'bow component 2'])

bow_centers = bow_pca.transform(modelkmeansbow.cluster_centers_)
Example #25
 def train_lda(self):
     lda = models.LdaModel(self.corpus_tfidf,
                           id2word=self.dictionary,
                           num_topics=self.num_topics)
     return lda
#
# tfidf = models.TfidfModel(corpus, id2word=dictionary, dictionary=dictionary, normalize=True)
# tfidf.save(settings.TF_IDF_MODEL)
# query = 'oil and gas'
# from src.engine.preprocess import preprocess_body_lda
# query = preprocess_body_lda(query)
# corpus_query = [dictionary.doc2bow(query.split(" "))]
# transformed = tfidf[corpus_query]
#
# logentropy = models.LogEntropyModel(tfidf[corpus], id2word=dictionary, normalize=True)
# logentropy.save(settings.LOGENTROPY_MODEL)

# logentropy_query = logentropy[transformed]
lsi = models.LdaModel(corpus,
                      id2word=dictionary,
                      num_topics=30,
                      passes=3,
                      alpha='auto',
                      chunksize=4000)
lsi.save(settings.LDA_MODEL)

lsi = models.LdaModel.load(settings.LDA_MODEL)
from gensim.similarities import MatrixSimilarity
similarity_matrix = MatrixSimilarity(lsi[corpus], num_features=100)
similarity_matrix.save(settings.SIMILARITY_MATRIX)

# similarities = similarity_matrix.get_similarities(lsi[logentropy_query])

#
#
#
Example #27
    print '\nLSI Model:'
    lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]
    pprint(topic_result)
    print 'LSI Topics:'
    pprint(lsi.print_topics(num_topics=2, num_words=5))
    similarity = similarities.MatrixSimilarity(
        lsi[corpus_tfidf])  # similarities.Similarity()
    print 'Similarity:'
    pprint(list(similarity))

    print '\nLDA Model:'
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=num_topics,
                          id2word=dictionary,
                          alpha='auto',
                          eta='auto',
                          minimum_probability=0.001)
    doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
    print 'Document-Topic:\n'
    pprint(doc_topic)
    for doc_topic in lda.get_document_topics(corpus_tfidf):
        print doc_topic
    for topic_id in range(num_topics):
        print 'Topic', topic_id
        # pprint(lda.get_topic_terms(topicid=topic_id))
        pprint(lda.show_topic(topic_id))
    similarity = similarities.MatrixSimilarity(lda[corpus_tfidf])
    print 'Similarity:'
    pprint(list(similarity))
Example #28
tokens = vanilla_tokenize(texts)
chunks100 = vanilla_chunk(tokens, 100)
prune98 = vanilla_prune(chunks100, 98, 0)
lemmanoun = vanilla_lemmatizer(prune98)

## train LDA model
from gensim import corpora, models
# bag-of-words
dictionary = corpora.Dictionary(lemmanoun)
corpus = [dictionary.doc2bow(chunk) for chunk in lemmanoun]
# for reproducibility
fixed_seed = 1234
np.random.seed(fixed_seed)
# train model on k topics
k = 20
mdl = models.LdaModel(corpus,
                      id2word=dictionary,
                      num_topics=k,
                      chunksize=3125,
                      passes=25,
                      update_every=0,
                      alpha=None,
                      eta=None,
                      decay=0.5,
                      distributed=False)
# print topics
for i in range(0, k):
    print 'Topic', i + 1
    print(mdl.show_topic(i))
    print('-----')
from gensim import corpora, models
np.set_printoptions(threshold=np.nan)

numSemantics = 4
genre = input("Enter Genre: ")
mat = svd.genSVDMatrix(genre)
if (len(mat) < numSemantics or len(mat[0]) < numSemantics):
    print("cant report top semantics")
else:
    svdSem = svd.svdCalc(mat, numSemantics)
    pcaSem = svd.svdCalc(np.matmul(np.transpose(mat), mat), numSemantics)
    allTags = db.getAllTags()
    print("\n\nSVD Decomposed top semantics:")
    for sem in svdSem:
        print("\n\n", utils.rankSem(sem, allTags))
    print("\n\nPCA Decomposed top semantics:")
    for sem in pcaSem:
        print("\n\n", utils.rankSem(sem, allTags))
X = lda1.ldaInputTags(genre)
dictionary = corpora.Dictionary(X)
#print(dictionary)
#print(dictionary.token2id)
corpus = [dictionary.doc2bow(x) for x in X]
#print(corpus)
ldamodel = models.LdaModel(corpus, id2word=dictionary, num_topics=numSemantics)
ldaSems = ldamodel.print_topics(num_topics=-1, num_words=len(dictionary))
print("\n\nLDA Decomposed top semantics:")
for sem in ldaSems:
    print("\n\n", sem)
print("\nThe above ids are Tag IDs")
Example #30
for text in data:
    tokenized_data.append(clean_text(text))

# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)

# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

# Have a look at what the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...

# Build the LDA model
lda_model = models.LdaModel(corpus=corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus,
                            num_topics=NUM_TOPICS,
                            id2word=dictionary)

print("LDA Model:")

for idx in range(NUM_TOPICS):
    # Print the 10 most representative words for each topic
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))

print("=" * 20)