Example #1
def build_and_save_lsi_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    dct = Dictionary(sentences)
    # Corpus as lists of dictionary ids, held in memory
    # Could be turned into a lazy iterable like the others if needed
    print('Calculating the LSI model...')
    bow_corpus = [dct.doc2bow(s) for s in sentences]
    model = LsiModel(bow_corpus, id2word=dct)
    model.print_debug()
    model.save(LSI_MODEL_FILE)
    for t in range(model.get_topics().shape[0]):
        print(t)
        print(model.print_topic(t))
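Once saved, the model can be restored later with gensim's LsiModel.load. A minimal sketch of the reverse step, assuming LSI_MODEL_FILE is the same path used above:

from gensim.models import LsiModel

def load_lsi_model(path=LSI_MODEL_FILE):
    # Restore the model persisted by build_and_save_lsi_model()
    model = LsiModel.load(path)
    # Print the topics again to confirm the round trip
    for t in range(model.get_topics().shape[0]):
        print(t, model.print_topic(t))
    return model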
Example #2
lsi_model = LsiModel(corpus)
# LSA (latent semantic analysis), also known as LSI (latent semantic indexing),
# is a newer indexing and retrieval method. Like the traditional vector space model, it represents
# terms and documents as vectors and judges the relationship between terms and documents
# through relationships between those vectors (e.g. the angle between them); the difference is that
# LSA maps terms and documents into a latent semantic space.
# Synonymy and polysemy are what degrade the retrieval precision of the traditional vector space model.
# The goal of LSA is to uncover the true meaning of terms in documents and queries, i.e. the latent
# semantics, and thereby address the problems described above.

for topic_id, topic in enumerate(lsi_model.show_topics(), start=1):
    print("TOPIC (LSI) " + str(topic_id) + " : ", topic)

print('#' * 50)
print(lsi_model.num_topics)
for i in range(lsi_model.num_topics):
    topic = lsi_model.print_topic(i)
    if topic:
        print(topic)

corpus_tfidf = tfidf_model[corpus]
corpus_lsi = lsi_model[corpus]

lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
corpus_lsi_2 = lsi_model_2[corpus_tfidf]  # apply the model to the tf-idf corpus it was trained on
print('Finished building the models')


print('*' * 10, lsi_model_2.print_topics(5))

for topic_id, topic in enumerate(lsi_model_2.show_topics(), start=1):
    print("TOPIC (LSI2) ", str(topic_id), " : ", topic)
Example #3
if False:
    lsamodel = gensim.models.LsiModel.load(temp_file)
    keywords_score=[]
    if True:
        with open('lsa_topics_optimized.txt', 'w') as f:
            # f.write('Most important topics:\n')
            for index, row in df_test_jokes.iterrows():
                unseen_document=row['JokeText']
                #Data preprocessing step for the unseen document
                bow_vector = dictionary.doc2bow(preprocess(unseen_document))

                f.write(str(row['JokeId'])+"\n")
                # sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
                topic_scores=[0, 0, 0, 0, 0, 0, 0]
                for topic_index, score in lsamodel[bow_vector]:
                    f.write("Score: {}\t Topic {}: {}".format(score, topic_index, lsamodel.print_topic(topic_index, 10)) + "\n")
                    topic_scores[topic_index] = score
                keywords_score.append(topic_scores)

                f.write("\n")

    print(len(keywords_score))
    print(len(keywords_score[0]))

    topics = lsamodel.print_topics(num_words=10)
    for topic in topics:
        print(topic)

    # Euclidean distance
    def distance(lista, listb):
        return sum((b - a) ** 2 for a, b in zip(lista, listb)) ** 0.5
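The Euclidean distance helper can then be applied to the per-joke topic-score vectors collected in keywords_score; a small hedged usage sketch (the nearest-neighbour pairing is illustrative, not from the original source):

    # Example usage: compare the first two jokes' topic-score vectors
    if len(keywords_score) >= 2:
        print('d(0, 1) =', distance(keywords_score[0], keywords_score[1]))
        # Nearest neighbour of joke 0 among the remaining jokes
        nearest = min(range(1, len(keywords_score)),
                      key=lambda j: distance(keywords_score[0], keywords_score[j]))
        print('closest to joke 0:', nearest)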
Example #4
    text = find_text(nlp)
    sentences = sent_tokenize(text)
    text = text.lower()
    print(text)

    # tokenize the text into a list of token lists, one per sentence
    tokens = tokenize(text)

    # create dictionary of tokens
    dictionary = corpora.Dictionary(tokens)
    sent_term_matrix = [dictionary.doc2bow(sentence) for sentence in tokens]
    # Gensim's LsiModel performs truncated SVD, keeping num_topics singular values in S
    # (if num_topics is not specified, gensim falls back to its default of 200 topics)
    lsamodel = LsiModel(sent_term_matrix, id2word = dictionary)
    print(lsamodel.print_topic(0)) # good check; what terms does the model associate with the most important topic?
    print(lsamodel.print_topic(1))

    # Grab matrix V from SVD, A = USV^t
    V = corpus2dense(lsamodel[sent_term_matrix], len(lsamodel.projection.s)).T / lsamodel.projection.s
    
    # Output the sentences with the longest vector lengths, no repeats
    lengths = find_length(lsamodel.projection.s, V)
    indices = find_indices(lengths, 3)  # number of sentences printed = 3
    indices.sort() # a summary makes more sense in-order

    hypothesis = "" # the collection of chosen sentences
    for i in range(0, len(indices)):
        hypothesis += sentences[indices[i]]

    print(hypothesis)
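The helpers find_length and find_indices are not included in this snippet; the following is only a plausible reconstruction based on the comments above (sentence-vector lengths re-weighted by the singular values, then the indices of the top-k longest), not the original author's code:

    import numpy as np

    def find_length(s, V):
        # Length of each sentence vector in the latent space, re-weighted by the singular values
        return np.sqrt(((V * s) ** 2).sum(axis=1))

    def find_indices(lengths, k):
        # Indices of the k longest sentence vectors
        return list(np.argsort(lengths)[::-1][:k])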
Example #5
news_train = fetch_20newsgroups(subset='train')

# Tokenization and lemmatization
wnl = WordNetLemmatizer()
news_train_lemma = [
    tokenize_lemmatize(article, wnl.lemmatize) for article in news_train.data
]

# Build the gensim corpora structures (dictionary and bag-of-words corpus)
dict_train = Dictionary(news_train_lemma)
mmCorpus_train = [dict_train.doc2bow(article) for article in news_train_lemma]

# Latent Semantic Analysis
lsi_train = LsiModel(corpus=mmCorpus_train, num_topics=40, id2word=dict_train)

for i in range(40):
    print('topic' + str(i) + ' :')
    print(lsi_train.print_topic(i))

# Latent Dirichlet Allocation

lda_train = LdaMulticore(corpus=mmCorpus_train,
                         num_topics=40,
                         id2word=dict_train,
                         workers=5)

for i in range(40):
    print('topic' + str(i) + ' :')
    print(lda_train.print_topic(i))
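The tokenize_lemmatize helper referenced above is not part of this snippet; a minimal sketch of one way it could be written, using gensim's simple_preprocess for tokenization (an assumption, not necessarily the original implementation):

from gensim.utils import simple_preprocess

def tokenize_lemmatize(text, lemmatize):
    # Lowercase, strip punctuation, drop very short tokens, then lemmatize each remaining token
    return [lemmatize(token) for token in simple_preprocess(text, min_len=3)]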