from gensim.corpora import Dictionary
from gensim.models import LsiModel


def build_and_save_lsi_model():
    print('Connecting to the database...')
    sentences = SentencesIterator(tokens_generator)
    dct = Dictionary(sentences)
    # Corpus as lists of dictionary ids, held in memory.
    # Can be turned into an iterable, as done with the other models, if needed.
    print('Calculating the LSI model...')
    bow_corpus = [dct.doc2bow(s) for s in sentences]
    model = LsiModel(bow_corpus, id2word=dct)
    model.print_debug()
    model.save(LSI_MODEL_FILE)
    for t in range(model.get_topics().shape[0]):
        print(t)
        print(model.print_topic(t))
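# The function above assumes SentencesIterator, tokens_generator and
# LSI_MODEL_FILE are defined elsewhere. A minimal sketch of what the iterator
# wrapper might look like (hypothetical, for illustration): it has to be
# re-iterable, because the sentences are streamed twice, once to build the
# Dictionary and once to build the bag-of-words corpus.
class SentencesIterator:
    def __init__(self, generator_factory):
        # Store the generator *factory*, not a generator instance,
        # so every for-loop gets a fresh pass over the data.
        self.generator_factory = generator_factory

    def __iter__(self):
        return self.generator_factory()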
lsi_model = LsiModel(corpus)
# LSA (latent semantic analysis), also known as LSI (latent semantic indexing),
# is an indexing and retrieval method. Like the traditional vector space model,
# it represents terms and documents as vectors and judges the relations between
# them through vector relations (e.g. angles); the difference is that LSA maps
# terms and documents into a latent semantic space. Synonymy and polysemy are
# what degrade retrieval precision in the traditional vector space model, and
# the goal of LSA is to uncover the real meaning of terms in documents and
# queries, i.e. the latent semantics, which addresses that problem.
topic_id = 0
for topic in lsi_model.show_topics():
    topic_id += 1
    print("TOPIC (LSI) " + str(topic_id) + " : ", topic)
print('#' * 50)
print(lsi_model.num_topics)
for i in range(lsi_model.num_topics):
    if lsi_model.print_topic(i):
        print(lsi_model.print_topic(i))

corpus_tfidf = tfidf_model[corpus]
corpus_lsi = lsi_model[corpus]
lsi_model_2 = LsiModel(corpus_tfidf, id2word=corpus.dictionary, num_topics=300)
corpus_lsi_2 = lsi_model_2[corpus_tfidf]  # apply the model to the TF-IDF corpus it was trained on
print('Finished creating the model')
print('*' * 10, lsi_model_2.print_topics(5))
topic_id = 0
for topic in lsi_model_2.show_topics():
    print("TOPIC (LSI2) ", str(topic_id), " : ", topic)
    topic_id += 1
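# tfidf_model and corpus are assumed to come from earlier in the script; the
# .dictionary attribute used above suggests corpus is a gensim corpus object
# such as TextCorpus. A minimal sketch under that assumption (the file name
# here is a placeholder, not the original data):
from gensim.corpora.textcorpus import TextCorpus
from gensim.models import TfidfModel

corpus = TextCorpus('documents.txt')  # streams tokenized documents as bag-of-words vectors
tfidf_model = TfidfModel(corpus)      # reweights raw term counts by TF-IDF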
import gensim

if False:  # flip to reload a previously saved model instead of the in-memory one
    lsamodel = gensim.models.LsiModel.load(temp_file)

keywords_score = []
if True:
    with open('lsa_topics_optimized.txt', 'w') as f:
        # f.write('Most important topics:\n')
        for index, row in df_test_jokes.iterrows():
            unseen_document = row['JokeText']
            # Data preprocessing step for the unseen document
            bow_vector = dictionary.doc2bow(preprocess(unseen_document))
            f.write(str(row['JokeId']) + "\n")
            # sorted(lda_model[bow_vector], key=lambda tup: -1 * tup[1]):
            topic_scores = [0] * 7  # one slot per topic; the model has 7 topics
            for topic_idx, score in lsamodel[bow_vector]:
                f.write("Score: {}\t Topic {}: {}".format(
                    score, topic_idx, lsamodel.print_topic(topic_idx, 10)) + "\n")
                topic_scores[topic_idx] = score
            keywords_score.append(topic_scores)
            f.write("\n")

print(len(keywords_score))
print(len(keywords_score[0]))

topics = lsamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)


# Euclidean distance
def distance(lista, listb):
    return sum((b - a) ** 2 for a, b in zip(lista, listb)) ** .5
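# A usage sketch (not part of the original script): with keywords_score holding
# one 7-dimensional LSI topic vector per joke, the Euclidean distance above can
# rank jokes by topical similarity; query_index is a hypothetical example value.
query_index = 0
nearest = min(
    (j for j in range(len(keywords_score)) if j != query_index),
    key=lambda j: distance(keywords_score[query_index], keywords_score[j]),
)
print('Joke most similar to joke {}: {}'.format(query_index, nearest))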
from nltk.tokenize import sent_tokenize
from gensim import corpora
from gensim.matutils import corpus2dense
from gensim.models import LsiModel

text = find_text(nlp)
sentences = sent_tokenize(text)  # split before lowercasing, so the summary keeps the original casing
text = text.lower()
print(text)

# Tokenize the text into sentences of tokens
tokens = tokenize(text)

# Create a dictionary of tokens and a sentence-term matrix in bag-of-words form
dictionary = corpora.Dictionary(tokens)
sent_term_matrix = [dictionary.doc2bow(sentence) for sentence in tokens]

# Gensim's LsiModel performs a truncated SVD such that the dimension of S equals
# num_topics; if num_topics is not specified, it defaults to 200.
lsamodel = LsiModel(sent_term_matrix, id2word=dictionary)

# A good sanity check: which terms does the model associate with the most important topics?
print(lsamodel.print_topic(0))
print(lsamodel.print_topic(1))

# Recover matrix V from the SVD, A = U S V^T
V = corpus2dense(lsamodel[sent_term_matrix], len(lsamodel.projection.s)).T / lsamodel.projection.s

# Output the sentences with the longest vector lengths, without repeats
lengths = find_length(lsamodel.projection.s, V)
indices = find_indices(lengths, 3)  # number of sentences in the summary
indices.sort()  # a summary makes more sense in order
hypothesis = ""  # the collection of chosen sentences
for i in range(len(indices)):
    hypothesis += sentences[indices[i]]
print(hypothesis)
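# find_length and find_indices are assumed helpers. A sketch of plausible
# implementations (hypothetical, inferred from how they are used above,
# following the Steinberger-Jezek LSA summarization idea of scoring each
# sentence by the length of its singular-value-weighted topic vector):
import numpy as np

def find_length(s, V):
    # Length of each sentence vector in the scaled topic space (rows of V * S).
    return np.linalg.norm(V * s, axis=1)

def find_indices(lengths, k):
    # Indices of the k sentences with the longest vectors.
    return list(np.argsort(lengths)[::-1][:k])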
from sklearn.datasets import fetch_20newsgroups
from nltk.stem import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LsiModel, LdaMulticore

news_train = fetch_20newsgroups(subset='train')

# Tokenization and lemmatization
wnl = WordNetLemmatizer()
news_train_lemma = [
    tokenize_lemmatize(article, wnl.lemmatize) for article in news_train.data
]

# Build gensim corpus structures
dict_train = Dictionary(news_train_lemma)
mmCorpus_train = [dict_train.doc2bow(article) for article in news_train_lemma]

# Latent Semantic Analysis
lsi_train = LsiModel(corpus=mmCorpus_train, num_topics=40, id2word=dict_train)
for i in range(40):
    print('topic' + str(i) + ' :')
    print(lsi_train.print_topic(i))

# Latent Dirichlet Allocation
lda_train = LdaMulticore(corpus=mmCorpus_train, num_topics=40, id2word=dict_train, workers=5)
for i in range(40):
    print('topic' + str(i) + ' :')
    print(lda_train.print_topic(i))
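# tokenize_lemmatize is assumed to be defined elsewhere. A minimal sketch of
# what it might do (hypothetical, using NLTK's word tokenizer):
from nltk.tokenize import word_tokenize

def tokenize_lemmatize(article, lemmatize):
    # Lowercase, tokenize, keep alphabetic tokens, and lemmatize each one.
    return [lemmatize(tok) for tok in word_tokenize(article.lower()) if tok.isalpha()]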