def gensim_similarity(data_c):
    """
    Compute similarities with gensim:
      count vectors: COUNT, LDA, LSI
      tf-idf vectors: TFIDF, LDA, LSI
    """
    # Merge both columns to obtain the bag-of-words corpus
    data_c['s1'] = data_c['s1'].apply(lambda text: list(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: list(text))
    data_c_all = data_c['s1'].append(data_c['s2'], ignore_index=True).to_frame(name='s')

    # Build the dictionary
    print("starting create dic....")
    dic = corpora.Dictionary(data_c['s1'].values)
    dic.add_documents(data_c['s2'].values)
    print("number of documents:", dic.num_docs)

    print("starting create count bow...")
    data_c['s1'] = data_c['s1'].apply(lambda text: dic.doc2bow(text))
    data_c['s2'] = data_c['s2'].apply(lambda text: dic.doc2bow(text))
    data_c_all['s'] = data_c_all['s'].apply(lambda text: dic.doc2bow(text))
    # cps1 = [dic.doc2bow(text) for text in list(data_c['s1'].values)]
    # cps2 = [dic.doc2bow(text) for text in list(data_c['s2'].values)]
    cps1 = list(data_c['s1'])
    cps2 = list(data_c['s2'])
    cps = list(data_c_all['s'])

    # Count-vector similarity between s1 and s2
    print("starting count similarity....")
    sm = similarities.SparseMatrixSimilarity(corpus=cps1, num_features=10000)
    count_sm = np.diag(sm[cps2])

    # LDA/LSI similarity on count vectors
    count_lda_sm = lda_similarity(cps, cps1, cps2, dic)
    # count_lsi_sm = lsi_similarity(cps, cps1, cps2, dic)

    # TF-IDF similarity between s1 and s2
    print("starting tfidf similarity....")
    tfidf = TfidfModel(corpus=cps, id2word=dic)
    cps1_tfidf = tfidf[cps1]
    cps2_tfidf = tfidf[cps2]
    cps_tfidf = tfidf[cps]

    sm = similarities.SparseMatrixSimilarity(corpus=cps1_tfidf, num_features=10000)
    tfidf_sm = np.diag(sm[cps2_tfidf])

    # LDA/LSI similarity on TF-IDF vectors
    tfidf_lda_sm = lda_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)
    tfidf_lsi_sm = lsi_similarity(cps_tfidf, cps1_tfidf, cps2_tfidf, dic)

    return count_sm, count_lda_sm, tfidf_sm, tfidf_lda_sm, tfidf_lsi_sm

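# The lda_similarity/lsi_similarity helpers called above are not defined in this
# snippet. A minimal sketch of one plausible implementation follows: train a topic
# model on the merged corpus, project both sides, and take the diagonal of the
# pairwise similarity matrix. The signatures match the calls above; num_topics=100
# and the `from gensim import models, similarities` / `import numpy as np` imports
# are assumptions, not taken from the original code.
def lda_similarity(cps, cps1, cps2, dic, num_topics=100):
    lda = models.LdaModel(corpus=cps, id2word=dic, num_topics=num_topics)
    index = similarities.MatrixSimilarity(lda[cps1], num_features=num_topics)
    return np.diag(index[lda[cps2]])


def lsi_similarity(cps, cps1, cps2, dic, num_topics=100):
    lsi = models.LsiModel(corpus=cps, id2word=dic, num_topics=num_topics)
    index = similarities.MatrixSimilarity(lsi[cps1], num_features=num_topics)
    return np.diag(index[lsi[cps2]])
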
def __init__(self, doc_set, hash_length=32, words_per_token=1):
    if hash_length > self.MAX_HASH_LENGTH:
        raise Exception(
            "The specified hash length is too long. It must be 128 bits or less"
        )
    self.hash_length = hash_length
    self.documents = doc_set.documents
    docs = [title + " " + body for title, body in self.documents.items()]
    self.doc_list = [title for title, body in self.documents.items()]
    self.inverted_doc_index = {}
    for index, title in enumerate(self.doc_list):
        self.inverted_doc_index[title] = index
    texts = [[
        word for word in document.lower().split()
        if word not in stopwords.words('english')
    ] for document in docs]
    self.dictionary = corpora.Dictionary(texts)
    self.corpus = [self.dictionary.doc2bow(text) for text in texts]
    self.tfidf = models.TfidfModel(self.corpus)
    self.index = similarities.SparseMatrixSimilarity(
        self.tfidf[self.corpus], num_features=len(self.dictionary))
    self.simhash_dict = {}
    for ind, v in enumerate(self.corpus):
        self.simhash_dict[self.doc_list[ind]] = self.create_hash(
            self.tfidf[v])

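# create_hash is called above but not shown in this excerpt. A minimal
# SimHash-style sketch, assuming it folds TF-IDF-weighted term hashes into a
# hash_length-bit fingerprint; the md5-based term hashing and this exact
# weighting scheme are assumptions, not taken from the original class.
def create_hash(self, tfidf_vector):
    import hashlib

    totals = [0.0] * self.hash_length
    for term_id, weight in tfidf_vector:
        term = self.dictionary[term_id]
        digest = int(hashlib.md5(term.encode('utf-8')).hexdigest(), 16)
        for bit in range(self.hash_length):
            # add the weight where the term hash has a 1 bit, subtract where it has a 0 bit
            if digest >> bit & 1:
                totals[bit] += weight
            else:
                totals[bit] -= weight
    fingerprint = 0
    for bit, total in enumerate(totals):
        if total > 0:
            fingerprint |= 1 << bit
    return fingerprint
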
def index_events(self, event_id_list=None):
    """ Index the events, optionally restricted to the given event ids """
    # Event selection by id (if provided)
    if event_id_list:
        event_corpus = [
            self.corpus_of_bows[self.dict_event_id_index[event_id]]
            for event_id in event_id_list
        ]
    else:
        event_corpus = self.corpus_of_bows
    # Apply the TFIDF transformation (if necessary)
    if self.tfidf_model:
        transformed_corpus = self.model[self.tfidf_model[event_corpus]]
    else:
        transformed_corpus = self.model[event_corpus]
    # Create the index of the transformed_corpus to submit queries.
    # We use SparseMatrixSimilarity, which stores the index in a sparse data
    # structure instead of a dense one; that's why we have to provide num_features.
    self.corpus_query_index = similarities.SparseMatrixSimilarity(
        transformed_corpus, num_features=len(self.dictionary))

def create_index(self, docs_with_urls):
    logger.info("Creating index out of {} documents".format(
        len(docs_with_urls)))
    urls, doc_bows = zip(*self.infer_all(docs_with_urls))
    self.urls = urls
    self.index = similarities.SparseMatrixSimilarity(
        doc_bows, num_features=self.topics)

def find_similar_research():
    research = select('url, plaintext as "value" from maincol where url != ?;',
                      [reference_person])
    research.extend(
        select('url, plaintext as "value" from maincol where url = ?;',
               [reference_person]))
    documents = [row['value'].strip() for row in research]
    stoplist = set('for a of the and to in'.split())
    texts = [[
        word for word in document.lower().split() if word not in stoplist
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    vec = corpus[-1]  # the person being compared to
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary))
    sims = index[tfidf[vec]]
    print(list(enumerate(sims)))
    save(['url'],
         [{"url": row[0], "similarity": row[1][1]}
          for row in zip([row['url'] for row in research], list(enumerate(sims)))],
         'similarity')

def build_model():
    sentences = get_sentences()
    words_split = [sentence.split(' ') for sentence in sentences]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in words_split:
        for token in text:
            frequency[token] += 1
    words_split = [[token for token in text if frequency[token] > 1]
                   for text in words_split]

    dictionary = corpora.Dictionary(words_split)
    corpus = [dictionary.doc2bow(text) for text in words_split]
    model = models.TfidfModel(corpus)
    feature_num = len(dictionary.token2id.keys())
    index = similarities.SparseMatrixSimilarity(model[corpus],
                                                num_features=feature_num)
    return dictionary, index, model

def creat_main(str1, str2):
    # stop-word list
    stop_words = ['。', ',', '!', '?', '……']
    # tokenize the reference documents
    str1_list = []
    for line in str1:
        str1_words = ' '.join(jieba.cut(line)).split(' ')
        doc_txt = []
        for word in str1_words:
            if word not in stop_words:
                doc_txt.append(word)
        str1_list.append(doc_txt)
    # tokenize the test document
    str2_words = ' '.join(jieba.cut(str2)).split(' ')
    str2_list = []
    for word in str2_words:
        if word not in stop_words:
            str2_list.append(word)
    # build the dictionary (bag of words) from the reference documents
    dictionary = corpora.Dictionary(str1_list)
    # every word in the bag gets a numeric id
    dictionary.keys()
    # build the corpus with doc2bow
    corpus = [dictionary.doc2bow(word) for word in str1_list]
    # convert the test document as well
    test_words_vec = dictionary.doc2bow(str2_list)
    # fit a tf-idf model on the corpus
    tfidf = models.TfidfModel(corpus)
    # measure the test document's similarity against every target document
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))
    sim = index[tfidf[test_words_vec]]
    print('Similarity: %.5f' % max(sim))

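# A hypothetical call to creat_main, assuming str1 is an iterable of reference
# documents and str2 is the single document to compare; the sample sentences
# below are made up purely for illustration.
if __name__ == '__main__':
    reference_docs = ['今天天气很好,适合出门散步。', '我喜欢看书和写代码。']
    test_doc = '今天天气不错,我想出去走走。'
    creat_main(reference_docs, test_doc)
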
def get_affinity_matrix(allTextList, gensimDict=False):
    print('clustering: get_affinity_matrix')
    start = datetime.now()
    # build the dictionary
    if gensimDict is False:
        # remove stop words
        stoplist = set('for a of the and to in'.split())
        texts = [[
            word for word in text.lower().split() if word not in stoplist
        ] for text in allTextList]
        # remove low-frequency words
        k = 2
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > k]
                 for text in texts]
        # dictionary.doc2bow() converts each tokenized sentence into a list of (id, freq) pairs
        dictionary = corpora.Dictionary(texts)
    else:
        dictionary = gensimDict
        texts = [text.lower().split() for text in allTextList]
    corpus = [dictionary.doc2bow(text) for text in texts]
    featureNum = len(dictionary.token2id.keys())
    index = similarities.SparseMatrixSimilarity(corpus, num_features=featureNum)
    sims = [index[i] for i in corpus]
    print('clustering: get_affinity_matrix end with',
          (datetime.now() - start).seconds, 's')
    return np.array(sims)

def build_gensim_model(corpus):
    text_corpus = [el['text'] for el in corpus]

    # Create a set of frequent words
    stoplist = set('for a of the and to in'.split(' '))
    # Lowercase each document, split it by white space and filter out stopwords
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in text_corpus]

    # Count word frequencies
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    # Only keep words that appear more than once
    processed_corpus = [[token for token in text if frequency[token] > 1]
                        for text in texts]
    # pprint.pprint(processed_corpus)

    from gensim import corpora
    dictionary = corpora.Dictionary(processed_corpus)
    bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]

    from gensim import models
    # train the model
    tfidf = models.TfidfModel(bow_corpus)

    # transform the "dog food" query string
    words = "dog food".lower().split()
    print(tfidf[dictionary.doc2bow(words)])

    from gensim import similarities
    index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus],
                                                num_features=12)

    query_document = 'dog food'.split()
    query_bow = dictionary.doc2bow(query_document)
    sims = index[tfidf[query_bow]]
    print(list(enumerate(sims)))
    return

def main():
    posts = grabPosts()
    posts_combined = {
        x['post_id']: '{0} {1}'.format(x['title'], x['text'])
        for x in posts
    }
    keys = list(posts_combined.keys())
    documents = list(posts_combined.values())

    # sanitize and build our corpus
    sentences = [sanitize_sentence(sentence) for sentence in documents]
    dictionary = corpora.Dictionary(sentences)
    corpus = [dictionary.doc2bow(sentence) for sentence in sentences]

    # the number of features is the dictionary size; train the model
    num_features = len(dictionary)
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=num_features)

    # now that the corpus is settled, go through each post and compute similarity
    for key, value in posts_combined.items():
        vec = dictionary.doc2bow(sanitize_sentence(value))
        sims = index[tfidf[vec]]
        p = list(enumerate(sims))
        top_ten = sorted(p, key=lambda x: x[1], reverse=True)[1:11]
        dic = sorted([(keys[x[0]], x[1]) for x in top_ten],
                     key=lambda x: x[1],
                     reverse=True)
        EVALUATED_POSTS[key] = dic

def test_miislita_high_level(self):
    # construct corpus from file
    corpusname = datapath('miIslita.cor')
    miislita = CorpusMiislita(corpusname)

    # initialize tfidf transformation and similarity index
    tfidf = models.TfidfModel(miislita, miislita.dictionary, normalize=False)
    index = similarities.SparseMatrixSimilarity(
        tfidf[miislita], num_features=len(miislita.dictionary))

    # compare to query
    query = 'latent semantic indexing'
    vec_bow = miislita.dictionary.doc2bow(query.lower().split())
    vec_tfidf = tfidf[vec_bow]

    # perform a similarity query against the corpus
    sims_tfidf = index[vec_tfidf]

    # for the expected results see the article
    expected = [0.0, 0.2560, 0.7022, 0.1524, 0.3334]
    for i, value in enumerate(expected):
        self.assertAlmostEqual(sims_tfidf[i], value, 2)

def infer(self):
    courses = [
        list(set(stop_words(item).remove()))
        for item in [w.split() for w in self.Courses]
    ]
    classes = list(set(stop_words(self.File_class).remove()))
    dictionary = corpora.Dictionary(courses)
    feature_cnt = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in courses]
    tfidf = models.TfidfModel(corpus)
    kw_vector = dictionary.doc2bow(classes)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=feature_cnt)
    sim = index[tfidf[kw_vector]]
    course_rec = dict(zip(sim, self.Names))
    course_sort = sorted(course_rec.items(), reverse=True)
    lda_model = models.LdaMulticore(tfidf[corpus],
                                    num_topics=10,
                                    id2word=dictionary,
                                    passes=2,
                                    workers=2)
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} \nWords: {}'.format(idx, topic))
    for index, score in sorted(lda_model[tfidf[kw_vector]],
                               key=lambda tup: -1 * tup[1]):
        print("\nScore: {}\t \nTopic: {}".format(
            score, lda_model.print_topic(index, 10)))
    return course_sort

def get_similarity_rate(all_doc: List[str], doc_test: str) -> List[float]:
    # punctuation (half- and full-width) to strip before tokenizing
    bad_word = ('[,."#$%&\'()*+,-/:;<=>@[\\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』'
                '【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?。。'
                '"#$%&\'()*+,-/:;<=>@[\\]^_`{|}~⦅⦆「」、\u3000、〃〈〉《》「」『』'
                '【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏﹑﹔·!?]+')
    doc_test_cleaned = re.sub(bad_word, "", doc_test)
    all_doc_cleaned = [re.sub(bad_word, "", doc) for doc in all_doc]
    if not doc_test_cleaned or not all_doc_cleaned:
        return [0, 0]
    all_doc_list = [[word for word in jieba.cut(doc)] for doc in all_doc_cleaned]
    doc_test_list = [word for word in jieba.cut(doc_test_cleaned)]
    dictionary = corpora.Dictionary([doc_test_list])
    corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]
    doc_test_vec = dictionary.doc2bow(doc_test_list)
    model = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(model[corpus],
                                                num_features=len(dictionary.keys()))
    sim = index[model[doc_test_vec]]
    max_index = np.argmax(sim)
    max_value = sim[max_index]
    return [max_index, max_value]

def judgement(self, key_text, compared_text):
    '''
    Similarity check.
    :return: similarity score
    '''
    texts = [compared_text, '']
    key_text = key_text
    texts = [self.cut(text) for text in texts]
    # print(texts)
    dictionary = corpora.Dictionary(texts)
    feature_cnt = len(dictionary.token2id.keys())
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    new_vec = dictionary.doc2bow(self.cut(key_text))
    # similarity computation
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=feature_cnt)
    # print('\nTF-IDF sparse vectors of the corpus:')
    # for i in tfidf[corpus]:
    #     print(i)
    # print('\nTF-IDF sparse vector of the keyword:')
    # print(tfidf[new_vec])
    sim = index[tfidf[new_vec]]
    # self.log("similarity: %s" % sim[0])
    return sim[0]

def test():
    # build the matching corpus: 398872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)

    # test data: 1000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)

    # build the vocabulary
    dictionary = corpora.Dictionary(sku_names_jieba)
    corpus = [dictionary.doc2bow(sku_name) for sku_name in sku_names_jieba]

    # load the pre-trained model
    print("Model is loading...")
    tfidf = models.TfidfModel.load("models/tfidf_v2")
    print("Model has loaded !")

    # similarity index
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))

    for i, item in enumerate(keywords_jieba):
        item_vec = dictionary.doc2bow(item)
        sims = index[tfidf[item_vec]]
        idx = list(sims).index(max(list(sims)))
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])
        with open("result/tfidf_v2_results.txt", 'a', encoding='utf8') as wf:
            wf.write(str(i) + "||" + keywords_texts[i] + "||" +
                     sku_names_texts[idx] + "\n")

def sim(self, all_docs_words, test_doc_words):
    # build the bag-of-words dictionary
    dictionary = corpora.Dictionary(all_docs_words)
    # every word in the bag is assigned a numeric id
    dictionary.keys()
    # mapping between ids and words
    dictionary.token2id
    # build the corpus with doc2bow
    corpus = [dictionary.doc2bow(doc_words) for doc_words in all_docs_words]
    # convert the tokenized test document into a sparse (id, count) vector
    doc_test_vec = dictionary.doc2bow(test_doc_words)
    # fit a TF-IDF model on the corpus
    tfidf = models.TfidfModel(corpus)
    # TF-IDF weight of every word in the test document
    print(tfidf[doc_test_vec])
    # measure the similarity of the test document against every target document
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))
    sim = index[tfidf[doc_test_vec]]
    # sort by similarity
    result = sorted(enumerate(sim), key=lambda item: -item[1])
    return result

def setup_models(self):
    start = time.time()
    print("Preparing corpus dictionary and vector...")
    # corpus_documents = [str(src).split() for src in self.eng_corpus]
    corpus_documents = [
        simple_preprocess(str(src)) for src in self.eng_corpus
    ]
    self.dictionary = corpora.Dictionary(corpus_documents)
    corpus_vector = [
        self.dictionary.doc2bow(tokens) for tokens in corpus_documents
    ]
    print("\tCorpus dictionary and vector completed, time cost: {}".format(
        round(time.time() - start, 2)))

    start = time.time()
    feature_cnt = len(self.dictionary.token2id)
    self.tfidf = models.TfidfModel(corpus_vector, smartirs='nnc')
    self.similarities = similarities.SparseMatrixSimilarity(
        self.tfidf[corpus_vector], num_features=feature_cnt)
    print("\tTFIDF and similarity matrix completed, time cost: {}".format(
        round(time.time() - start, 2)))

    print("\nSerializing corpus dictionary, tfidf and similarities... ")
    self.dictionary.save(str(self.serialize_dict))
    self.tfidf.save(str(self.serialize_tfidf))
    self.similarities.save(str(self.serialize_similarities))
    # corpora.MmCorpus.serialize(self.serialize_vector, self.corpus_vector)
    print("Serialization done.")

def check(content, db):
    contentList = getContent(db)
    # print(type(contentList[0][1]))
    all_list = []
    for row in contentList:
        all_list.append(row[1])
    cut_list = []
    for doc in all_list:
        doc_list = [word for word in jieba.cut(doc)]
        cut_list.append(doc_list)
    # print(cut_list)
    cut_content = [word for word in jieba.cut(content)]
    dictionary = corpora.Dictionary(cut_list)
    dictionary.keys()
    # dictionary.token2id
    corpus = [dictionary.doc2bow(doc) for doc in cut_list]
    cut_content_vrc = dictionary.doc2bow(cut_content)
    tfidf = models.TfidfModel(corpus)
    # tfidf[cut_content_vrc]
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))
    sim = index[tfidf[cut_content_vrc]]
    # result = list(sim)
    result = list(map(float, sim))
    return result

def vsm(data):
    documents = []
    for item in data:
        documents.append(item[0])
        documents.append(item[1])
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidf_model = models.TfidfModel(corpus)
    vectors = [tfidf_model[bow] for bow in corpus]
    sim = similarities.SparseMatrixSimilarity(tfidf_model[corpus],
                                              num_features=len(dictionary.keys()))
    mrr = 0
    hit = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    for i in range(len(data)):
        query_id = i * 2
        query_tfidf = vectors[query_id]
        sim_result = sim[query_tfidf]
        rank = 0
        for id, item in enumerate(sim_result):
            if id % 2 == 1 and item >= sim_result[query_id + 1]:
                rank += 1
        mrr += 1.0 / rank
        for k in range(len(hit)):
            if rank <= k + 1:
                hit[k] += 1
        print('#%d:' % int(data[i][2]),
              'rank=%d' % rank,
              'MRR=%.4f' % (mrr / (i + 1)),
              ', '.join([('Hit@%d=%.4f' % (k + 1, (h / (i + 1))))
                         for k, h in enumerate(hit)]))

def _freq_train(self):
    print('\t\t 1. Frequency training...', end='')
    self._freq_dict = corpora.Dictionary(self.words)
    bow_list = [self._freq_dict.doc2bow(text) for text in self.words]
    self._freq_index = similarities.SparseMatrixSimilarity(
        bow_list, num_features=len(self._freq_dict))
    print(' done.')

def genModel(self):
    if len(self.conds.keys()) == 0:
        return
    cnt = 0
    for key, vals in self.conds.items():
        for val in vals:
            self.conds_list.append(val)
            self.sent2cond[cnt] = key
            cnt += 1
    self.conds_list.append('不知道你说的啥')
    self.sent2cond[cnt] = '不知道你说的啥'
    choice_cut = []
    for i in self.conds_list:
        data1 = ''
        this_data = jieba.cut(i)
        for item in this_data:
            data1 += item + ' '
        choice_cut.append(data1)
    docs = choice_cut
    tall = [[w1 for w1 in doc.split()] for doc in docs]
    self.dictionary = corpora.Dictionary(tall)
    corpus = [self.dictionary.doc2bow(text) for text in tall]
    self.tfidf = models.TfidfModel(corpus)
    print(self.tfidf)
    num = len(self.dictionary.token2id.keys())
    self.index = similarities.SparseMatrixSimilarity(self.tfidf[corpus],
                                                     num_features=num)
    for key, val in self.children.items():
        val.genModel()

def main():
    texts = preprocess_doc(DOC_PATH)  # reference document collection
    new_text = preprocess_doc(NEW_DOC_PATH)  # new document
    frequency = defaultdict(int)  # word frequency counter
    for text in texts:
        for word in text:
            frequency[word] += 1
    dictionary = corpora.Dictionary(texts)  # build the dictionary
    # bag-of-words model: each document becomes a sparse vector whose elements
    # are the counts of each word in that document
    corpus_bow = [dictionary.doc2bow(text) for text in texts]
    new_bow = dictionary.doc2bow(new_text[0])
    # tf-idf model: corpus_bow is an iterator of bow vectors; fitting the model
    # computes the IDF value of every feature that appears in it
    tfidf_model = models.TfidfModel(corpus_bow)
    # the fitted model turns any corpus (again an iterator of bow vectors)
    # into an iterator of TF-IDF vectors
    new_tfidf = tfidf_model[corpus_bow]
    new_vec_tfidf = tfidf_model[new_bow]
    featureNum = len(dictionary.token2id.keys())
    index = similarities.SparseMatrixSimilarity(new_tfidf,
                                                num_features=featureNum)
    sim = index[new_vec_tfidf]
    display(sim)
    return 0

def __init__(self, documents):
    dictionary = corpora.Dictionary(documents)
    corpus = [dictionary.doc2bow(doc) for doc in documents]
    tfidf_model = models.TfidfModel(corpus)
    self.vectors = [tfidf_model[bow] for bow in corpus]
    self.sim = similarities.SparseMatrixSimilarity(tfidf_model[corpus],
                                                   num_features=len(dictionary.keys()))

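# No query method is shown for this class; a minimal sketch of one, assuming
# callers want the documents most similar to document i. The method name
# most_similar and the top_n parameter are assumptions, not from the original.
def most_similar(self, i, top_n=10):
    scores = self.sim[self.vectors[i]]  # similarity of document i to all documents
    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
    return [(j, score) for j, score in ranked if j != i][:top_n]
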
def process_lda_matrix(self):
    self.lda = models.ldamulticore.LdaMulticore.load(
        'path_pre_process/lda-model')
    index = similarities.SparseMatrixSimilarity(
        self.lda[self.corpus], num_features=len(self.data_dictionary))
    index.save('path_pre_process/lda_matrix')

def get_sim(self, all_anwser_list, question):
    anwser_list = []
    for doc in all_anwser_list:
        anwser1 = str(doc[3]) + '的'
        cut_list1 = [word for word in jieba.cut(anwser1)]
        anwser_list.append(cut_list1)
        anwser2 = str(doc[6]) + '的'
        cut_list2 = [word for word in jieba.cut(anwser2)]
        anwser_list.append(cut_list2)
    stopwords = self.stopwordslist()
    doc_list = []
    for sentence in anwser_list:
        l = []
        for word in sentence:
            if (word not in stopwords) and (word != '\t'):
                l.append(word)
        doc_list.append(l)
    question_list = [word for word in jieba.cut(question)]
    doc_question_list = []
    for word in question_list:
        if (word not in stopwords) and (word != '\t'):
            doc_question_list.append(word)
    dictionary = corpora.Dictionary(doc_list)
    corpus = [dictionary.doc2bow(doc) for doc in doc_list]
    doc_question_vec = dictionary.doc2bow(doc_question_list)
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(dictionary.keys()))
    sim = index[tfidf[doc_question_vec]]
    return sim

def get_similar_paper(target, number=3):
    # Use TF-IDF to recommend the user to read another paper.
    print('crawling...')
    doc_test = get_abstract(target)
    test_doc_list = []
    for i in doc_test.split(' '):
        test_doc_list.append(i)
    all_doc_list = []
    for i in all_doc:
        doc_list = []
        for j in i.split(' '):
            doc_list.append(j)
        all_doc_list.append(doc_list)
    print('analyzing...')
    mydict = corpora.Dictionary(all_doc_list)
    corpus = []
    for i in all_doc_list:
        corpus.append(mydict.doc2bow(i))
    test_vec = mydict.doc2bow(test_doc_list)
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=len(mydict.keys()))
    sim = index[tfidf[test_vec]]
    res = []
    for i in sorted(enumerate(sim), key=lambda item: item[-1], reverse=True):
        res.append(i[0])
    similar_links = []
    for i in res[0:number]:
        similar_links.append(links_dict[i])
    return similar_links

def tf_idf(self, doc_bow):
    """
    Measure the similarity of doc_bow against this class's document collection.
    :param doc_bow: bag-of-words vector of the query document
    :return:
    """
    # initialize the tf-idf model; self.corpus serves as the corpus
    tfidf = models.TfidfModel(dictionary=self.dictionary)
    # convert the class's own corpus to its tf-idf representation
    corpus_tfidf = tfidf[self.corpus]
    for doc in corpus_tfidf:
        print(doc)
    # use the tfidf model to convert doc_bow from (word, count) to (word, tfidf)
    print(tfidf[doc_bow])
    # check the similarity against every document (query with the tf-idf vector)
    index = similarities.SparseMatrixSimilarity(tfidf[self.corpus],
                                                num_features=len(self.dictionary))
    sims = index[tfidf[doc_bow]]
    print(sims)

def creat_index(self):
    self.tfidf = models.TfidfModel(self.corpus)
    # self.lsi = models.LsiModel(
    #     self.tfidf[self.corpus], id2word=self.dic, num_topics=50
    # )
    self._index = similarities.SparseMatrixSimilarity(
        self.tfidf[self.corpus], num_features=200000)

def similarity(sent, topN=10):
    corpus_lines = read_corpus(ner_result_path)
    texts = [line.split("\t")[0].split(' ') for line in corpus_lines]
    keywords = one_ner_tag(sent)
    dictionary = corpora.Dictionary(texts)
    num_features = len(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = models.TfidfModel(corpus)
    new_vec = dictionary.doc2bow(keywords)
    # similarity computation
    index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features)
    # index = similarities.Similarity('-Similarity-index', corpus, num_features)
    # print('\nTF-IDF sparse vectors of the corpus:')
    # for i in tfidf[corpus]:
    #     print(i)
    # print('\nTF-IDF sparse vector of the keywords:')
    # print(tfidf[new_vec])
    sims = index[tfidf[new_vec]]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print("\nSimilarity results")
    print('Words: {}\nText: {}\n'.format(keywords, sent))
    for k, v in sims[:topN]:
        i = int(k)
        print('Similarity: {}\nWords: {}\nText: {}'.format(
            v, corpus_lines[i].split("\t")[0].split(' '),
            corpus_lines[i].split("\t")[1]))

def get_similarity(all_doc_list, doc_test_list):
    """
    1. corpora.Dictionary builds the token-to-id mapping; dictionary.doc2bow turns
       each document into a (token id, count) vector.
    2. models.TfidfModel is fit on those (id, count) vectors, giving the TF-IDF
       weight of every token in all_doc_vec; entries with low TF-IDF are dropped.
    3. The documents' TF-IDF vectors build the similarity index object `index`;
       feeding the test document's doc_test_vec into it yields the similarity
       scores, which are then sorted.
    :param all_doc_list: the input documents (tokenized)
    :param doc_test_list: the sample text whose similarity is to be measured (tokenized)
    :return: similarity ranking
    """
    # 1.
    dictionary = corpora.Dictionary(all_doc_list)
    print(dictionary.keys())     # token ids
    print(dictionary.token2id)   # mapping between ids and tokens
    all_doc_vec = get_bag_of_words(dictionary, all_doc_list)
    doc_test_vec = get_bag_of_words(dictionary, doc_test_list)  # (id, count) vectors

    # 2.
    tfidf = models.TfidfModel(all_doc_vec)  # fit the TF-IDF model
    doc_test_vec = eliminate_junk(tfidf, doc_test_vec)

    # 3.
    index = similarities.SparseMatrixSimilarity(tfidf[all_doc_vec],
                                                num_features=len(dictionary.keys()))
    sim = index[tfidf[doc_test_vec]]
    print("Similarities:")
    print(sim)
    similarities_sort = sorted(enumerate(sim), key=lambda item: -item[1])
    print("Sorted similarities:")
    print(similarities_sort)
    return similarities_sort

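# get_bag_of_words and eliminate_junk are used above but not defined in this
# snippet. A minimal sketch of what they might look like, assuming the first
# simply wraps dictionary.doc2bow and the second drops tokens whose TF-IDF
# weight falls below a threshold; the 0.1 cutoff and the single-document
# handling are assumptions, not taken from the original code.
def get_bag_of_words(dictionary, doc_list):
    # doc_list may be a single tokenized document or a list of tokenized documents
    if doc_list and isinstance(doc_list[0], list):
        return [dictionary.doc2bow(doc) for doc in doc_list]
    return dictionary.doc2bow(doc_list)


def eliminate_junk(tfidf, doc_vec, threshold=0.1):
    # keep only the (id, count) entries whose TF-IDF weight reaches the threshold
    weights = dict(tfidf[doc_vec])
    return [(term_id, count) for term_id, count in doc_vec
            if weights.get(term_id, 0.0) >= threshold]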