def siphon_best_match_from_set(sentence, items):
    """Return the best TF-IDF and LSI matches for `sentence` among `items`."""
    corpora_documents = []
    for item_text in items:
        item_seg = list(jieba_nlp.generate_jieba_cut(item_text))
        corpora_documents.append(item_seg)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    # num_features must cover every dictionary id, or scores are misaligned.
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf, num_features=len(dictionary))
    test_cut_raw_1 = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    similarity.num_best = 2
    test_corpus_tfidf_1 = tfidf_model[test_corpus_1]
    tfidf_simi = similarity[test_corpus_tfidf_1]

    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    # 400 features cover the default 200 LSI topics.
    similarity_lsi = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-LSI-index',
        corpus_lsi, num_features=400, num_best=2)
    # The query is the same sentence, so reuse its tf-idf vector.
    test_corpus_lsi_3 = lsi[test_corpus_tfidf_1]
    # lsi.add_documents(test_corpus_lsi_3)  # update the LSI model
    lsi_simi = similarity_lsi[test_corpus_lsi_3]
    return {'tfidf': tfidf_simi, 'lsi_simi': lsi_simi}
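# The snippet above depends on jieba_nlp and BASE_DIR from its own project. A
# minimal self-contained sketch of the same bow -> tfidf -> Similarity pattern
# (made-up documents; whitespace tokenization stands in for jieba):
from gensim import corpora, models, similarities

docs = ["human machine interface", "graph of trees", "machine interface for trees"]
tokenized = [doc.split() for doc in docs]
dictionary = corpora.Dictionary(tokenized)
bow_corpus = [dictionary.doc2bow(text) for text in tokenized]
tfidf = models.TfidfModel(bow_corpus)
# num_features must equal the dictionary size or scores will be misaligned.
index = similarities.Similarity('/tmp/sim-index', tfidf[bow_corpus],
                                num_features=len(dictionary), num_best=2)
query = tfidf[dictionary.doc2bow("machine interface".split())]
print(index[query])  # [(doc_index, cosine_similarity), ...]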
def CalSim(self, test_document, Type, best_num):
    '''Calculate similarities between the test document and all news (articles/documents).

    # Arguments:
        test_document: a raw document (str).
        Type: which index to build, 'Similarity-tfidf-index' or 'Similarity-LSI-index'.
        best_num: see the 'num_best' parameter in gensim.
    '''
    if Type == 'Similarity-tfidf-index':
        tfidf = models.TfidfModel(self._BowVecOfEachDoc)
        tfidfVec = tfidf[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id)
        self._similarity = similarities.Similarity(Type, tfidfVec,
                                                   num_features=self._num_features,
                                                   num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = tfidf[test_BowVecOfEachDoc]
    elif Type == 'Similarity-LSI-index':
        lsi_model = models.LsiModel(self._BowVecOfEachDoc)
        corpus_lsi = lsi_model[self._BowVecOfEachDoc]
        self._num_features = len(self._dictionary.token2id)
        self._similarity = similarities.Similarity(Type, corpus_lsi,
                                                   num_features=self._num_features,
                                                   num_best=best_num)
        test_cut_raw = list(jieba.cut(test_document))
        test_BowVecOfEachDoc = self._dictionary.doc2bow(test_cut_raw)
        self._test_BowVecOfEachDoc = lsi_model[test_BowVecOfEachDoc]
    self.Print_CalSim()
    IdLst = []
    SimRltLst = []
    SimTxLst = []
    for Id, Sim in self._similarity[self._test_BowVecOfEachDoc]:
        IdLst.append(Id)
        SimRltLst.append(Sim)
        SimTxLst.append(self._raw_documents[Id])
    return IdLst, SimTxLst, SimRltLst
def main(): print("Building dictionary...") dictionary = corpora.Dictionary(doc[1] for doc in TrecReader(config.filenames)) print("Generating LSI model...") corpus = TrecCorpus(config.filenames, dictionary) lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=config.num_topics) print("Building index...") index_tempfile = get_tmpfile(config.tmpfile) index = similarities.Similarity(index_tempfile, lsi[corpus], num_features=lsi.num_topics) topics = list(get_topics(config.topicfile, config.topic_type)) print("Evaluating LSI model...") evaluate(corpus, topics, dictionary, index, lsi, config.lsi_file, config.lsi_run, config.processor, config.settings_file) print("Generating LDA model...") lda = models.LdaModel(corpus, id2word=dictionary, num_topics=config.num_topics) print("Building index...") index_tempfile = get_tmpfile(config.tmpfile) index = similarities.Similarity(index_tempfile, lda[corpus], num_features=lda.num_topics) print("Evaluating LDA model...") evaluate(corpus, topics, dictionary, index, lda, config.lda_file, config.lda_run, config.processor, config.settings_file) if parmenides_processor: cleanup()
def create_index(corpus_path, output_path, model_path, lda=False, lsi=False,
                 tfidf=False, hdp=False):
    """Creates the index(es) specified by the flags and saves them to the output directory.

    Parameters:
        corpus_path: the path to the corpus directory (os.path)
        output_path: the directory path where the index(es) will be saved (os.path)
            Note: each index needs its own folder.
        model_path: the directory path with the models to be used (os.path)
            The model path should also contain a corpus.dict and corpus.mm
            (use create_models.py).
        lda: if True, create an index based on the LDA model (boolean)
        lsi: if True, create an index based on the LSI model (boolean)
        tfidf: if True, create an index based on the TF-IDF model (boolean)
        hdp: if True, create an index based on the HDP model (boolean)
    """
    dictionary = corpora.Dictionary.load(os.path.join(model_path, "corpus.dict"))
    mc = corpora.MmCorpus(os.path.join(model_path, "corpus.mm"))
    # The number of features depends on the model used.
    tfidf_model = models.TfidfModel.load(os.path.join(model_path, "model.tfidf"))
    if tfidf:
        op = os.path.join(output_path, "tfidf")
        index = similarities.Similarity(op, tfidf_model[mc], num_features=len(dictionary))
        index.save(os.path.join(output_path, "index.tfidf"))
    if lda:
        model = models.LdaModel.load(os.path.join(model_path, "model.lda"))
        op = os.path.join(output_path, "lda")
        index = similarities.Similarity(op, model[mc], num_features=model.num_topics)
        index.save(os.path.join(output_path, "index.lda"))
    if lsi:
        model = models.LsiModel.load(os.path.join(model_path, "model.lsi"))
        op = os.path.join(output_path, "lsi")
        index = similarities.Similarity(op, model[tfidf_model[mc]], num_features=model.num_topics)
        index.save(os.path.join(output_path, "index.lsi"))
    if hdp:
        model = models.HdpModel.load(os.path.join(model_path, "model.hdp"))
        op = os.path.join(output_path, "hdp")
        index = similarities.Similarity(op, model[mc], num_features=model.m_T)
        index.save(os.path.join(output_path, "index.hdp"))
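# Hedged usage of create_index with illustrative paths; assumes corpus.dict,
# corpus.mm and the model files were produced beforehand by create_models.py.
create_index(os.path.join("data", "corpus"),
             os.path.join("data", "indexes"),
             os.path.join("data", "models"),
             tfidf=True, lsi=True)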
def gensim_tfidf_simi(sentence, features=None, best=2):
    """
    Text similarity matching. The original version hard-coded num_features=400,
    which can be smaller than the dictionary and cause a data-misalignment
    error; `features` now defaults to the dictionary size.
    :param sentence: query sentence
    :param features: number of index features (defaults to len(dictionary))
    :param best: how many best matches to return
    :return: list of (document index, cosine similarity) pairs
    """
    rows = decode_rows_pickle()
    corpora_documents = []
    for row in rows:
        item = list(jieba_nlp.generate_jieba_cut(row))
        corpora_documents.append(item)
    dictionary = corpora.Dictionary(corpora_documents)
    dictionary.compactify()
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    similarity = similarities.Similarity(
        BASE_DIR + '/vendor/dataset/gensim/Similarity-tfidf-index',
        corpus_tfidf,
        num_features=features if features is not None else len(dictionary),
        num_best=best)
    cut_raw = list(jieba_nlp.generate_jieba_cut(sentence))
    test_corpus = dictionary.doc2bow(cut_raw)
    test_corpus_tfidf = tfidf_model[test_corpus]
    return similarity[test_corpus_tfidf]
def create_similarity_index(_type, _path):
    print(" creating similarity index %s" % _type)
    print(" loading corpus from %s" % _path)
    corpus = corpora.MmCorpus(_path + "/" + _type + ".mm")
    print("", corpus)
    # Get the number of features from the dictionary length.
    dic_path = _path + "/" + _type + ".dict"
    _num_features = len(corpora.Dictionary.load(dic_path))
    print(" getting similarity computed for %d" % _num_features)
    file_path = _path + "/" + _type + ".index"
    # Build the index.
    index = similarities.Similarity(file_path, corpus, num_features=_num_features)
    index.save(file_path)
    print(" similarities saved as %s" % file_path)
    print()
    return index
def CorpusCluster(indexpath, corpuspath, dirpath, threshold, corpusday):
    global corekeys
    global indexlist
    indexlist = ReadColumn(indexpath, 0)  # read the index of seeds
    print("indexlist ==")
    print(indexlist)

    # Stream the corpus from disk only when needed.
    class ReadCorpus(object):
        def __iter__(self):
            for line in open(corpuspath):
                yield line.split()

    corp = ReadCorpus()                     # read corpus as corp
    dictionary = corpora.Dictionary(corp)   # create the dictionary
    corpus = [dictionary.doc2bow(text) for text in corp]  # bag-of-words vectors
    tfidf = models.TfidfModel(corpus)       # create the TF-IDF model
    corpus_tfidf = tfidf[corpus]
    # Calculate similarities. Index the TF-IDF vectors so that queries
    # (which are TF-IDF vectors) are compared in the same space.
    index = similarities.Similarity(dirpath + '/index', corpus_tfidf,
                                    num_features=len(dictionary))
    i = 0
    dictcluster = {}
    for text in corpus_tfidf:
        dict_tmp = {}
        sims_tmp = list(index[text])
        # Use the threshold carefully!
        dict_tmp[int(indexlist[i])] = [
            int(indexlist[k]) for k, x in enumerate(sims_tmp)
            if x >= (max(sims_tmp) - threshold)
        ]
        if i == 0:
            dictcluster = dict_tmp  # initialize the dictionary cluster
        else:
            dictcluster = DictUnin(dictcluster, dict_tmp)  # update the dictionary cluster
        i += 1
    print("Dict_Cluster")
    print(dictcluster)
    corekeys = dictcluster.keys()
    initialindex = Txt2Dict(indexpath)
    print("initialindex")
    print(initialindex)
    updateindex = UpdateCluter(dictcluster, initialindex)
    print("updateindex")
    print(updateindex)
    corekeys = updateindex.keys()
    Dict2Txt2(updateindex, indexpath)  # write the seed index to a txt file
    # Write the seed corpus to a txt file.
    WriteSeedCorpus(corpuspath, dirpath + '/seedcorpus_' + rundate + '.txt')
def disambiguate_by_text_sim(validate_data, corr=0.3):
    res_dict = {}
    print('number of distinct author names:', len(validate_data))
    for i, author in enumerate(validate_data.keys()):
        author_papers = validate_data[author]
        if len(author_papers) == 0:
            res_dict[author] = []
        else:
            paper_words = get_papar_words(author_papers)
            dictionary = corpora.Dictionary(paper_words)
            bow_corpus = [dictionary.doc2bow(wl) for wl in paper_words]  # vectorize the corpus
            tfidf = models.TfidfModel(bow_corpus)  # tf-idf model on the vectorized corpus
            index = similarities.Similarity('E:\\gensim_test', tfidf[bow_corpus],
                                            len(dictionary))
            sims = index[tfidf[bow_corpus]]  # compute the similarity matrix
            i_cluster = graph_sim_matrix(sims, corr)
            author_cluster = [[author_papers[index]['id'] for index in l_inside]
                              for l_inside in i_cluster]
            print(i, author, 'papers:', len(author_papers),
                  'authors after disambiguation:', len(author_cluster))
            res_dict[author] = author_cluster
    return res_dict
def init_similarity(self):
    # num_features must be at least the dictionary size; 900 assumes the
    # vocabulary fits within that bound.
    self.similarity = similarities.Similarity('Similarity-tfidf-index',
                                              self.corpus_tfidf,
                                              num_features=900)
    self.similarity.num_best = 5
def similarity(self):
    corpora_documents = []
    stopwords = {}.fromkeys([line.rstrip() for line in open('chineseStopWords.txt')])
    # Text preprocessing: segment each document and drop stop words.
    for item_text in self.documents:
        item_seg = list(jieba.cut(item_text))  # word segmentation
        words = [seg for seg in item_seg if seg not in stopwords]  # remove stop words
        corpora_documents.append(words)
    # Build the dictionary and the vectorized corpus.
    dictionary = corpora.Dictionary(corpora_documents)
    # doc2bow turns each document into a sparse bag-of-words vector.
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    # corpus is an iterable of bow vectors; TfidfModel collects the IDF
    # statistics for every feature that appears in it.
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]
    # num_features must cover every dictionary id.
    self.sim = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf,
                                       num_features=len(dictionary))
    self.sim.num_best = self.top_num  # e.g. 3 returns the 3 most similar results
    sentence_cut_temp = list(jieba.cut(self.sentence))
    sentence_cut = [word for word in sentence_cut_temp if word not in stopwords]
    sentence_cut_corpus = dictionary.doc2bow(sentence_cut)
    self.sentence_sim = tfidf_model[sentence_cut_corpus]
    self.resultShow()
def fit(self, corpus):
    """
    Fit a document similarity model.

    Parameters
    ----------
    corpus : object
        a corpus object that follows DefaultJsonCorpus

    Returns
    -------
    trained DocumentSimilarity object
    """
    if self.model_type == 'sklearn_nmf':
        model = self.create_sklearn_model(corpus)
    else:
        model = self.create_gensim_model(corpus)
    # Exact index over the model vectors, sharded to disk under work_folder.
    self.index = similarities.Similarity(self.work_folder + "/gensim_index",
                                         model, self.vec_size)
    # Approximate index over the same vectors, densified for Annoy.
    self.index_annoy = annoy.AnnoyIndex(self.vec_size, metric='angular')
    for i, vec in enumerate(model):
        self.index_annoy.add_item(
            i, list(gensim.matutils.sparse2full(vec, self.vec_size).astype(float)))
    self.index_annoy.build(self.annoy_trees)
    self.seq2meta = {}
    self.id2meta = {}
    for j in corpus.get_meta():
        self.seq2meta[j['corpus_seq_id']] = j
        self.id2meta[j['id']] = j
    return self
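# A hedged sketch of querying both indexes built by fit(); `ds` is assumed to
# be a fitted DocumentSimilarity instance and `vec` a (sparse) document vector
# already in the model's space.
import gensim.matutils
exact_hits = ds.index[vec]  # cosine similarity against every indexed document
dense = gensim.matutils.sparse2full(vec, ds.vec_size).astype(float)
approx_ids = ds.index_annoy.get_nns_by_vector(list(dense), 10)  # 10 nearest ids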
def generateSimilarityIndex(corpus, num_topics=100):
    ##############################################################
    # Create TF-IDF and LSI models on the corpus
    ##############################################################
    # Note: relies on a module-level `dictionary` built from the same corpus.
    tfidfModel = models.TfidfModel(corpus)
    corpus_tfidf = tfidfModel[corpus]

    # Reduce to num_topics dimensions: initialize an LSI transformation.
    lsiModel = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topics)
    # lsi_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=10)  # LDA alternative
    # lsi_model = models.HdpModel(corpus_tfidf, id2word=dictionary)  # HDP alternative
    # Create a double wrapper over the original corpus: bow -> tfidf -> fold-in-LSI.
    corpus_lsi = lsiModel[corpus_tfidf]

    # Print the topics generated by the LSI model.
    lsiModel.print_topics()

    # Create an index; querying it returns the similarity of the query against
    # all indexed documents in LSI space.
    # In-memory computation, better for small datasets that fit in memory:
    # index = similarities.MatrixSimilarity(corpus_lsi)
    # Out-of-core computation, better for big datasets that don't fit in memory:
    index_temp = get_tmpfile("lsimodel")  # temporary file prefix for the index shards
    # Note that num_topics must be passed again here as num_features.
    index = similarities.Similarity(index_temp, corpus_lsi, num_features=num_topics)
    return [index, tfidfModel, lsiModel]
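# Hedged usage of the helper above: project a new document through the returned
# models and query the index. `dictionary` and `corpus` are the same objects
# the function already relies on; the query text is illustrative.
index, tfidfModel, lsiModel = generateSimilarityIndex(corpus, num_topics=100)
query_bow = dictionary.doc2bow("some new document text".lower().split())
query_lsi = lsiModel[tfidfModel[query_bow]]
index.num_best = 10
print(index[query_lsi])  # [(doc_id, cosine_similarity), ...]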
def fit(self, db):
    '''
    INPUT: connection to database with recipes, restaurants data
    OUTPUT: fit model, index

    Creates a dictionary and model for the recommender system.
    Given a database connection, find all recipe ingredient lists, vectorize,
    build corpus and dictionary, fit the model and create an index.
    '''
    documents = self._prepare_documents(db)
    texts = self._clean_text(documents)
    if self.model is models.tfidfmodel.TfidfModel:  # self.model holds the model class before fitting
        # Vectorize and store recipe text.
        self.dictionary = corpora.Dictionary(texts)
        # Convert to BOW.
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]
        self.dictionary_len = len(self.dictionary)
        self.model = self.model(self.corpus)
        # Prepare for similarity queries.
        self.index = similarities.SparseMatrixSimilarity(
            self.model[self.corpus], num_features=self.dictionary_len)
    else:  # word2vec
        self.model = models.Word2Vec.load('/mnt/word2vec/words', mmap='r')
        doc_vectors = self._create_doc_vectors(texts)
        self.index = similarities.Similarity('/mnt/word2vec/index', doc_vectors,
                                             num_features=300)
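# Hedged query sketch for the tf-idf branch above: clean a new ingredient list
# the same way, then score it against the index. `rec` is a fitted instance
# and `new_document` an illustrative raw document.
tokens = rec._clean_text([new_document])[0]
bow = rec.dictionary.doc2bow(tokens)
scores = rec.index[rec.model[bow]]  # cosine similarity vs. every recipe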
def __calcSim(self, pic=0.02, process=0):
    self.tfidf_model = models.TfidfModel(self.new_corpus)
    self.corpus_tfidf = self.tfidf_model[self.new_corpus]
    sim_t = similarities.Similarity('Similarity-tfidf-index', self.corpus_tfidf,
                                    num_features=1000000, num_best=200)
    maxAve = 0
    maxIndex = 0
    # Iterating the index yields, for each document, its (id, score) hits
    # against the whole corpus; enumerate keeps i aligned even on errors.
    for i, n in enumerate(sim_t):
        try:
            cl = [kk[1] for kk in n if kk[1] > pic]
            ave = stat.mean(cl) if cl else 0
            # Require a minimum number of keywords.
            if self.source_seg[i] and "dseg" in self.source_seg[i].keys():
                key_num = len(self.source_seg[i]["dseg"])
            else:
                key_num = 0
            # Track the maximum average similarity and its index.
            if key_num > 25 and ave > maxAve:
                maxAve = ave
                maxIndex = i
        except Exception:
            print(["segment error:", i, self.source_seg[i]["_id"]])
            continue
    self.max_ave = maxAve
    self.max_index = maxIndex
    return None
def tcutword(data, stopword):
    corpora_documents = []
    for i in data.index:
        text = data.loc[i].values[0].strip()
        text = re.sub('[\"*\【\】\[\]\s*]', '', text)      # strip special symbols
        text = re.sub('\([a-zA-z]+://[^\s]*\)', '', text)  # strip URLs
        text = re.sub('\d+\.*\d*', '', text)               # strip numbers
        text = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。?、~@#¥%……&*()]+", '', text)
        word_list = list(jieba.cut(text, HMM=True))
        # Stop-word filtering is intentionally disabled here:
        # word_list = [word for word in word_list if word not in stopword]
        corpora_documents.append(word_list)
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(ttext) for ttext in corpora_documents]
    # num_features should match the dictionary size rather than an arbitrary
    # large constant.
    similarity = similarities.Similarity('-Similarity-index', corpus,
                                         num_features=len(dictionary))
    return dictionary, similarity
def sim_eval(text, label_text):
    # label_text holds the corpus labels used when the similarity model was built.
    print('Loading the trained similarity models...')
    dictionary = corpora.Dictionary.load('../resource/model/similarity/dict.txt')
    tfidf_model = joblib.load('../resource/model/similarity/tfidf_model.mm')
    lsi = models.LsiModel.load('../resource/model/similarity/lsi.mm')
    corpus_tfidf = joblib.load('../resource/model/similarity/corpus_tfidf.mm')
    test_raw = [list(jieba.cut(text))]
    test_corpus = [dictionary.doc2bow(item) for item in test_raw]
    test_corpus_tfidf = tfidf_model[test_corpus]
    test_corpus_lsi = lsi[test_corpus_tfidf]  # project the query into LSI space
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('Similarity-LSI-index', corpus_lsi,
                                             num_features=200, num_best=2)
    query = similarity_lsi[test_corpus_lsi][0]
    if len(query) < 1:
        query = [(0, 0), (0, 0)]
    if label_text[query[0][0]] == label_text[query[1][0]]:
        sim_estimate = label_text[query[0][0]]
    else:
        sim_estimate = 'null'
    similarity_score = str(query[0][1])[0:5]
    print('Similarity evaluation finished...')
    return sim_estimate, similarity_score
def __init__(self, cleanup_urls=True, nltk_tokenizer=False):
    super().__init__(cleanup_urls=cleanup_urls, nltk_tokenizer=nltk_tokenizer)

    self.corpus = []
    for bug in bugzilla.get_bugs():
        textual_features = self.text_preprocess(self.get_text(bug))
        self.corpus.append([bug["id"], textual_features])

    # Assign unique integer ids to all words.
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Convert to bag-of-words.
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initialize and apply the tfidf transformation model on the same corpus;
    # the resulting corpus has the same dimensions.
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform the TF-IDF corpus to a latent 300-D space via Latent Semantic Indexing.
    self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, num_topics=300)
    corpus_lsi = self.lsi[corpus_tfidf]

    # Index the corpus.
    self.index = similarities.Similarity(output_prefix="simdata.shdat",
                                         corpus=corpus_lsi, num_features=300)
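# Hedged sketch of how this index might be queried for similar bugs. The class
# above does not retain the TfidfModel, so this assumes `self.tfidf = tfidf`
# was also stored in __init__; `get_similar_bug_ids` is a hypothetical helper.
def get_similar_bug_ids(self, text, top_n=10):
    bow = self.dictionary.doc2bow(self.text_preprocess(text))
    lsi_vec = self.lsi[self.tfidf[bow]]  # same bow -> tfidf -> LSI pipeline
    self.index.num_best = top_n
    return [self.corpus[doc_id][0] for doc_id, score in self.index[lsi_vec]]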
def get_sim(col_id):
    compare_all = []
    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer("english")
    for i in range(1, 323):
        compare = query_db("""Select * from countries where id = {}""".format(i))
        compare_all.append(compare[col_id])
    gen_doc_new = [[stemmer.stem(word) for word in document.lower().split()
                    if word not in stop_words]
                   for document in compare_all]
    dictionary = gensim.corpora.Dictionary(gen_doc_new)
    corpus = [dictionary.doc2bow(i) for i in gen_doc_new]
    tf_idf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tf_idf[corpus]

    ## Calculation of the best topic size (exploratory, kept for reference):
    # numpy_matrix = gensim.matutils.corpus2dense(corpus, num_terms=65764)
    # s = np.linalg.svd(numpy_matrix, full_matrices=False, compute_uv=False)
    # plt.figure(figsize=(10, 5))
    # plt.hist(s[0], bins=100)
    # plt.xlabel('Singular values', fontsize=12)
    # plt.show()

    lsa = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=95)
    # `path` is a module-level index prefix. Note that queries must be
    # projected through tf_idf and lsa before hitting this index.
    index = similarities.Similarity(path, lsa[corpus_tfidf],
                                    num_features=len(dictionary))
    return index, tf_idf, dictionary
def graph_model(author_paper_list, text_sim=0.15, co_num=4):
    # Paper ids for this author name.
    paper_id = [paper['id'] for paper in author_paper_list]
    # Build the graph model.
    graph = nx.Graph()
    graph.add_nodes_from(paper_id)
    # Papers with enough shared co-authors are linked.
    for index1, p1 in enumerate(author_paper_list):
        if index1 == len(author_paper_list) - 1:
            break
        for index2, p2 in enumerate(author_paper_list[index1 + 1:]):
            num_co_au = num_coauthor_paper(p1, p2)
            if num_co_au >= co_num:
                graph.add_edge(p1['id'], p2['id'])
    # Papers with similar topics are linked.
    if len(author_paper_list) > 0:
        paper_words = get_papar_words(author_paper_list)
        dictionary = corpora.Dictionary(paper_words)
        bow_corpus = [dictionary.doc2bow(wl) for wl in paper_words]  # vectorize the corpus
        tfidf = models.TfidfModel(bow_corpus)  # tf-idf model on the vectorized corpus
        index = similarities.Similarity('E:\\gensim_test', tfidf[bow_corpus],
                                        len(dictionary))
        sim_matrix = index[tfidf[bow_corpus]]  # compute the similarity matrix
        # Link papers whose similarity exceeds the threshold (same cluster).
        for i in range(0, sim_matrix.shape[0]):
            if i == sim_matrix.shape[0] - 1:
                break
            for j in range(0, sim_matrix.shape[1]):
                if j <= i:
                    continue
                if sim_matrix[i][j] > text_sim:
                    graph.add_edge(paper_id[i], paper_id[j])
    # Connected components are the final clusters.
    conn_comp = list(nx.connected_components(graph))
    conn_comp = [list(c) for c in conn_comp]
    return conn_comp
def main():
    try:
        corpus = corpora.MmCorpus(file_tfidf + '.corpus')
        dictionary = corpora.Dictionary.load(file_tfidf + '.dict')
        modelTfidf = models.TfidfModel.load(file_tfidf + '.modelTfidf')
    except Exception:
        print('TFIDF not found. Please run buildindex.py first')
    try:
        index = pickle.load(open(file_tfidf + '.index', 'rb'))
    except Exception:
        print('using new index')
        modelTfidfCorpus = modelTfidf[corpus]
        index = similarities.Similarity(file_tfidf + '.modelTfidfindex',
                                        modelTfidfCorpus,
                                        num_features=modelTfidfCorpus.corpus.num_terms)
        index.num_best = None
        pickle.dump(index, open(file_tfidf + '.index', 'wb'))

    # Evaluate for each query label.
    scores = []
    for i in range(len(labels_choices)):
        print('for label', i)
        query_vector = modelTfidf[dictionary.doc2bow(
            ['<phrase>%s</phrase>' % w.lower() for w in seed_concepts_list[i]])]
        scores.append(index[query_vector])
    scores = np.array(scores).T
    with open(categorization_file, 'w') as f:
        for i in range(scores.shape[0]):
            f.write('%s\n' % (' '.join(['%s' % d for d in scores[i]])))
def calcCorpusTFIDFSimilarity(new_ids, all_ids, corpus, new_corpus, num_feature=400):
    # Build the TF-IDF model.
    tfidf = models.TfidfModel(corpus)
    crp_tfidf = tfidf[corpus]
    new_tfidf = tfidf[new_corpus]
    logger.info('corpus tfidf length: %i' % len(crp_tfidf))
    # Create the index; num_feature should cover every dictionary id.
    index = similarities.Similarity(fp_index, crp_tfidf,
                                    num_features=num_feature,
                                    num_best=NUM_BEST_SIM_DOC)
    # Similarity of each new document against the indexed corpus.
    docs_sims = index[new_tfidf]
    index.save(fp_index)
    for idx, doc_sim in enumerate(docs_sims):
        cur_doc_id = new_ids[idx]
        logger.info(u'calc text similarity | cur_doc_id: %i | sorted_sims(0:3): %s'
                    % (cur_doc_id, doc_sim[0:3]))
def bow():
    texts = [[word for word in jieba.cut(document, cut_all=True)] for document in data]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(t) for t in texts]
    print(corpus)

    # TF-IDF features.
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Similarity retrieval on top of the TF-IDF vectors; num_features must
    # cover every dictionary id.
    similarity = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf,
                                         num_features=len(dictionary))
    # new_sentence = "昨日金融期則是下跌17.4點,指數為1,051點,跌幅1.63%。股價皆重挫3~4%,回到年線的位置。工商,2015/01/07 00:00:00,外資變臉 電子期重摔 昨日下跌8.7點,跌幅達2.35%,為各期指中最弱,1"
    # test_corpus_1 = dictionary.doc2bow(jieba.cut(new_sentence, cut_all=True))
    # vec_tfidf = tfidf[test_corpus_1]
    # print(similarity[vec_tfidf])  # most similar documents as (doc_index, similarity) tuples
    for item in corpus_tfidf:
        print(item)
    # All three saves write the same pickled model; the extension does not
    # change the format.
    tfidf.save("data.tfidf")
    tfidf.save("data_tfidf.txt")
    tfidf.save("data_tfidf.csv")
    tfidf = models.TfidfModel.load("data.tfidf")
    print(tfidf)
def calc_similarity(self, prefix: str, text: str):
    """Compute similarities; return the indexes and cosine values above the threshold.

    Arguments:
        prefix {str} -- model prefix
        text {str} -- text to match
    Only hits scoring above self.keep_val are returned.
    """
    dictionary = corpora.Dictionary.load('./models/{}_dict.dic'.format(prefix))  # load the dictionary
    corpus = corpora.MmCorpus('./models/{}_corpuse.mm'.format(prefix))  # load the corpus
    tfidf_model = models.TfidfModel.load("./models/{}_tfidf_model.model".format(prefix))  # load the tf-idf model
    corpus_tfidf = tfidf_model[corpus]
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('./models/similarity-lsi-index',
                                             corpus_lsi, num_features=400, num_best=3)
    cut_raw = self.segment(text)              # 1. segment the text
    corpus = dictionary.doc2bow(cut_raw)      # 2. convert to a bow vector
    corpus_tfidf = tfidf_model[corpus]        # 3. compute the tf-idf values
    corpus_lsi = lsi[corpus_tfidf]            # 4. project into LSI space
    sims = similarity_lsi[corpus_lsi]
    with open('./data/idx_dic.dic', 'r') as f:
        dt = f.read()
        idx_dic = eval(dt)
    result = []
    if sims is not None:
        result = [idx_dic[idx] for idx, val in sims if val > self.keep_val]
    return result
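# Hedged usage on an instance of the host class (`SimilarityEngine` is a
# hypothetical name); the prefix and text are illustrative, and the './models'
# artifacts for that prefix must already exist on disk.
engine = SimilarityEngine()
matches = engine.calc_similarity('news', 'a news headline to match')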
def classify(k=11):
    corpora_documents = []
    f = codecs.open("result.txt", 'r', encoding="utf-8").readlines()
    f2 = codecs.open("label_level1.txt", 'r', encoding="utf-8").readlines()
    for li in f[:13000]:
        li = li.split()
        corpora_documents.append(li)
    # Build the dictionary and the vectorized corpus.
    dictionary = corpora.Dictionary(corpora_documents)
    # dictionary.save('dictionary.dict')
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    index1 = similarities.Similarity('-Similarity-index', corpus_tfidf,
                                     num_features=len(dictionary))
    # index1.save('documents.index')
    count = 0
    for i, li in enumerate(f[13000:]):
        li = li.split()
        test_corpus_1 = dictionary.doc2bow(li)
        index1.num_best = k
        index = index1[tfidf[test_corpus_1]]
        print(index)
        # Vote: count the labels of the k nearest training documents.
        predictions = {}
        for m, n in index:
            for key in f2[m].split():
                if key not in predictions.keys():
                    predictions[key] = 1
                else:
                    predictions[key] += 1
        true_label = f2[13000 + i].split()
        prediction = [key for key in predictions.keys() if predictions[key] >= k / 2]
        true_label.sort()
        prediction.sort()
        print("true label:", true_label)
        if len(prediction) == 0:
            # Fall back to the single most frequent label.
            dict_sorted = sorted(predictions.items(), key=lambda x: x[1], reverse=True)
            prediction.append(dict_sorted[0][0])
            if true_label == prediction:
                count += 1
                print(1)
        elif true_label == prediction:
            count += 1
            print(1)
        else:
            print(0)
        print("predict", prediction)
        print(predictions)
        print("#####################################")
    print("count:", count)
def make_index(self, seqs):
    print("building index for sequences")
    if self.use_lsi:
        if self.use_tfidf:
            seqs = (self.lsi_model[self.tfidf_model[self.lexicon.doc2bow(tokenize(seq))]]
                    for seq in seqs)
        else:
            seqs = (self.lsi_model[self.lexicon.doc2bow(tokenize(seq))] for seq in seqs)
        num_features = self.lsi_model.num_topics
    else:
        if self.use_tfidf:
            seqs = (self.tfidf_model[self.lexicon.doc2bow(tokenize(seq))] for seq in seqs)
        else:
            seqs = (self.lexicon.doc2bow(tokenize(seq)) for seq in seqs)
        num_features = len(self.lexicon.keys())
    self.index = similarities.Similarity(output_prefix=self.index_filepath,
                                         corpus=None, num_features=num_features)
    self.index.save(self.index_filepath)
    self.index.add_documents(seqs)
    self.index.save(self.index_filepath)
    print("saved index to", self.index_filepath)
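# Hedged sketch of a matching query helper (`query_index` is hypothetical): a
# new sequence must pass through the same tokenize -> bow (-> tfidf -> lsi)
# pipeline that the flags selected in make_index above.
def query_index(self, seq, top_n=5):
    vec = self.lexicon.doc2bow(tokenize(seq))
    if self.use_tfidf:
        vec = self.tfidf_model[vec]
    if self.use_lsi:
        vec = self.lsi_model[vec]
    self.index.num_best = top_n
    return self.index[vec]  # [(doc_id, cosine_similarity), ...]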
def __init__(self):
    self.corpus = []
    for bug in bugzilla.get_bugs():
        textual_features = "{} {}".format(bug["summary"], bug["comments"][0]["text"])
        textual_features = text_preprocess(textual_features)
        self.corpus.append([bug["id"], textual_features])

    # Assign unique integer ids to all words.
    self.dictionary = Dictionary(text for bug_id, text in self.corpus)

    # Convert to bag-of-words.
    corpus_final = [self.dictionary.doc2bow(text) for bug_id, text in self.corpus]

    # Initialize and apply the tfidf transformation model on the same corpus;
    # the resulting corpus has the same dimensions.
    tfidf = models.TfidfModel(corpus_final)
    corpus_tfidf = tfidf[corpus_final]

    # Transform the TF-IDF corpus to a latent 300-D space via Latent Semantic Indexing.
    self.lsi = models.LsiModel(corpus_tfidf, id2word=self.dictionary, num_topics=300)
    corpus_lsi = self.lsi[corpus_tfidf]

    # Index the corpus.
    self.index = similarities.Similarity(output_prefix="simdata.shdat",
                                         corpus=corpus_lsi, num_features=300)
def d2b():
    # Read the training files.
    raw_documents = []
    a = os.listdir(r"C:\Users\mxf\Desktop\docsim-master\traindata")
    a.sort(key=lambda x: int(x[:-4]))
    for name in a:
        f = open(os.path.join(r"C:\Users\mxf\Desktop\docsim-master\traindata", name), 'r')
        raw = f.read()  # raw holds the document content
        raw_documents.append(raw)
    # Load the stop words.
    stop = [line.strip() for line in open('stopwordd2b.txt', encoding='utf-8').readlines()]
    # Build the corpus.
    corpora_documents = []
    for item_text in raw_documents:
        item_str = []
        item = pseg.cut(item_text)  # segment with jieba
        for i in list(item):
            item_str.append(i.word)
        item_str = a_sub_b(item_str, list(stop))
        corpora_documents.append(item_str)
    # Build the dictionary (the set of all words, each mapped to an id) and the
    # vectorized corpus. doc2bow turns a document into a sparse vector:
    # [(0, 1), (1, 1)] means the words with ids 0 and 1 each occur once and no
    # other word occurs.
    dictionary = corpora.Dictionary(corpora_documents)
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]
    similarity = similarities.Similarity('-Similarity-index11', corpus,
                                         num_features=len(dictionary))
    f = open('test_data.txt', 'r')
    fa = f.readlines()
    dt = dict()
    for li in fa:
        print(li)
        test_data_1 = li.split('\n')[0].split(',')
        ind = test_data_1[0]
        test_cut = pseg.cut(test_data_1[1])
        test_cut_raw_1 = []
        for i in list(test_cut):
            test_cut_raw_1.append(i.word)
        test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
        # Return the top 101 records so both methods can report their TOP 100.
        similarity.num_best = 101
        print('################################')
        for i in similarity[test_corpus_1]:
            ind2 = i[0] + 1  # i[0] + 1 is the line number of the matching sentence
            print(ind, i[0] + 1, i[1])  # i[1] is the similarity score, e.g. 2784, 2784, 1.0
            if int(ind) == ind2:
                print("same")
            else:
                data_pro.addtodict2(dt, int(ind), int(ind2), i[1])
    return dt
def similarity_vec_lsi(self):
    lsi = models.LsiModel(self._doc_tf_idf_vec)
    doc_lsi = lsi[self._doc_tf_idf_vec]
    similarity_vec_lsi = similarities.Similarity("Similarity-LSI-index",
                                                 corpus=doc_lsi,
                                                 num_features=len(self._dictionary))
    return lsi, similarity_vec_lsi
def ge_process(raw_documents):
    corpora_documents = []
    # Word segmentation.
    for item_text in raw_documents:
        item_seg = list(jieba.cut(item_text))
        corpora_documents.append(item_seg)

    # Build the dictionary. The bow corpus below is a sparse term-frequency
    # matrix: e.g. 16 documents x 384 words for this sample.
    dictionary = corpora.Dictionary(corpora_documents)
    # dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
    # 1. drop tokens appearing in fewer than no_below documents;
    # 2. drop tokens appearing in more than no_above (a fraction) of documents;
    # 3. of what remains, keep only the keep_n most frequent tokens.
    corpus = [dictionary.doc2bow(text) for text in corpora_documents]

    # Compute TF-IDF.
    tfidf_model = models.TfidfModel(corpus)
    corpus_tfidf = tfidf_model[corpus]

    # Compute similarities.
    similarity = similarities.Similarity('Similarity-tfidf-index', corpus_tfidf,
                                         num_features=600)
    test_data_1 = '北京雾霾红色预警'
    test_cut_raw_1 = list(jieba.cut(test_data_1))
    # e.g. ['北京', '雾', '霾', '红色', '预警']
    test_corpus_1 = dictionary.doc2bow(test_cut_raw_1)
    # e.g. [(51, 1), (59, 1)]: the query hits dictionary ids 51 and 59;
    # the exact ids can vary.
    similarity.num_best = 5  # number of similar samples N to return
    # Turn the query into TF-IDF using the trained model, then compute similarity.
    test_corpus_tfidf_1 = tfidf_model[test_corpus_1]
    # e.g. [(51, 0.7071067811865475), (59, 0.7071067811865475)]
    print(similarity[test_corpus_tfidf_1])
    # e.g. [(2, 0.3595932722091675)]

    # Latent semantic indexing similarity, built on the tf-idf vectors
    # (bow vectors could also be used directly).
    lsi = models.LsiModel(corpus_tfidf)
    corpus_lsi = lsi[corpus_tfidf]
    similarity_lsi = similarities.Similarity('Similarity-LSI-index', corpus_lsi,
                                             num_features=400, num_best=2)
    test_data_3 = '长沙街头发生砍人事件致6人死亡'
    test_cut_raw_3 = list(jieba.cut(test_data_3))
    test_corpus_3 = dictionary.doc2bow(test_cut_raw_3)
    test_corpus_tfidf_3 = tfidf_model[test_corpus_3]
    test_corpus_lsi_3 = lsi[test_corpus_tfidf_3]
    print(similarity_lsi[test_corpus_lsi_3])

# LDA topic model: load_reuters() returns a (395, 4258) term-frequency matrix
# (395 documents, 4258 words).
X = lda.datasets.load_reuters()

def lda_process(X)
def get_similarity(self, prepped_text):
    # Take a prepped (tokenized) text and convert it to LSI space.
    vec_bow = self.dictionary.doc2bow(prepped_text)
    vec_lsi = self.model[vec_bow]
    # Build the similarity index and return the best matches. The original
    # call passed self.corpus where the output prefix belongs and omitted the
    # corpus argument; this assumes self.corpus is the bow corpus the model
    # was trained on, projected into the same LSI space as the query. With
    # num_best=6, the best 5 remain after dropping the query document itself.
    index = similarities.Similarity(None, self.model[self.corpus],
                                    num_features=10, num_best=6)
    sims = index[vec_lsi]
    return sims
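# Hedged usage of get_similarity on an instance (`searcher` is hypothetical);
# each hit is a (document_index, cosine_similarity) pair.
hits = searcher.get_similarity(['graph', 'minors', 'survey'])
for doc_idx, score in hits:
    print(doc_idx, score)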