def calculate_BM25(keywords, lst):  # calculates the BM25 weights
    maxweight = -1000
    results = []
    stopWords = set(stopwords.words('portuguese'))
    for i in range(len(keywords)):
        keywords[i] = keywords[i].lower()
    new_keywords = [word for word in keywords if word not in stopWords]
    #print(new_keywords)
    corpus = create_corpus(lst)
    #print(corpus)
    bm25 = BM25.BM25(corpus)
    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)  # average idf
    #print(average_idf)
    scores = bm25.get_scores(new_keywords, average_idf)  # bm25 scores
    #print(scores)
    for i in range(len(scores)):
        if scores[i] != 0:  # keep only non-zero scores
            results = results + [[scores[i], get_title(lst[i]), get_party(lst[i])]]
    results.sort(key=lambda document: document[0], reverse=True)  # most relevant first
    return results, new_keywords
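# A minimal, self-contained sketch of the gensim BM25 call pattern these snippets
# rely on. It assumes an older gensim (< 4.0) where gensim.summarization.bm25 still
# exists; the sample documents and query below are hypothetical.
from gensim.summarization import bm25 as _bm25_sketch

_docs = [["the", "cat", "sat"], ["the", "dog", "barked"], ["cats", "and", "dogs"]]
_model = _bm25_sketch.BM25(_docs)
# Some gensim 3.x releases require an average_idf argument to get_scores,
# while later 3.x releases take only the query; try both.
_average_idf = sum(float(v) for v in _model.idf.values()) / len(_model.idf)
try:
    _scores = _model.get_scores(["cat"], _average_idf)
except TypeError:
    _scores = _model.get_scores(["cat"])
print(_scores)  # one BM25 score per document, in corpus order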
def create_bm25():
    """Function to create or update BM25 object"""
    # Run arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", default=4, type=int,
                        help="Task where: \
                            -task 1 : classification subcat \
                            -task 2 : classification cat \
                            -task 3 : ner \
                            -task 4 : qa")
    parser.add_argument('--register_model', action='store_true', help="")
    parser.add_argument('--download_train', action='store_true', help="")
    args = parser.parse_args()

    # Load data
    cl = pr.Clean(task=args.task, download_train=args.download_train)
    data = cl.dt.load('fn_clean', dir='data_dir')

    # Split tokenized data
    toks = data.question_clean.apply(cl.transform_by_task).to_list()

    # Create BM25 object
    bm = bm25.BM25(toks)

    # Dump objects
    with open(cl.dt.get_path('fn_rank', 'model_dir'), 'wb') as fp:
        pickle.dump(bm, fp)
        pickle.dump(data, fp)
    logger.warning('[INFO] Created and stored BM25 object.')

    # Upload
    if args.register_model:
        cl.dt.upload('model_dir', destination='model')
def train(self, knownDocuments):
    # Build the corpus and remember which file produced each document; store them
    # on the instance so analyze() can use them (the original walked an undefined
    # `dirname` and appended each document once per directory).
    self.corpus = []
    self.filenames = []
    for f in knownDocuments:
        self.corpus.append(tokenization(f))
        self.filenames.append(f)
    dictionary = corpora.Dictionary(self.corpus)
    doc_vectors = [dictionary.doc2bow(text) for text in self.corpus]
    vec1 = doc_vectors[0]
    self.bm25Model = bm25.BM25(self.corpus)
    self.average_idf = sum(map(lambda k: float(self.bm25Model.idf[k]),
                               self.bm25Model.idf.keys())) / len(self.bm25Model.idf.keys())

def analyze(self, unknownDocument):
    url_k_words = "articles/sanchongmen_ch5.txt"
    sentence = open(url_k_words, "rb").read()
    # Based on TF-IDF to get key words
    # tags = jieba.analyse.extract_tags(sentence, withWeight=True, topK=20, allowPOS=())
    # Based on TextRank to get top K words
    tags = jieba.analyse.textrank(sentence, topK=20, withWeight=False,
                                  allowPOS=('ns', 'n', 'vn', 'v'))
    query = tags
    scores = self.bm25Model.get_scores(query)
    # scores.sort(reverse=True)
    idx = scores.index(max(scores))
    fname = self.filenames[idx]
    return fname

def displayName(self):
    return "BM25 Driver for Chinese"
def get_ranks_baseline(data, tables):
    texts, id_to_index = get_table_words(tables)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    bm25_obj = bm25.BM25(corpus)
    ranks, num_samples = [], len(data)
    for count, one_data in enumerate(data):
        id, query = one_data['table_id'], one_data['question_tokens']
        query_doc = dictionary.doc2bow(query.split())
        scores = bm25_obj.get_scores(query_doc)
        score_tuples = [(score, i) for i, score in enumerate(scores)]
        score_tuples.sort(reverse=True)
        target_index = id_to_index[id]
        rank = -1
        for index, tup in enumerate(score_tuples):
            cur_index = tup[1]
            if cur_index == target_index:
                rank = index + 1
                break
        if rank != -1:
            ranks.append(rank)
        else:
            print('Invalid baseline input')
        if count % 100 == 0:
            print('Done ' + str(count) + ' out of ' + str(num_samples) + ' inputs')
    return ranks
def get_fitness_answer(input_seq):
    # Use BM25 similarity to match the question
    input_seq = get_final_input(input_seq)
    bm25Model = bm25.BM25(question_list)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    scores = bm25Model.get_scores(input_seq, average_idf)
    sorted_scores = list(set(scores))
    sorted_scores.sort()
    answer = []
    question = []
    if sorted_scores[-1] > 0:
        one = scores.index(sorted_scores[-1])
        answer.append(get_key(question_answer_direct, question_list[one]))
        question.append(get_key(qa_dict, answer_list[one]))
    # Guard against fewer than two or three distinct scores
    if len(sorted_scores) > 1 and sorted_scores[-2] > 0:
        two = scores.index(sorted_scores[-2])
        answer.append(get_key(question_answer_direct, question_list[two]))
        question.append(get_key(qa_dict, answer_list[two]))
    if len(sorted_scores) > 2 and sorted_scores[-3] > 0:
        three = scores.index(sorted_scores[-3])
        answer.append(get_key(question_answer_direct, question_list[three]))
        question.append(get_key(qa_dict, answer_list[three]))
    return answer, question
def buildModel(jsonFile, fieldNames, query_str):
    # An iterable cannot be traversed twice, so create two variables
    t1 = jsonutil.iterCutFieldList(jsonFile, fieldNames)
    t2 = jsonutil.iterCutFieldList(jsonFile, fieldNames)
    # Build the word-index dictionary
    dictionary = corpora.Dictionary(t1)
    dictionary.save(DICTIONARY_PATH)
    # Build the bag-of-words model: convert text from words to ids
    corpus = [dictionary.doc2bow(text) for text in t2]
    print("bag-of-words size: %i" % len(corpus))
    bm25Model = bm25.BM25(corpus)
    # print("bm25 idf lens: %i " % len(bm25Model.f))
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]),
                          bm25Model.idf.keys())) / len(bm25Model.idf.keys())
    query = jiebautil.cutWords(query_str).split()
    query_bow = dictionary.doc2bow(query)
    scores = bm25Model.get_scores(query_bow, average_idf)
    # Print the five best-scoring documents; sort (index, score) pairs instead of
    # deleting from `scores`, which would shift the remaining line numbers.
    lineRead = LineReader(jsonFile)
    top5 = sorted(enumerate(scores), key=lambda item: item[1], reverse=True)[:5]
    for index, score in top5:
        lineNum = index + 1
        s = lineRead.load(lineNum)
        j = json.loads(s)
        print(jsonutil.recursive_get(j, fieldNames[0]))
def get_fitness_answer(input_seq):
    # Use BM25 similarity to match the question
    input_seq = get_final_input(input_seq)
    bm25Model = bm25.BM25(question_list)
    scores = bm25Model.get_scores(input_seq)
    sorted_scores = list(set(scores))
    sorted_scores.sort()
    answer = []
    question = []
    if sorted_scores[-1] > 0:
        one = scores.index(sorted_scores[-1])
        answer.append(get_key(question_answer_direct, question_list[one]))
        question.append(get_key(qa_dict, answer_list[one]))
    # Guard against fewer than two or three distinct scores
    if len(sorted_scores) > 1 and sorted_scores[-2] > 0:
        two = scores.index(sorted_scores[-2])
        answer.append(get_key(question_answer_direct, question_list[two]))
        question.append(get_key(qa_dict, answer_list[two]))
    if len(sorted_scores) > 2 and sorted_scores[-3] > 0:
        three = scores.index(sorted_scores[-3])
        answer.append(get_key(question_answer_direct, question_list[three]))
        question.append(get_key(qa_dict, answer_list[three]))
    return answer, question
def train():
    '''
    Matching process
    :return:
    '''
    # Build the matching corpus: 398,872 samples
    sku_names_texts = get_train_datas()
    sku_names_jieba = get_text_jieba(sku_names_texts)
    print(len(sku_names_texts), len(sku_names_jieba))
    print(sku_names_jieba[0])

    # Test data: 1,000 samples
    keywords_texts = get_test_datas()
    keywords_jieba = get_text_jieba(keywords_texts)
    print(len(keywords_texts))

    # Build the vocabulary
    dictionary = corpora.Dictionary(sku_names_jieba)
    print(len(dictionary))

    # Build the BM25 model with gensim
    bm25Model = bm25.BM25(sku_names_jieba)

    # Compute the average inverse document frequency, following the gensim source
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]),
                          bm25Model.idf.keys())) / len(bm25Model.idf.keys())

    for i, item in enumerate(keywords_jieba):
        scores = bm25Model.get_scores(item, average_idf)
        # sorted_scores = sorted(scores, reverse=True)[:10]
        idx = scores.index(max(scores))
        print(i, "||", keywords_texts[i], "||", sku_names_texts[idx])
        with open("result/bm25_v1_results.txt", 'a', encoding='utf8') as wf:
            wf.write(str(i) + "||" + keywords_texts[i] + "||" + sku_names_texts[idx] + "\n")
def __init__(self, corpus_file_pattern=None, stop_words_file="stop_words.txt",
             MAX_LEN=300, path="./"):
    """
    BM25 retrieval module; mainly wraps preprocessing around the BM25 library.
    :param corpus_file_pattern: retrieval corpus (text data), str
    :param stop_words_file: stop-words file, str
    :param path: directory where the model is saved, str
    """
    os.makedirs(path, exist_ok=True)
    self.model = os.path.join(path, "bm25.m")
    self.sen = os.path.join(path, "sen.pkl")
    self.stop = os.path.join(path, "stop.pkl")
    self.MAX_LEN = MAX_LEN
    if os.path.isfile(self.model) and os.path.isfile(
            self.sen) and os.path.isfile(self.stop):
        self.load()
    else:
        assert corpus_file_pattern is not None, "Can not find model or corpus file."
        if os.path.isfile(stop_words_file):
            self.stop_words = self.load_stop_words(stop_words_file)
        self.sentences, corpus = self.get_corpus(corpus_file_pattern)
        self.bm25 = bm25.BM25(corpus)
        self.dump()
def index_bm25():
    lemmas = df['lemmas'].tolist()
    lemmas = [doc.split(' ') for doc in lemmas]
    dictionary = corpora.Dictionary(lemmas)
    corpus = [dictionary.doc2bow(text) for text in lemmas]
    bm25_obj = bm25.BM25(corpus)
    return bm25_obj, dictionary
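# Hedged usage sketch for index_bm25: it assumes the module-level DataFrame `df`
# already holds a 'lemmas' column, and the query text here is hypothetical.
# Note that older gensim releases also require an average_idf argument to get_scores.
bm25_obj, dictionary = index_bm25()
query_bow = dictionary.doc2bow("example query text".split())
scores = bm25_obj.get_scores(query_bow)
best_idx = scores.index(max(scores))  # index of the best-matching lemma document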
def getScores(dictionary):
    # Relies on module-level `corpus` and `query_str`; `dictionary` is unused here.
    bm25Model = bm25.BM25(corpus)
    query = []
    # Tokenize the query
    for word in query_str.strip().split():
        query.append(word)
    scores = bm25Model.get_scores(query)
    return scores
def get_most_similarity_article(self):
    """
    Compute average_idf from the article corpus, for the later relevance calculation.
    """
    dictionary = corpora.Dictionary(self.corpus)
    bm25Model = bm25.BM25(self.corpus)
    average_idf = sum(map(lambda k: float(bm25Model.idf[k]),
                          bm25Model.idf.keys())) / len(bm25Model.idf.keys())
    return [bm25Model, average_idf]
def doc_process_bm25(raw_docs):
    gen_docs = [[w.lower() for w in tokenizer.tokenize(text) if w not in stop_words]
                for text in raw_docs]
    # dictionary = gensim.corpora.Dictionary(gen_docs)
    # corpus = [dictionary.doc2bow(gen_doc) for gen_doc in gen_docs]
    bm25Model = bm25.BM25(gen_docs)
    return bm25Model
def _build(self):
    # extract classes
    self.classes = self.api.classes
    self.class_docs = {
        c.path: self._preprocess_text(c.embedded_text).split(" ")
        for c in self.classes
    }
    self.index = bm25.BM25([self.class_docs[c.path] for c in self.classes])
def init_bm25(self, corpus):
    ids, questions = [], []
    for row in corpus:
        ids.append(row['id'])
        questions.append(row['tokens'])
    self.idx2id = dict([(i, qid) for i, qid in enumerate(ids)])
    self.id2idx = dict([(qid, i) for i, qid in enumerate(ids)])
    self.bm25 = bm25.BM25(questions)
def creat_model(path_model, path_avgidf):
    print("Creating bm25 model...")
    bm25Model = bm25.BM25(corpus)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    pickle.dump(bm25Model, open(path_model, 'wb'))
    pickle.dump(average_idf, open(path_avgidf, 'wb'))
    return bm25Model, average_idf
def __init__(self):
    self.tokenizer = tokenizers.get_class('ltp')()
    self.docdb = retriever.get_class('sqlite')()
    self.corpus = retriever.utils.load_corpus(retriever.DEFAULTS['bm25_corpus_path'])
    self.bm25model = bm25.BM25(self.corpus)
    self.avg_idf = sum(map(lambda k: float(self.bm25model.idf[k]),
                           self.bm25model.idf.keys())) / len(self.bm25model.idf.keys())
    self.doc_titles = self.docdb.get_doc_ids()
    self.idx2title = {idx: self.doc_titles[idx] for idx in range(len(self.doc_titles))}
def train(self):
    if self.fresh:
        print("Re-tokenizing and building the BM25 model...")
        segs = self.initialize()
    else:
        print("Loading existing tokenization results from %s and building the BM25 model..."
              % file_questions_segs)
        segs = read_file(file_questions_segs)
        segs = [eval(x) for x in segs]
    self.model = bm25.BM25(segs)
def training_bm25_model(data_set):
    sentence_tmp_list = list()
    for data in data_set:
        seg_list = jieba.cut(data)
        tmp_list = list()
        for seg in seg_list:
            tmp_list.append(seg)
        sentence_tmp_list.append(tmp_list)
    model = bm25.BM25(sentence_tmp_list)
    return model
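# Hedged usage sketch for training_bm25_model; the sentences and query below are
# hypothetical, and jieba is assumed to be importable as in the function above.
# Older gensim releases also expect an average_idf argument to get_scores.
sample_model = training_bm25_model(["今天天气很好", "明天可能下雨"])
query_tokens = list(jieba.cut("天气怎么样"))
print(sample_model.get_scores(query_tokens))  # one score per training sentence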
def build_bm25(datas):
    corpus = []
    docid2index = {}
    for cur_id, cur_data in enumerate(datas):
        corpus.append(cur_data['doc1']['tokens_without_stopwords'])
        docid2index[cur_data['doc1']['docid']] = cur_id * 2
        corpus.append(cur_data['doc2']['tokens_without_stopwords'])
        docid2index[cur_data['doc2']['docid']] = cur_id * 2 + 1
    bm25Model = bm25.BM25(corpus)
    return bm25Model, docid2index
def get_fitness_answer(input_seq):
    """Fall back to BM25 matching to answer the question."""
    input_seq = get_final_input(input_seq)
    question_list, question_answer_direct = get_bm_data()
    bm25Model = bm25.BM25(question_list)
    scores = bm25Model.get_scores(input_seq)
    max_score = max(scores)
    idx = scores.index(max(scores))
    answer = get_key(question_answer_direct, question_list[idx])
    answer = str(answer[0])
    return max_score, answer
def BM25(question_input):
    documentation_id = question_input["docid"]
    sentence = []
    corpus = []
    temp = []
    temp_fini = []
    for word in documentation[documentation_id]["text"]:
        sentence.append([word])
    for i in sentence:
        for k in i:
            word_list = k.split(" ")
            corpus.append(word_list)
            word_list = []
    query_str = question_input["question"]
    query_str_list = query_str.split(" ")
    simply_corpus = simply(corpus)
    bm25Model = bm25.BM25(simply_corpus)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    scores = bm25Model.get_scores(query_str_list, average_idf)
    position = scores.index(max(scores))
    aim_sentence = documentation[documentation_id]["text"][position]
    aim_sentence_list = aim_sentence.split(".")
    for a in aim_sentence_list:
        temp_word = a.split(" ")
        for k in temp_word:
            temp.append(k)
        temp_fini.append(temp)
        temp = []
    tmp_fini_simply = simply(temp_fini)
    bm25Model = bm25.BM25(tmp_fini_simply)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    scores = bm25Model.get_scores(query_str_list, average_idf)
    position = scores.index(max(scores))
    aim_sentence = aim_sentence_list[position]
    # print("aim_sentence", aim_sentence)
    return aim_sentence
def bm25_sim(corpus, sent, topk=5):
    '''
    corpus: ['*****', '******', '******', ...]
    sent: ['**', '*', '***', '**', ...]
    '''
    model = bm25.BM25(corpus)
    scores = model.get_scores(sent)
    scores = sorted(list(enumerate(scores)), key=lambda k: k[1], reverse=True)[:topk]
    index = [idx[0] for idx in scores]
    return index
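# Hedged usage sketch for bm25_sim; the token lists below are hypothetical
# stand-ins for the masked examples in the docstring above.
docs = [["weather", "beijing"], ["weather", "shanghai"], ["food", "shenzhen"]]
print(bm25_sim(docs, ["weather"], topk=2))  # indices of the two closest documents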
def create_bm25_model(questions, answers):
    questions_tokens = []
    for q in tqdm(questions, desc="cut"):
        q_tokens = n_grams(q, n)
        questions_tokens.append(q_tokens)
    model = bm25.BM25(questions_tokens)
    average_idf = sum(float(val) for val in model.idf.values()) / len(model.idf)
    data = [model, answers, average_idf]
    save_to_pkl(file=pkl_bm25, data=data)
    return model, answers, average_idf
def __init__(self, corpus):
    """
    Parameters
    ----------
    corpus : list of list of str
        Given corpus.
    """
    self.bm25 = bm25.BM25(corpus)
    self.average_idf = sum(
        map(lambda k: float(self.bm25.idf[k]),
            self.bm25.idf.keys())) / len(self.bm25.idf.keys())
def train_text():
    # Preprocess the test documents
    test_doc = []
    test_datas = pd.read_csv("test_data.csv", encoding="gbk")
    test_titles = test_datas["title"]
    for title in test_titles:
        test_doc.append(title)
    test_doc_list = []
    for doc in test_doc:
        doc_list = [word for word in jieba.cut(doc)]
        test_doc_list.append(doc_list)

    # Preprocess the training set and filter out noisy titles
    # (the original appended to all_doc_list before defining it, then reset it
    # and looped over a never-populated train_doc list)
    datas = pd.read_csv("train_data.csv")
    train_titles = datas["title"]
    all_doc_list = []
    for title in train_titles:
        if 13 < len(title) < 500:
            doc_list = [word for word in jieba.cut(title)]
            all_doc_list.append(doc_list)
        else:
            all_doc_list.append("。")

    # Build the dictionary
    dictionary = corpora.Dictionary(all_doc_list)
    dictionary.keys()
    print(dictionary.num_pos)
    dictionary.filter_extremes(no_below=25, no_above=0.5, keep_n=12330000)

    bm25Model = bm25.BM25(all_doc_list)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    print(average_idf)

    results = []
    for doc_test_list in test_doc_list:
        score = bm25Model.get_scores(doc_test_list, average_idf)
        similiar_sorted = sorted(enumerate(score), key=lambda item: -item[1])[:21]
        indexs = [str(item[0] + 1) for item in similiar_sorted]
        results.append(" ".join(indexs))

    # Write results to file
    with open("answers.txt", "w") as f:
        for item in results:
            item = item.strip().split()
            f.write("source_id" + "\t" + "target_id" + "\n")
            for i in range(1, 21):
                f.write(item[0] + "\t" + item[i] + "\n")
def load_bm25():
    data = load_data()
    lst_question = data['question'].apply(
        lambda x: preprocessing(str(x))).tolist()
    # lst_answer = data['answer'].apply(lambda x: preprocessing(str(x))).tolist()
    # lst_qa = lst_question + lst_answer
    texts = [item.split() for item in lst_question]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    bm25_obj = bm25.BM25(corpus)
    params = {"BM25": bm25_obj, "texts": texts, 'dictionary': dictionary}
    return params
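# Hedged usage sketch for load_bm25: query the returned BM25 object the same way
# the corpus was built (bag-of-words via the returned dictionary). The query text
# is hypothetical, and preprocessing is assumed to return a whitespace-joined string
# as implied above; older gensim releases also need an average_idf argument.
params = load_bm25()
query_bow = params['dictionary'].doc2bow(preprocessing("example question").split())
scores = params['BM25'].get_scores(query_bow)
best_idx = scores.index(max(scores))
print(" ".join(params['texts'][best_idx]))  # closest stored question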
def BMsort(list1, query_str):
    # list1: the list of news documents; query_str: the user's query string
    dic = corpora.Dictionary(list1)
    bm25Model = bm25.BM25(list1)
    average_idf = sum(
        map(lambda k: float(bm25Model.idf[k]), bm25Model.idf.keys())) / len(
            bm25Model.idf.keys())
    query_str = jieba.cut(query_str)
    query_str = " ".join(query_str)
    query = []
    for word in query_str.strip().split():
        query.append(word)
    scores = bm25Model.get_scores(query, average_idf)
    return scores
def baselines_eval():
    rankings_to_eval = read_query_test_rankings()
    qrels = parse_qrels()
    query_ids = list(qrels.keys())
    query_lookup = get_robust_eval_queries()
    queries = [query_lookup[query_id] for query_id in query_ids]
    k = 10 if len(sys.argv) == 1 else int(sys.argv[1])
    document_lookup = read_cache(name('./doc_lookup.json', ['with_titles']),
                                 get_robust_documents_with_titles)
    document_title_to_id = read_cache('./document_title_to_id.json',
                                      lambda: print('failed'))
    ordered_rankings_to_eval = [[document_title_to_id[title]
                                 for title in rankings_to_eval[query]]
                                for query in query_ids]
    ordered_qrels = [[document_title_to_id[title] for title in qrels[query]]
                     for query in query_ids]
    document_id_to_title = _.invert(document_title_to_id)
    doc_ids = range(len(document_id_to_title))
    documents = [document_lookup[document_id_to_title[doc_id]] for doc_id in doc_ids]
    tokenizer = Tokenizer(
        rules=[handle_caps, fix_html, spec_add_spaces, rm_useless_spaces])
    tokenized_documents = read_cache(
        'tok_docs.json',
        lambda: tokenizer.process_all(clean_documents(documents)))
    tokenized_queries = tokenizer.process_all(clean_documents(queries))
    bm25 = gensim_bm25.BM25(tokenized_documents)
    # with open('./caches/106756_most_common_doc.json', 'r') as fh:
    #     doc_token_set = set(json.load(fh))
    # corpus, token_lookup = tokens_to_indexes(tokenized_documents,
    #                                          None,
    #                                          token_set=doc_token_set)
    # corpus = [[[token_lookup[term], f] for term, f in doc_fs.items()] for doc_fs in bm25.f]
    # tfidf = TfidfModel(corpus)
    # lsi = LsiModel(tfidf, id2word=_.invert(token_lookup), num_topics=300)
    glove_rankings = []
    # lsi_rankings = []
    glove = get_glove_lookup(embedding_dim=300, use_large_embed=True)
    encoded_docs = torch.stack(
        [encode_glove_fs(glove, bm25.idf, doc_fs) for doc_fs in bm25.f])
    encoded_docs = encoded_docs / torch.norm(encoded_docs, dim=1).unsqueeze(1)
    for q, qml_ranking in progressbar(zip(tokenized_queries, ordered_rankings_to_eval),
                                      max_value=len(tokenized_queries)):
        doc_ids = qml_ranking[:k] if '--rerank' in sys.argv else None
        glove_rankings.append(
            rank_glove(glove, bm25.idf, encoded_docs, q, doc_ids=doc_ids))
        # lsi_rankings.append(rank_lsi(lsi, tfidf,
        #                              [token_lookup[term] if term in token_lookup else 0
        #                               for term in q],
        #                              doc_ids=doc_ids))
    print('indri:', metrics_at_k(ordered_rankings_to_eval, ordered_qrels, k))
    print('glove:', metrics_at_k(glove_rankings, ordered_qrels, k))
def bm25_sim(corpus, result, topk=5):
    data = []
    with open(corpus, encoding='utf-8') as fcor:
        for line in tqdm(fcor):
            line = json.loads(line.strip())
            data.append(line['query'])
    with open(result, 'w', encoding='utf-8') as fres:
        stime = time.time()
        step = 1
        while len(data) > 5000:
            query = data.pop(0)
            model = bm25.BM25(data)
            scores = model.get_scores(query)
            scores = sorted(list(enumerate(scores)), key=lambda k: k[1], reverse=True)
            index = [scores[0][0]]
            index_remove = [scores[0][0]]
            temp = ''.join(query)
            for i in range(1, 100):
                idx, score = scores[i]
                if abs(score - scores[i - 1][1]) > 1e-4 and temp != ''.join(data[idx]):
                    index.append(idx)
                    index_remove.append(idx)
                if len(index) >= topk or score < 15.0:
                    break
            ids = len(index_remove)
            for i in range(ids, 100):
                if scores[i][1] >= scores[ids - 1][1] - 1.0:
                    index_remove.append(scores[i][0])
            if scores[0][1] > 20.0 and len(index) >= 3:
                output = {}
                output['query'] = ' '.join(query)
                output['candidates'] = [' '.join(data[i]) for i in index]
                fres.write(json.dumps(output, ensure_ascii=False) + '\n')
            index_remove = sorted(index_remove, reverse=True)
            for idx in index_remove:
                data.pop(idx)
            if step % 100 == 0:
                fres.flush()
                print('step: %d, spend-time: %.4f' % (step, time.time() - stime))
                stime = time.time()
            step += 1