def relevance_finder(query, df, flag):
    """Score every row of `df` against `query` with BM25 and sort by relevance."""
    df_new = df.reset_index()
    # Rank against questions when flag == 'q', otherwise against reviews.
    column = 'tokenised question' if flag == 'q' else 'tokenised reviews'
    bm = BM25(df_new[column])
    tf = bm.get_tf_for_query(query)
    idf = bm.get_idf_for_query(query)
    df_new['relevance_score'] = bm.get_bm25_scores(query, tf, idf)
    df_new.sort_values(['relevance_score'], ascending=False, inplace=True)
    return df_new
def do_bm25(query, df, best_n=3):
    query = normalize_terms(word_tokenize(query))
    text_lines_total = df.text.values
    text_lines_tokens = []
    for title in text_lines_total:
        title = re.sub(r'\W', ' ', title)   # replace any non-word character with a space
        title = re.sub(r'\s+', ' ', title)  # collapse runs of whitespace into a single space
        text_lines_tokens.append(word_tokenize(title))
    news = [normalize_terms(sentence) for sentence in text_lines_tokens]
    bm25 = BM25(news)
    best_indexes = bm25.ranked(query, best_n)
    best_sentences = [text_lines_total[ind] for ind in best_indexes]
    return best_sentences
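# The BM25 helpers used in the snippets above and below are project-specific, but the
# score they produce is standard Okapi BM25. The class below is an illustrative,
# self-contained sketch of that scoring function over a list of token lists; the name
# SimpleBM25 and the defaults k1=1.5, b=0.75 are assumptions, not taken from any of
# the projects. Its ranked() mirrors the bm25.ranked(query, best_n) call in do_bm25.
import math
from collections import Counter


class SimpleBM25:
    """Minimal Okapi BM25 over a corpus given as a list of token lists (sketch)."""

    def __init__(self, corpus, k1=1.5, b=0.75):
        self.k1, self.b = k1, b
        self.doc_lens = [len(doc) for doc in corpus]
        self.avgdl = sum(self.doc_lens) / len(corpus)
        self.doc_tfs = [Counter(doc) for doc in corpus]
        # Document frequency per term, feeding the idf component.
        df = Counter()
        for tf in self.doc_tfs:
            df.update(tf.keys())
        n = len(corpus)
        self.idf = {t: math.log(1 + (n - d + 0.5) / (d + 0.5)) for t, d in df.items()}

    def get_scores(self, query):
        """Return one BM25 score per document for a tokenised query."""
        scores = []
        for tf, dl in zip(self.doc_tfs, self.doc_lens):
            s = 0.0
            for term in query:
                if term not in tf:
                    continue
                freq = tf[term]
                # Term-frequency saturation (k1) and document-length normalisation (b).
                norm = freq * (self.k1 + 1) / (freq + self.k1 * (1 - self.b + self.b * dl / self.avgdl))
                s += self.idf[term] * norm
            scores.append(s)
        return scores

    def ranked(self, query, n):
        """Indices of the n best-scoring documents, highest first."""
        scores = self.get_scores(query)
        return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:n]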
def relevant_docs_from_posting(self, query):
    """
    Load the posting list and count the number of relevant documents per term.
    :param query: query
    :return: dictionary of relevant documents.
    """
    # ---------------- query expansion / spelling correction ----------------
    if self.wordnet_toggle:
        query = wn.expand_query(query, self._parser.stop_words)
    if self.spelling_corr_toggle:
        query = sp.correct_spelling(query)
    # ------------------------------------------------------------------------
    N = self._parser.num_of_docs
    total_len = self._parser.total_doc_length
    inverted_idx = self.indexer.inverted_idx
    # cosine_sim = CosineSimCalculator(inverted_idx, query, N)
    # cosine_sim.create_wiq_dict()
    # relevant_docs = cosine_sim.calc_similarity()
    bm25 = BM25(inverted_idx, query, N, total_len)
    bm25.create_wiq_dict()
    relevant_docs = bm25.calc_bm25()
    # Boost documents that appear in the retweet dictionary.
    for doc, score in relevant_docs.items():
        if doc in self._parser.retweet_dict:
            relevant_docs[doc] += self._parser.retweet_dict[doc]
    return relevant_docs
def idf():
    word = request.args.get('word')
    bm25 = BM25()
    result = dict()
    result['word'] = word
    result['idf'] = bm25.idf(word)
    return jsonify(result)
def search(self, query, user_resource=None):
    self.index_search_result = dict()
    self.content_search_result = dict()
    tokenizer = Tokenizer()
    query_tokens = tokenizer.processItem(query)
    self.index_search_result, self.content_search_result = self.db.get_content_by_index(
        query_tokens)
    combined_result = combine_index_content_result(
        self.index_search_result,
        self.content_search_result,
    )
    # BM25 relevance score for the combined result.
    bm25 = BM25(query_tokens)
    score = bm25.get_relevance_score(combined_result)
    # PageRank score, either global or restricted to the user's resources.
    if user_resource is None:
        pr_score = self.pr.get_score_for_search(self.content_search_result)
    else:
        pr_score = PageRank.filter_score_from_pr_score(
            self.content_search_result, user_resource["pr_score"])
    combined_score = combine_score(score, pr_score)

    def get_score(content):
        print(content['url'], ": BM25 : ", score[content['url']], "PR: ", pr_score[content['url']])
        return combined_score[content['url']]

    return sorted(self.content_search_result, key=get_score, reverse=True)
def train_bm25(queries: List[Query], collection: Corpus) -> List[Query]:
    bm25 = BM25(collection)
    queries_list: List[Query] = []
    for query in queries:
        # Retrieve the TOP_N best answers and attach them to the query.
        top_10 = bm25.top_n(query, n=TOP_N)
        query.update_answers(top_10, n=TOP_N)
        queries_list.append(query)
    return queries_list
def __init__(self, docs):
    self.docs = docs
    self.bm25 = BM25(docs)
    self.D = len(docs)
    self.d = 0.85            # damping factor
    self.weight = []
    self.weight_sum = []
    self.vertex = []
    self.max_iter = 200      # maximum number of iterations
    self.min_diff = 0.001    # convergence threshold
    self.top = []
def __init__(self, docs):
    self.d = 0.85            # damping factor
    self.max_iter = 200      # maximum number of iterations
    self.min_diff = 0.001    # convergence threshold
    self.docs = docs
    self.bm25 = BM25(docs)
    self.D = len(docs)
    self.weight = []
    self.weight_sum = []
    self.vertex = defaultdict(lambda: 1)
    self.top = {}
    self.solve()
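# The two constructors above set up a TextRank-style ranker that uses BM25 as the
# sentence-similarity measure: d is the damping factor, max_iter/min_diff control
# convergence, and weight/weight_sum hold the pairwise similarities. The projects'
# own solve() methods are not shown; textrank_sketch below is a hypothetical,
# self-contained sketch of that kind of iteration over a plain nested-list matrix,
# not the original implementation.
def textrank_sketch(sim, d=0.85, max_iter=200, min_diff=0.001):
    """Sketch of a weighted-TextRank iteration over a similarity matrix `sim`."""
    n = len(sim)
    weight_sum = [sum(row) or 1.0 for row in sim]   # out-weight of each sentence
    scores = [1.0] * n                              # every vertex starts at 1
    for _ in range(max_iter):
        new_scores = []
        for i in range(n):
            # Damped sum of neighbours' scores, each weighted by its normalised
            # similarity to sentence i.
            rank = sum(sim[j][i] / weight_sum[j] * scores[j] for j in range(n) if j != i)
            new_scores.append((1 - d) + d * rank)
        converged = max(abs(a - b) for a, b in zip(scores, new_scores)) < min_diff
        scores = new_scores
        if converged:
            break
    return scores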
def bm25(data_dir, dataset_type, tokenizer=default_tokenizer):
    train_path = os.path.join(data_dir, 'train.csv')
    # Reuse a previously fitted model if one has been pickled next to the data.
    if os.path.isfile(train_path + ".bm25"):
        with open(train_path + ".bm25", 'rb') as f:
            return pickle.load(f)
    train_dataset_reader, _ = getReadersByDatasetType(dataset_type)
    answers = []
    for (q, a) in train_dataset_reader.conversations(train_path):
        answers.append(a)
    bm25 = BM25(tokenizer)
    bm25.fit(answers)
    with open(train_path + ".bm25", 'wb') as f:
        pickle.dump(bm25, f)
    return bm25
def search(self, query, user_resource=None):
    self.index_search_result = dict()
    self.content_search_result = dict()
    # Tokens and weights are already attached to the query object.
    query_tokens = query.true_tokens
    print(query.token_weights)
    self.index_search_result, self.content_search_result = self.db.get_content_by_index(
        query_tokens, query.token_weights)
    print(len(self.index_search_result), len(self.content_search_result))
    combined_result = combine_index_content_result(
        self.index_search_result,
        self.content_search_result,
    )
    bm25 = BM25(query_tokens)
    # Combine BM25 relevance with PageRank; personalise both when a user resource is given.
    if user_resource is None:
        score = bm25.get_relevance_score(combined_result)
        pr_score = self.pr.get_score_for_search(self.content_search_result)
        combined_score = combine_score(score, pr_score)
    else:
        user = current_user
        score = bm25.get_relevance_score(combined_result, user.tags)
        pr_score = PageRank.filter_score_from_pr_score(
            self.content_search_result, user_resource["pr_score"])
        combined_score = combine_score(score, pr_score, pr=user.pr, bm25=user.bm25)

    def get_score(content):
        print(content['url'], ": BM25 : ", score[content['url']], "PR: ", pr_score[content['url']])
        return combined_score[content['url']]

    return sorted([content[0] for content in self.content_search_result], key=get_score, reverse=True)
def __init__(self, text):
    self.sentences = self.split_text(text)
    self.docs = self.sentences2docs(self.sentences)
    self.bm25 = BM25(self.docs)
GAMMA = 0.15

list_queries = utils.load_queries(utils.PARSED_QUERIES)
inverted_index = utils.load_inverted_index(
    os.path.join(utils.INDEX_DIR, "stem_False_stop_False_inverted_index.txt"))

# All three flags default to False.
parser = argparse.ArgumentParser(description="Parser for JM Smoothing")
parser.add_argument("-d", "--debug", action="store_true")
parser.add_argument("-stem", "--isstemmed", action="store_true")
parser.add_argument("-stop", "--isstopped", action="store_true")
args = parser.parse_args()

baseline_run = BM25(args, inverted_index, utils.load_corpus_stats(), list_queries)
baseline_run.compute_scores()
# bm25_scores: dict query_id -> [[doc_name, score], ...]
results = baseline_run.bm25_scores
query_mapping = utils.load_query_map()


def get_content(doc_names):
    doc_contents = []
    for doc in doc_names:
        with open(os.path.join(utils.CORPUS_DIR, "{}.txt".format(doc)), "r") as f:
            doc_contents.append(f.read())
    return doc_contents
eval_path = './eval/eval.txt'
# Number of related terms to return
k1 = 5
k2 = 15

if __name__ == '__main__':
    start = time.time()
    print('Starting')
    # Build the queries (with query expansion)
    print('Building and expanding queries from ' + query_path)
    query_list = build_query(query_path, w2v_path, vocab_path, k1)
    print('Queries built')
    # BM25 model
    print('Building the BM25 model')
    bm = BM25()
    print('BM25 model built')
    # Load the inverted index
    print('Loading the inverted index from ' + invert_table_path)
    bm.build(invert_table_path)
    print('Inverted index loaded')
    # Run the queries
    print('Running queries')
    res = start_query(bm, query_list, k2)
    print('Saving query results to ' + res_doc_path)
    get_doc_cont(res, res_doc_path, doc_path)
    # Compute P@10
    print('Saving evaluation files to ' + res_path)
    eval_res(res, res_path)
    # Evaluation
    print('Starting evaluation')
search_key = ""
for i in q.split():
    if i not in Retrieved_Stopword:
        search_key += i + " "
search_key = search_key.rstrip()  # remove the trailing space
search_key_length = len(search_key.split())
if search_key_length > 1 and search_key not in Final_Document:
    Final_Document.append(search_key)

temp = ""
adding_final_document = ""
bm25 = BM25(Final_Document[:1000])
split_final_document = ""
position = 0
for index, score in bm25.ranked(query, 25):
    # print('{} ->> {} ->> {}'.format(position, ''.join(Final_Document[index]), score))  # print ranked documents
    temp += Final_Document[index]
    temp = temp + " "
    adding_final_document += temp  # append sentence
    position = position + 1
pprint.pprint(dictOf)
print('')
print('query: ' + q)
print('+----------------------------------+')
print('')

print('TF-IDF weighting')
tfidf = TfIdf().transform(q=q, document=document)
print("Average weight: " + str(tfidf.weight_average()))
pprint.pprint(tfidf.get_weight())
print("+---------------------------------+")
print('')

print('W-IDF weighting')
widf = WIdf().transform(q=q, document=document)
print("Average weight: " + str(widf.weight_average()))
pprint.pprint(widf.get_weight())
print("+---------------------------------+")
print('')

print('TFRF weighting')
tfrf = TFRF().transform(q=q, document=document)
print("Average weight: " + str(tfrf.weight_average()))
pprint.pprint(tfrf.get_weight())
print("+---------------------------------+")
print('')

print('BM25 weighting')
bm25 = BM25().transform(q=q, document=document)
pprint.pprint(bm25.weigth())
print("+---------------------------------+")
    # - remove stopwords
    # - remove numerals
    # - stemming
    return [remove_diacritics(term).lower() for term in terms]


def remove_diacritics(text, encoding='utf8'):
    """Remove diacritics from a bytestring or unicode string, returning a unicode string."""
    nfkd_form = unicodedata.normalize('NFKD', to_unicode(text, encoding))
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode(encoding)


def to_unicode(text, encoding='utf8'):
    """Convert a string (bytestring in `encoding` or unicode) to unicode."""
    if isinstance(text, six.text_type):
        return text
    return text.decode(encoding)


# nltk.download('mac_morpho')
# `news` is a list containing lists of tokens
news = [normalize_terms(sentence) for sentence in mac_morpho.sents()]
print(repr(news[0]))

# Only the first 1000 sentences are used as an example; processing all 51397 takes a while.
bm25 = BM25(news[:1000])
query = normalize_terms(nltk.word_tokenize('inflacao'))
for position, index in enumerate(bm25.ranked(query, 5)):
    print('{} - {}'.format(position, ' '.join(news[index])))
def __init__(self, corpus, b=0.75, k1=1.2):
    self.corpus = corpus
    logger.debug(f"Instantiating BM25Search with k1={k1} b={b}")
    self.vectorizer = BM25(b=b, k1=k1)
    self.vectorizer.fit(corpus)
def search(request):
    key = request.GET.get('key')
    alogorithm_type = request.GET.get('alogorithm_type')
    print('we get alogorithm_type', alogorithm_type)
    if alogorithm_type != '2':
        alogorithm_type = '1'
    key_name = '{}_{}'.format(key, alogorithm_type)
    print('alogorithm_type', alogorithm_type)
    alogorithm_type = int(alogorithm_type)
    # Invalidate the cache entry of the other algorithm so results do not mix.
    if alogorithm_type == 2:
        cache.delete('{}_{}'.format(key, 1))
        print('this is bm25')
    else:
        cache.delete('{}_{}'.format(key, 2))
        print('this is tfidf')
    print(key_name)
    papers = cache.get(key_name)
    print(papers, 'cache')
    if papers is None:
        if alogorithm_type == 2:
            print('invoke BM25 algorithm')
            start = time.time()
            papers = BM25(key)
            end = time.time()
            print('BM25 spend', end - start)
        else:
            print('invoke TFIDF algorithm')
            start = time.time()
            papers = TFIDF(key)
            end = time.time()
            print('TFIDF spend', end - start)

    # Example: 127.0.0.1:8000/api/search?key=design&alogorithm_type=1&order=1&descend=1&year=2015-2020&author=Zelalem Mekuria&venue=ccf
    # alogorithm_type: 1 = TF-IDF, 2 = BM25
    # order: 1 = year, 2 = citation count
    # descend: 1 = descending, 2 = ascending
    # The remaining parameters filter the result set.

    # Sorting
    order_by_date = request.GET.get('order')
    descend = request.GET.get('descend')
    descend = descend == '1'
    if order_by_date == '1':
        papers = sorted(papers, key=lambda x: x.year, reverse=descend)
    elif order_by_date == '2':
        papers = sorted(papers, key=lambda x: x.n_citation, reverse=descend)

    # Filter by year range.
    year = request.GET.get('year')
    print('this is year', year)
    if year is not None and year != '':
        begin, end = year.split('-')
        temp = []
        for paper in papers:
            try:
                begin_date = datetime(year=int(begin), month=1, day=1, tzinfo=pytz.utc)
                end_date = datetime(year=int(end), month=1, day=1, tzinfo=pytz.utc)
                if begin_date <= paper.year <= end_date:
                    temp.append(paper)
            except Exception:
                temp = []
                break
        papers = temp

    # Filter by author.
    author = request.GET.get('author')
    if author is not None and author != '':
        temp = []
        for paper in papers:
            exist = paper.authors.filter(name=author).exists()
            print('author', author)
            print('exist', exist)
            if exist:
                temp.append(paper)
        papers = temp

    # Filter by venue.
    venue = request.GET.get('venue')
    if venue is not None and venue != '':
        temp = [paper for paper in papers if paper.venue == venue]
        papers = temp

    # History.objects.create()
    serializer = PaperSerializer(papers, many=True)
    return Response(serializer.data)
def search():
    word = request.args.get('q')
    bm25 = BM25()
    results = bm25.search(word)
    return jsonify(results)
def __init__(self):
    # Mode switches: train, dev, test
    train = 0
    dev = 0
    test = 1
    # Whether to load preprocessed documents from pickle or reprocess them
    load_processed_doc = 1
    load_doc_from_pkl = 1
    # Whether to evaluate BM25 accuracy
    test_BM25 = 0

    self.data = Data()
    self.config = Config()
    self.fileLoader = FileLoader(self.config, self.data)
    self.bdp = BasicDataProcessor(self.config, self.data)
    self.bm25 = BM25(self.config, self.data)
    # Unused NER tags; they are merged together with the 'O' tag.
    self.other = [
        'SET', 'MISC', 'EMAIL', 'URL', 'TITLE', 'IDEOLOGY', 'CRIMINAL_CHARGE'
    ]

    self.fileLoader.load_doc()

    # Load (or build and cache) the processed documents.
    if load_processed_doc:
        if load_doc_from_pkl:
            with open(self.config.doc_processed_path, 'rb') as f:
                self.data.doc_processed = pickle.load(f)
        else:
            self.data.doc_processed = self.bdp.process_docs(self.data.doc_texts)
            with open(self.config.doc_processed_path, 'wb') as f:
                pickle.dump(self.data.doc_processed, f)

    # Training data
    if train:
        self.fileLoader.load_training_data()
        if test_BM25:
            self.bm25.test_training_BM25_accuracy(10)
            return
        # Predict answers
        # self.predict_with_bm25_pars_sents(0)
        self.predict_with_bm25_sents(0)

    # Dev data
    if dev:
        self.fileLoader.load_dev_data()
        if test_BM25:
            self.bm25.test_BM25_par_on_dev()
            return
        # Predict answers
        self.predict_with_bm25_pars_sents(1)
        # self.predict_with_bm25_sents(1)

    # Test data
    if test:
        self.fileLoader.load_test_data()
        # Predict answers
        # self.predict_with_bm25_pars_sents(2)
        self.predict_with_bm25_sents(2)
from os import listdir

import matplotlib.pyplot as plt

from helpers import *
from bm25 import BM25
from bm25plus import BM25plus
from rank_bm25 import BM25Okapi

path = 'input/'
dirs = [f for f in listdir(path)]
print(f"{len(dirs)} documents in total")

documents = []
for fname in dirs:
    with open(path + fname, "r") as f:
        documents.append(f.read())

bm25 = BM25(documents, k1=1.2, b=0.75)
scores = bm25.get_scores("best apps daily activity exercise diabetes")

# Sort documents by descending score and write out the best one.
l = []
for i, score in enumerate(scores):
    l.append((score, i))
l.sort(key=lambda x: x[0], reverse=True)
i = l[0][1]
with open('output.txt', 'w') as f:
    f.write(documents[i])

with open("topic.txt", "r") as f:
    query_list = []
    for line in f:
        query_list.append(line.strip())
print(f"{len(query_list)} queries in total")
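# The excerpt above imports rank_bm25's BM25Okapi but stops after loading query_list.
# The lines below are a hypothetical continuation, not part of the original script:
# they score every query in query_list against the same documents with BM25Okapi,
# which expects pre-tokenised input (hence the .split() calls).
tokenised_docs = [doc.split() for doc in documents]
okapi = BM25Okapi(tokenised_docs)
for query in query_list:
    # get_top_n returns the n highest-scoring documents for the tokenised query.
    top_doc = okapi.get_top_n(query.split(), documents, n=1)[0]
    print(query, "->", top_doc[:80])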