def get_similarity(query, documents):
    # Prepend the query so it is preprocessed together with the documents;
    # the query is expected to arrive as a single-element list here.
    docs = query + documents
    docs = [word_token(d, lemma=True) for d in docs]
    tokenized_corpus = [doc.split(' ') for doc in docs]
    # print(tokenized_corpus)

    # Index the documents (everything after the query) with three BM25 variants.
    bm25 = BM25Okapi(tokenized_corpus[1:])
    bm25plus = BM25Plus(tokenized_corpus[1:])
    bm25L = BM25L(tokenized_corpus[1:])

    query = tokenized_corpus[0]
    # print(query)

    bm25_scores = bm25.get_scores(query)
    bm25plus_scores = bm25plus.get_scores(query)
    bm25L_scores = bm25L.get_scores(query)

    # Pair each score with its document index and sort by score, descending.
    bm25_scores = [(i, v) for i, v in enumerate(bm25_scores)]
    bm25plus_scores = [(i, v) for i, v in enumerate(bm25plus_scores)]
    bm25L_scores = [(i, v) for i, v in enumerate(bm25L_scores)]
    bm25_scores.sort(key=lambda x: x[1], reverse=True)
    bm25plus_scores.sort(key=lambda x: x[1], reverse=True)
    bm25L_scores.sort(key=lambda x: x[1], reverse=True)
    # print(bm25_scores)
    # print(bm25plus_scores)
    # print(bm25L_scores)

    return bm25_scores, bm25plus_scores, bm25L_scores
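# A minimal, self-contained sketch of the ranking pattern used above: index a toy
# corpus with BM25Okapi, BM25L and BM25Plus from rank_bm25 and compare the
# (index, score) rankings they produce for one query. The corpus and query here
# are illustrative assumptions, not data from the snippet above.
from rank_bm25 import BM25Okapi, BM25L, BM25Plus

toy_corpus = [
    "the cat sat on the mat",
    "dogs chase cats",
    "a quick brown fox",
]
tokenized_toy_corpus = [doc.split(" ") for doc in toy_corpus]
tokenized_toy_query = "cat on mat".split(" ")

for name, model_cls in [("BM25Okapi", BM25Okapi), ("BM25L", BM25L), ("BM25Plus", BM25Plus)]:
    model = model_cls(tokenized_toy_corpus)
    scores = model.get_scores(tokenized_toy_query)
    ranked = sorted(enumerate(scores), key=lambda x: x[1], reverse=True)
    print(name, ranked)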
def bm25_classifier(query, descriptions, labels):
    """
    Computes BM25 scores of a given query in relation to all selected and
    preprocessed datasets and selects all datasets whose score exceeds the
    threshold mean + 4*sd.
    input: query and list of labels, output: list of labels that fit the query
    """
    preprocessed_descriptions = []
    for description in descriptions:
        preprocessed_descriptions.append(preprocessing.preprocess(str(description)))
    tokenized_corpus = [doc.split(" ") for doc in preprocessed_descriptions]
    bm25_model = BM25Plus(tokenized_corpus)
    tokenized_query = query.split(" ")
    scores = bm25_model.get_scores(tokenized_query)
    mean_scores = mean(scores)
    standard_deviation_scores = stdev(scores)
    selected = []
    for i in range(len(descriptions)):
        label = labels[i]
        score = scores[i]
        if score > (mean_scores + 4 * standard_deviation_scores):
            selected.append(label)
    return selected
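# A self-contained sketch of the same thresholding idea, without the project's
# `preprocessing` module: score every description with BM25Plus and keep the labels
# whose score lies more than k standard deviations above the mean. The descriptions,
# labels, query and k value are illustrative assumptions (the classifier above uses k = 4).
from statistics import mean, stdev
from rank_bm25 import BM25Plus

toy_descriptions = [
    "weather observations for european cities",
    "daily stock prices of tech companies",
    "hourly temperature and humidity readings",
]
toy_labels = ["weather", "finance", "climate"]
k = 1  # threshold multiplier chosen for this tiny toy corpus

tokenized = [d.split(" ") for d in toy_descriptions]
scores = BM25Plus(tokenized).get_scores("temperature readings".split(" "))
threshold = mean(scores) + k * stdev(scores)
selected = [label for label, score in zip(toy_labels, scores) if score > threshold]
print(selected)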
def get_sparse_embedding(self):
    # Load the BM25 index from a pickle if it exists, otherwise build and save it.
    pickle_name = "bm25_pororo.bin"
    emd_path = os.path.join(self.data_path, pickle_name)
    if os.path.isfile(emd_path):
        with open(emd_path, "rb") as file:
            self.bm25 = pickle.load(file)
        print("Embedding pickle loaded.")
    else:
        print("Build passage embedding")
        self.bm25 = BM25Plus(self.tokenized_contexts)
        with open(emd_path, "wb") as file:
            pickle.dump(self.bm25, file)
        print("Embedding pickle saved.")
def get_sentence_rank(self, query, top50_rank_list):
    self.query = query
    self.top50_rank_list = top50_rank_list
    top50_files_sent = []
    # for filename, rank in self.top50_rank_list:
    #     filename = filename.strip('.txt')
    #     sent_filename = filename + '_1.txt'
    #     top50_files_sent.append(sent_filename)
    #     f = open(sent_filename, 'r')
    #     document = f.read()
    #     f.close()
    #     sentences = nltk.sent_tokenize(document)  # this gives us a list of sentences
    #     # now loop over each sentence and tokenize it separately
    #     sentence_terms = ''
    #     for sentence in sentences:
    #         sentence_terms = self.tokenize(sentence)
    #         self.sentence_corpus.append(sentence_terms)
    for id, rank in self.top50_rank_list:
        filename = id.replace('Extracted Docs', 'Unprocessed_Docs')
        f = open(filename, 'r')
        document = f.read()
        f.close()
        sentences = nltk.sent_tokenize(document)  # this gives us a list of sentences
        # now loop over each sentence and tokenize it separately
        sentence_terms = ''
        for sentence in sentences:
            sentence_terms = self.tokenize(sentence)
            self.sentence_corpus.append(sentence_terms)
    if len(self.sentence_corpus) == 0:
        self.sentence_corpus = ['a']  # dummy corpus so BM25Plus is never built on an empty list
    bm25 = BM25Plus(self.sentence_corpus)
    # query1 = "what debts did qintex group leave ?"
    # tokenized_query = self.tokenize(self.query)
    # self.query is assumed to already be tokenized when it reaches this point.
    sent_scores = bm25.get_scores(self.query)
    # print(sent_scores)
    # sent_scores_sorted = np.argsort(sent_scores)[::-1][:self.no_of_docs_reqd]
    top_50_sentences = bm25.get_top_n(self.query, self.sentence_corpus, n=self.no_of_docs_reqd)
    clean_sentences = []
    for tokened_sentence in top_50_sentences:
        clean_sentence = " ".join(tokened_sentence)
        clean_sentences.append(clean_sentence)
        # print(clean_sentence)
    return clean_sentences
def get_embedding_BM25(self):
    # Load the BM25 index from a pickle if it exists, otherwise build and save it.
    pickle_name = "BM25_embedding.bin"
    emd_path = os.path.join(self.data_path, pickle_name)
    if os.path.isfile(emd_path):
        with open(emd_path, "rb") as file:
            self.BM25 = pickle.load(file)
        print("BM25 Embedding pickle loaded.")
    else:
        print("Build passage BM25_embedding")
        tokenized_contexts = [self.tokenizer(i) for i in self.contexts]
        self.BM25 = BM25Plus(tokenized_contexts)
        with open(emd_path, "wb") as file:
            pickle.dump(self.BM25, file)
        print("BM25 Embedding pickle saved.")
def process_document(tokenized_corpus: list[list[str]],
                     tokenized_document: list[str],
                     buffer: list[dict],
                     queries: set[str]) -> list[dict]:
    prev_word = ''
    word_ind = 1
    words_total = len(tokenized_document)
    for word in tokenized_document:
        if word != chunk_keyword and '’' not in word and len(word) >= max_word_len:
            query = chunk_keyword + ' ' + prev_word + ' ' + word
            # query = chunk_keyword + ' ' + word
            tokenized_query = tokenize_query(query)
            orders = []
            # Rank the corpus with all three BM25 variants and record each ordering.
            for ind, bm25 in enumerate([
                    BM25L(tokenized_corpus),
                    BM25Okapi(tokenized_corpus),
                    BM25Plus(tokenized_corpus)
            ]):
                documents_order = get_documents_order(tokenized_query, bm25, ind)
                orders.append(documents_order)
            # Keep the query only if the three variants disagree and both words are new.
            if all_different(orders) and (prev_word not in queries) and (word not in queries):
                query_entry = {
                    'query': query,
                    'orders': list(zip(['BM25L', 'BM25Okapi', 'BM25Plus'], orders))
                }
                buffer.append(query_entry)
                queries.add(prev_word)
                queries.add(word)
        prev_word = word
        if word_ind % 100 == 0:
            print(f'{word_ind}/{words_total} words processed')
        word_ind += 1
    return buffer
def get_doc_rank(self, query, scores):
    self.query = query
    self.scores = scores
    for id in self.scores:
        f = open(id, 'r')
        document = f.read()
        f.close()
        doc_terms = self.tokenize(document)
        self.doc_corpus.append(doc_terms)
    if len(self.doc_corpus) == 0:
        self.doc_corpus = ['a']  # dummy corpus so BM25Plus is never built on an empty list
    bm25 = BM25Plus(self.doc_corpus)
    # query1 = "what debts did qintex group leave ?"
    # tokenized_query = self.tokenize(self.query)
    # self.query is assumed to already be tokenized when it reaches this point.
    doc_scores = bm25.get_scores(self.query)
    # print(doc_scores)
    doc_scores_sorted = np.argsort(doc_scores)[::-1]
    # top_n = bm25.get_top_n(tokenized_query, self.doc_corpus, n=1)
    # print(doc_scores_sorted)
    self.write_to_dict(doc_scores_sorted.tolist())
    self.top50_rank_list = self.top50_rank()
    return self.top50_rank_list
@application.route('/search/', methods=['POST'])
def search():
    query = request.form['query'].strip()
    docs = engine.get_top_k_docs(query, k=50)
    return jsonify(docs)


@application.route('/save_relevance/', methods=['POST'])
def save_relevance():
    query = request.form['query'].strip()
    doc_id = request.form['doc_id']
    rel_score = request.form['rel_score']
    engine.store_relevance_judgements(query, doc_id, rel_score)
    return ('', 204)


if __name__ == '__main__':
    print('Initializing Search Engine...')
    corpus = get_corpus('data/corpus1.pkl') + get_corpus('data/corpus2.pkl')
    tokenized_corpus = (get_tokenized_corpus('data/tokenized_corpus1.pkl') +
                        get_tokenized_corpus('data/tokenized_corpus2.pkl'))
    model = BM25Plus(tokenized_corpus)
    engine = SearchEngine(model, corpus, 'data/relevance_feedback.txt')
    print('Done!')
    application.run()
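# A hedged usage sketch for the /search/ endpoint above, using the requests library
# once the app is running on Flask's default host and port; the query text is
# illustrative and not taken from the project.
import requests

resp = requests.post("http://127.0.0.1:5000/search/",
                     data={"query": "qintex group debts"})
print(resp.json())  # the top-50 documents returned by engine.get_top_k_docs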
def get_resp():
    if request.method == 'POST':
        sent = request.form["sent"]
        query = request.form["query"]
        weight1 = request.form["weight1"]
        weight2 = request.form["weight2"]
        n = request.form["n"]
        # Fall back to defaults when the form fields are left empty.
        if weight1 == "":
            weight1 = 0.5
        else:
            weight1 = float(weight1)
        if weight2 == "":
            weight2 = 0.5
        else:
            weight2 = float(weight2)
        if n == "":
            n = 3
        else:
            n = int(n)
        sentences = sent.split(',')
        candidate_embeddings = module.signatures['response_encoder'](
            input=tf.constant(sentences),
            context=tf.constant(sentences))['outputs']
        query_embedding = module.signatures['question_encoder'](
            tf.constant([query]))['outputs'][0]
        similarities = angular_similarity([query_embedding], candidate_embeddings)
        score = similarities[0]  # Ranker 1 - USE-QA angular similarity
        # Ranker 2 - BM25+ over the preprocessed candidate sentences.
        preprocessed_query = preprocess(query)
        preprocessed_sentences = [preprocess(sent) for sent in sentences]
        tokenized_corpus = [doc.split(" ") for doc in preprocessed_sentences]
        ranker = BM25Plus(tokenized_corpus)
        tokenized_query = preprocessed_query.split(" ")
        r2 = ranker.get_scores(tokenized_query)
        w1 = weight1
        w2 = weight2
        print("######################")
        print("weight one : ", w1)
        print("weight two : ", w2)
        print("####################")
        # Weighted combination of the two rankers, then return the top-n sentences.
        weighted_final = w1 * score + w2 * r2
        zipped = list(zip(weighted_final, sentences))
        resp1 = return_top_n(zipped, n)
        final = {"Query": query, "result": resp1}
        return final
    return '''<form method="POST">
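# One plausible implementation of the angular_similarity helper used above (it is not
# defined in this snippet): the angular-distance-based similarity often recommended for
# Universal Sentence Encoder embeddings, 1 - arccos(cosine)/pi, computed pairwise with
# numpy. The function name and shapes are assumptions inferred from how it is called.
import numpy as np

def angular_similarity_sketch(queries, candidates):
    queries = np.asarray(queries, dtype=float)
    candidates = np.asarray(candidates, dtype=float)
    # Normalize each embedding, then take pairwise cosines between rows.
    q = queries / np.linalg.norm(queries, axis=1, keepdims=True)
    c = candidates / np.linalg.norm(candidates, axis=1, keepdims=True)
    cosines = np.clip(q @ c.T, -1.0, 1.0)
    # Map angular distance into a similarity in [0, 1].
    return 1.0 - np.arccos(cosines) / np.pi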
# ----------------------------------------------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------------------------------
from rank_bm25 import BM25Okapi, BM25L, BM25Plus
from Functions import *
from QEModels import *

corpus = DOCTEXT_list
tokenized_corpus = [doc.split(" ") for doc in corpus]

# Retrieval MODELS ---------------------------------------------------------------------------------------------------
bm25 = BM25Okapi(tokenized_corpus)
bm25l = BM25L(tokenized_corpus)
bm25plus = BM25Plus(tokenized_corpus)

# RANKING --------------------------------------------------------------------------------------------------------------
# Original Query -------------------------------------------------------------------------------------------------------
runLines = []
with open("robust-BM25-run.txt", "w") as frun:
    for i in range(len(QUERYTEXT_list)):
        query = QUERYTEXT_list[i]
        tokenized_query = query.split(" ")
        print(tokenized_query)
        # BM25
        doc_scores_bm25 = bm25.get_scores(tokenized_query)
        bm25TopN = bm25.get_top_n(tokenized_query, corpus, n=1000)
        for j in bm25TopN:
print("字典造好") # tokenized_corpus = [doc.split(" ") for doc in respList] # bm25 = BM25Okapi(tokenized_corpus) print("训练好") mesIdList = test["message_id"].unique() midL = []; ridL = [] for mid in mesIdList: midL+=[mid for i in range(10)] query = getText(mid) tokenized_query = query.split(" ") corpus = tsv[tsv["message_id"]==mid]["response"].drop_duplicates("first").tolist() tokenized_corpus = [str(doc).split(" ") for doc in corpus] bm25 = BM25Plus(tokenized_corpus) res = bm25.get_top_n(tokenized_query, corpus, n=10) res1 = list(map(lambda x:getId(x),res)) if len(res1)<10: res2 = random.sample(list(respIdList),10-len(res1)) # # remainid = test["response_id"].drop_duplicates(keep="first").values # # print(remainid) # remaincorpus = list(map(lambda x: getText(x), remainid)) # tokenized_corpus = [doc.split(" ") for doc in remaincorpus] # bm25 = BM25L(tokenized_corpus) # res = bm25.get_top_n(tokenized_query, remaincorpus, n=10-len(corpusid)) # # res2 = list(map(lambda x: text2id[x], res)) res1 = list(res1) + list(res2) print(len(res1))
def corpus_index():
    # Load the cached documents, tokenize them with stopwords removed,
    # and build a BM25+ index over the corpus.
    cache_dict = open_cache()
    corpus = list(cache_dict.values())
    tokenized_corpus = [remove_stopwords(str(doc).split(" ")) for doc in corpus]
    bm25plus = BM25Plus(tokenized_corpus)
    return corpus, bm25plus, cache_dict
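# A hedged, self-contained sketch of how an index like the one returned by corpus_index()
# might be queried. The toy cache, the inline stopword set, and the query are assumptions;
# in the real code open_cache() and remove_stopwords() come from the project itself.
from rank_bm25 import BM25Plus

toy_cache = {
    "doc1": "the history of information retrieval",
    "doc2": "bm25 is a ranking function used by search engines",
    "doc3": "cooking recipes for the weekend",
}
toy_stopwords = {"the", "of", "is", "a", "by", "for"}

toy_corpus = list(toy_cache.values())
tokenized = [[t for t in doc.split(" ") if t not in toy_stopwords] for doc in toy_corpus]
index = BM25Plus(tokenized)

query_tokens = [t for t in "bm25 ranking search".split(" ") if t not in toy_stopwords]
print(index.get_top_n(query_tokens, toy_corpus, n=2))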