from copy import deepcopy
from typing import List

import torch
from sentence_transformers import CrossEncoder

# Reranker, Query and Text are assumed to come from the surrounding ranking
# framework (e.g. pygaggle.rerank.base).


class SentenceTransformersReranker(Reranker):
    """Rerank texts against a query with a sentence-transformers cross-encoder."""

    def __init__(self,
                 pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-2-v2',
                 max_length=512,
                 device=None,
                 use_amp=False):
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.use_amp = use_amp
        self.model = CrossEncoder(pretrained_model_name_or_path,
                                  max_length=max_length,
                                  device=device)

    def rescore(self, query: Query, texts: List[Text]) -> List[Text]:
        texts = deepcopy(texts)
        # Run inference under autocast so mixed precision is used when enabled.
        with torch.cuda.amp.autocast(enabled=self.use_amp):
            scores = self.model.predict(
                [(query.text, text.text) for text in texts],
                show_progress_bar=False,
            )
        for text, score in zip(texts, scores):
            text.score = score.item()
        return texts
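# A minimal usage sketch for the reranker above. The query and candidate
# texts are hypothetical sample data; Query and Text are the same wrappers
# the class itself relies on.
reranker = SentenceTransformersReranker()
query = Query('what causes tides')
texts = [
    Text('Tides are caused by the gravitational pull of the moon.'),
    Text('The stock market closed higher today.'),
]
for text in reranker.rescore(query, texts):
    print('{:.4f}\t{}'.format(text.score, text.text))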
from sentence_transformers import CrossEncoder


def get_sbert_mostsimilar_crossencoder(vecs, query, query_vec, num_responses):
    # First-stage retrieval: cosine similarity over precomputed vectors
    # (most_sim_cos is assumed to be defined elsewhere in this module).
    distances, most_similar = most_sim_cos(vecs, query_vec, 50)

    # Second stage: re-score the top-50 candidates with a cross-encoder.
    model = CrossEncoder('cross-encoder/stsb-roberta-base')
    cross_inp = [[query, candidate] for candidate in most_similar]
    cross_scores = model.predict(cross_inp)

    # Sort candidates by cross-encoder score, highest first.
    cross_scores, most_similar = zip(
        *sorted(zip(cross_scores, most_similar), reverse=True))
    return (list(cross_scores[:num_responses]),
            list(most_similar[:num_responses]))
import nltk
from sentence_transformers import CrossEncoder


def get_answers_from_query(request):
    """Use information-retrieval methods to answer a user query about the input text.

    The query is answered with a BERT-style cross-encoder transformer.

    Parameters
    ----------
    request : flask.Request
        Flask request containing the article text and the query.

    Returns
    -------
    list of str
        The sentences most likely to answer the query.
    """
    text = request.form['text']
    text = check_for_url(text)  # check_for_url is assumed to be defined elsewhere
    sentences = nltk.sent_tokenize(text)
    # Drop questions; only declarative sentences are candidate answers.
    sentences = [s for s in sentences if not s.endswith("?")]
    query = request.form['query']

    # A cross-encoder scores (query, passage) pairs directly, so it is well
    # suited to returning the most likely response to a question.
    model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')
    model_inputs = [[query, passage] for passage in sentences]
    scores = model.predict(model_inputs)

    # Sort the scores in decreasing order.
    results = [{'input': inp, 'score': score}
               for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    answers = []
    print("Query:", query)
    for hit in results[:3]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1], '\n')
        if hit['score'] > 0.0:
            answers.append(hit['input'][1])
    return answers
from sentence_transformers import CrossEncoder, util


def _get_relevant_comments_helper(comments, query, query_embedding,
                                  corpus_embeddings):
    # First-stage retrieval: bi-encoder semantic search over the comment corpus.
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=10)
    hits = hits[0]  # hits for the first (and only) query

    # Second stage: re-rank the candidates with a cross-encoder.
    cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')
    cross_inp = [[query, comments[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    # Return the top 10 hits, sorted by cross-encoder score.
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    return hits[:10]
import pickle

from tqdm.notebook import tqdm

# `ranker`, `topics`, `qrel`, `db` and `print_cum_stats` are assumed to be
# defined earlier in the notebook.

pickle.dump(
    ranker,
    open("ranker/constrastive_loss/ranker_contrastive_loss_20_epochs.pkl",
         "wb"))

run = {}
for topic in tqdm(topics):
    number = topic["number"]
    query = topic["title"]
    # Keep only the judged document ids that actually resolve in the database.
    extracted_ids = list(qrel[number].keys())
    doc_ids = []
    for doc_id in extracted_ids:
        try:
            db.lookup_docno(doc_id)
            doc_ids.append(doc_id)
        except Exception:  # skip ids missing from the database
            continue
    texts = db.batch_docno_lookup(doc_ids)

    # Score every (query, document) pair with the cross-encoder ranker.
    pairs = list(zip([query] * len(texts), texts))
    scores = ranker.predict(pairs).tolist()
    run[number] = dict(zip(doc_ids, scores))

print_cum_stats(run)
from typing import List, Tuple

import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig


def semantic_answer_similarity(
    predictions: List[List[str]],
    gold_labels: List[List[str]],
    sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
) -> Tuple[List[float], List[float]]:
    """
    Computes Transformer-based similarity of predicted answers to gold labels
    to derive a more meaningful metric than EM or F1.

    Returns per QA pair
    a) the similarity of the most likely prediction (top 1) to all available
       gold labels
    b) the highest similarity of all predictions to the gold labels

    :param predictions: Predicted answers as a list of multiple preds per question
    :param gold_labels: Labels as a list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual
        similarity model; should be a path or string pointing to a
        downloadable model.
    :return: top_1_sas, top_k_sas
    """
    assert len(predictions) == len(gold_labels)

    # Based on the model string we can load either a bi-encoder or a
    # cross-encoder; the similarity computation differs between the two.
    config = AutoConfig.from_pretrained(sas_model_name_or_path)
    cross_encoder_used = False
    if config.architectures is not None:
        cross_encoder_used = any(
            arch.endswith('ForSequenceClassification')
            for arch in config.architectures)

    # Compute similarities
    top_1_sas = []
    top_k_sas = []

    if cross_encoder_used:
        model = CrossEncoder(sas_model_name_or_path)
        for preds, labels in zip(predictions, gold_labels):
            # TODO add efficient batch mode: put all texts and labels into a
            # grid and extract the scores afterwards
            grid = [(p, l) for p in preds for l in labels]
            scores = model.predict(grid)
            # The first len(labels) scores belong to the top-1 prediction.
            top_1_sas.append(np.max(scores[:len(labels)]))
            top_k_sas.append(np.max(scores))
    else:
        # For bi-encoders we can flatten predictions and labels into one list.
        model = SentenceTransformer(sas_model_name_or_path)
        lengths: List[Tuple[int, int]] = []
        all_texts: List[str] = []
        for p, l in zip(predictions, gold_labels):  # type: ignore
            # TODO potentially exclude (near) exact matches from computations
            all_texts.extend(p)
            all_texts.extend(l)
            lengths.append((len(p), len(l)))

        # Compute all embeddings in one batch ...
        embeddings = model.encode(all_texts)

        # ... then slice out the embeddings belonging to each QA pair.
        current_position = 0
        for len_p, len_l in lengths:
            pred_embeddings = embeddings[current_position:current_position +
                                         len_p, :]
            current_position += len_p
            label_embeddings = embeddings[current_position:current_position +
                                          len_l, :]
            current_position += len_l
            sims = cosine_similarity(pred_embeddings, label_embeddings)
            top_1_sas.append(np.max(sims[0, :]))
            top_k_sas.append(np.max(sims))

    return top_1_sas, top_k_sas
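# A quick sanity check for semantic_answer_similarity, with hypothetical
# predictions and gold labels (two questions, multiple answers each).
predictions = [["Berlin", "Munich"], ["about 10 million"]]
gold_labels = [["Berlin"], ["roughly ten million people"]]
top_1, top_k = semantic_answer_similarity(predictions, gold_labels)
# top_1[i] scores the first prediction for question i against its labels;
# top_k[i] is the best score over all predictions for question i.
print(top_1, top_k)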
# `question_embedding`, `corpus_embeddings`, `corpus_sentences`, `inp_question`,
# `num_candidates` and `cross_encoder_model` are assumed to be set up earlier
# in the script.
hits = util.semantic_search(question_embedding,
                            corpus_embeddings,
                            top_k=num_candidates)
hits = hits[0]  # get the hits for the first query

print("Cosine-similarity search took {:.3f} seconds".format(time.time() -
                                                            start_time))
print("Top 5 hits with cosine similarity:")
for hit in hits[0:5]:
    print("\t{:.3f}\t{}".format(hit['score'],
                                corpus_sentences[hit['corpus_id']]))

# Now do the re-ranking with the cross-encoder.
start_time = time.time()
sentence_pairs = [[inp_question, corpus_sentences[hit['corpus_id']]]
                  for hit in hits]
ce_scores = cross_encoder_model.predict(sentence_pairs)

for idx in range(len(hits)):
    hits[idx]['cross-encoder_score'] = ce_scores[idx]

# Sort the list by the cross-encoder scores.
hits = sorted(hits, key=lambda x: x['cross-encoder_score'], reverse=True)
print("\nRe-ranking with the cross-encoder took {:.3f} seconds".format(
    time.time() - start_time))
print("Top 5 hits with the cross-encoder:")
for hit in hits[0:5]:
    print("\t{:.3f}\t{}".format(hit['cross-encoder_score'],
                                corpus_sentences[hit['corpus_id']]))

print("\n\n========\n")
## Some queries we want to search for in the document
queries = [
    "How large is Europe?",
    "Is Europe a continent?",
    "What is the currency in EU?",
    "Fall Roman Empire when",  # we can also search with keyword queries
    "Is Europa in the south part of the globe?"
]  # "Europa" is misspelled and the matching sentences do not mention any of the content words

# Search in a loop for the individual queries
for query in queries:
    start_time = time.time()

    # Concatenate the query and all passages and predict the scores for the
    # pairs [query, passage].
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(model_inputs)

    # Sort the scores in decreasing order.
    results = [{'input': inp, 'score': score}
               for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    print("Query:", query)
    print("Search took {:.2f} seconds".format(time.time() - start_time))
    for hit in results[0:5]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1])
    print("==========")
# Encode the corpus with the bi-encoder once, up front.
corpus_embeddings = bi_encoder.encode(passages,
                                      convert_to_tensor=True,
                                      show_progress_bar=True)

while True:
    query = input("Please enter a question: ")

    # Encode the query using the bi-encoder and find potentially relevant passages.
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding,
                                corpus_embeddings,
                                top_k=top_k)
    hits = hits[0]  # get the hits for the first query

    # Now score all retrieved passages with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    # Sort the results by the cross-encoder scores.
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    end_time = time.time()

    # Output the top-5 hits.
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['cross-score'],
                                    passages[hit['corpus_id']]))
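# The interactive loop above assumes roughly this setup; the model names are
# typical retrieve-and-re-rank choices, not taken from the snippet itself.
import time

from sentence_transformers import CrossEncoder, SentenceTransformer, util

bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
top_k = 32  # number of passages retrieved by the bi-encoder
passages = [...]  # placeholder: the corpus of text passages to search over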
from datasets import load_metric
from sentence_transformers import CrossEncoder
from tqdm import tqdm

# `get_answering_run`, `get_turn_id` and the POSSCORE `scorer` module are
# assumed to be provided by the surrounding project.


def evaluate_answering(
        ground_truth,
        run,
        eval_missing_truth,
        sas_model_name_or_path="cross-encoder/stsb-roberta-large"):
    print("Evaluate: Question Answering")
    answering_run = get_answering_run(run)
    metric = load_metric("squad_v2")
    metric2 = load_metric("rouge")
    s = scorer.POSSCORE()  # init POSSCORE
    sas_model = CrossEncoder(sas_model_name_or_path)

    result = {}
    answers = 0
    posscores, sasscores = [], []
    for turn in tqdm(ground_truth, desc=" "):
        turn_id = get_turn_id(turn)
        gt = turn["Truth_answer"]
        if eval_missing_truth or gt != "":
            reference = {
                "id": turn_id,
                "answers": {
                    'answer_start': [0],
                    'text': [gt]
                }
            }
            prediction_text = ""
            if turn_id in answering_run:
                prediction_text = answering_run[turn_id]
                answers += 1
            prediction = {
                "id": turn_id,
                "prediction_text": prediction_text,
                'no_answer_probability': 0.
            }
            metric.add(prediction=prediction, reference=reference)
            metric2.add(prediction=prediction_text, reference=gt)
            ps = s.get_posscore(gt, prediction_text)
            posscores.append(ps if ps else 0)
            # predict() returns an array with one score for the single pair.
            sas = sas_model.predict([(prediction_text, gt)])
            sasscores.append(float(sas[0]))

    if answers > 0:
        print(" used %d answers" % answers)
        score = metric.compute()
        score2 = metric2.compute()
        result["EM"] = score['exact'] / 100
        result["F1"] = score['f1'] / 100
        result["ROUGE1-R"] = score2['rouge1'].mid.recall
        result["POSSCORE"] = sum(posscores) / len(posscores)  # average POSSCORE
        result["SAS"] = sum(sasscores) / len(sasscores)  # average SAS
    else:
        print(" skipped for no answers")
    return result
import sys

import pytrec_eval
import tqdm
from sentence_transformers import CrossEncoder

# `relevant_qid`, `queries`, `passage_cand` and `relevant_docs` are assumed
# to be loaded earlier in the script.

queries_result_list = []
run = {}
model = CrossEncoder(sys.argv[1], max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]

    cross_inp = [[query, sent] for sent in corpus_sentences]

    if model.config.num_labels > 1:
        # Cross-encoders that predict more than one score: apply softmax and
        # use the last label as the relevance score.
        cross_scores = model.predict(cross_inp,
                                     apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    cross_scores_sparse = {}
    for idx, pid in enumerate(pids):
        cross_scores_sparse[pid] = cross_scores[idx]

    sparse_scores = cross_scores_sparse
    run[qid] = {}
    for pid in sparse_scores:
        run[qid][pid] = float(sparse_scores[pid])

evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10'})
scores = evaluator.evaluate(run)
import os
from itertools import chain

from nltk import sent_tokenize
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from transformers import pipeline


class MemNav:
    def __init__(self, root_dir='.'):
        """Load models, preprocess text, precompute embeddings."""
        self.root_dir = root_dir

        # Load language models
        self.qa = pipeline('question-answering')
        self.sum = pipeline('summarization')
        self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
        self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

        # Load the list of entries
        self.entries = [
            open(os.path.join(self.root_dir, file)).read()
            for file in sorted(os.listdir(root_dir))
        ]

        # Tokenize entries into sentences
        self.entries = [sent_tokenize(entry.strip()) for entry in self.entries]

        # Merge every 3 consecutive sentences into one passage
        self.entries = list(
            chain(*[[
                ' '.join(entry[start_idx:min(start_idx + 3, len(entry))])
                for start_idx in range(0, len(entry), 3)
            ] for entry in self.entries]))

        # Pre-compute passage embeddings
        self.passage_embeddings = self.text_encoder.encode(
            self.entries, show_progress_bar=True)

    def retrieval(self, query):
        """Utility for retrieving passages most relevant to a given query."""
        # First pass: find passages most similar to the query
        question_embedding = self.text_encoder.encode(query,
                                                      convert_to_tensor=True)
        hits = util.semantic_search(question_embedding,
                                    self.passage_embeddings,
                                    top_k=100)[0]

        # Second pass: re-rank passages more thoroughly with the cross-encoder
        cross_scores = self.pair_encoder.predict(
            [[query, self.entries[hit['corpus_id']]] for hit in hits])

        for idx in range(len(cross_scores)):
            hits[idx]['cross-score'] = cross_scores[idx]

        # Select the best few results
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
        return [
            self.entries[hit['corpus_id']] for hit in hits[:5]
            if hit['cross-score'] > 1e-3
        ]

    def search(self, query):
        """Search the knowledge base for passages most relevant to a given query."""
        print(*self.retrieval(query), sep='\n\n')

    def ask(self, question):
        """Obtain an answer to a question posed to the knowledge base.

        Provides retrieved passages as context for a question-answering pipeline."""
        return self.qa(question=question,
                       context=' '.join(self.retrieval(question)))['answer']

    def summarize(self, query):
        """Obtain a summary related to the query using the knowledge base.

        Provides retrieved passages as input for a summarization pipeline."""
        return self.sum(' '.join(self.retrieval(query)),
                        max_length=130,
                        min_length=30,
                        do_sample=False)[0]['summary_text']
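# Example session with MemNav; the directory of plain-text journal entries
# and the queries are hypothetical.
nav = MemNav(root_dir='journal_entries')
nav.search('trips to the mountains')
print(nav.ask('Who did I meet at the conference?'))
print(nav.summarize('machine learning projects'))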
from typing import List

import pandas as pd
from sentence_transformers import CrossEncoder, SentenceTransformer, util


class Recommender:
    def __init__(self):
        self.encoder = SentenceTransformer("paraphrase-distilroberta-base-v1")
        self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-electra-base")

    def fit(self, **corpus: List[str]):
        """Fit the corpora to be used for recommendations."""
        self.corpus_dict = corpus
        self.corpus_embeddings_dict = {
            key: self.encoder.encode(value,
                                     convert_to_tensor=True,
                                     show_progress_bar=True)
            for key, value in corpus.items()
        }

    def search(self, question: str, corpus: str, top_k: int) -> pd.DataFrame:
        """Semantic search with cross-encoder re-ranking."""
        assert (
            corpus in self.corpus_dict
        ), "Corpus not found, please fit the corpus first using the .fit() call"
        question_embedding = self.encoder.encode(question,
                                                 convert_to_tensor=True)
        # semantic_search returns one result list per query; take the only one.
        hits = util.semantic_search(question_embedding,
                                    self.corpus_embeddings_dict[corpus],
                                    top_k=top_k).pop()

        # Now score all retrieved passages with the cross-encoder.
        cross_inp = [[question, self.corpus_dict[corpus][hit["corpus_id"]]]
                     for hit in hits]
        cross_scores = self.cross_encoder.predict(cross_inp)

        # Sort the results by the cross-encoder scores.
        for idx in range(len(cross_scores)):
            hits[idx]["cross-score"] = cross_scores[idx]
            hits[idx]["snippet"] = self.corpus_dict[corpus][
                hits[idx]["corpus_id"]].replace("\n", " ")
        hits = sorted(hits, key=lambda x: x["cross-score"], reverse=True)
        return pd.DataFrame(hits)

    def explore(self, query: str, top_k: int) -> pd.DataFrame:
        raise NotImplementedError

    @staticmethod
    def format_for_frontend(df: pd.DataFrame,
                            hits: pd.DataFrame) -> pd.DataFrame:
        video_id_hits = df.iloc[hits.corpus_id].index.get_level_values(level=0)
        video_url_hits = [
            f"https://www.youtube.com/watch?v={video_id}"
            for video_id in video_id_hits
        ]
        start_time = df.start_time.iloc[hits.corpus_id]
        end_time = df.end_time.iloc[hits.corpus_id]
        recommendations = pd.DataFrame({
            "url": video_url_hits,
            "start": start_time,
            "end": end_time
        }).sort_values("start")
        # Collapse multiple hits from the same video into one row spanning the
        # earliest start and latest end time.
        recommendations = recommendations.groupby("url", as_index=False).agg({
            "start": "min",
            "end": "max"
        })
        return recommendations
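# Sketch of fitting and querying the Recommender above; the transcript
# snippets are hypothetical sample data.
recommender = Recommender()
recommender.fit(talks=[
    "Transformers use self-attention over token embeddings.",
    "Gradient descent iteratively minimizes a loss function.",
])
hits = recommender.search("How does attention work?", corpus="talks", top_k=2)
print(hits[["corpus_id", "cross-score", "snippet"]])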
from typing import List

import gensim
import numpy as np
import torch
from nltk import word_tokenize
from sentence_transformers import CrossEncoder, SentenceTransformer, util

# HParams and Dataset are assumed to be defined in the surrounding project.


class Model(torch.nn.Module):
    def __init__(self, hparams: HParams, dataset: Dataset):
        super().__init__()
        self.hparams = hparams
        self.dataset = dataset

        # Pre-process the data for tf-idf
        questions = [[w.lower() for w in word_tokenize(question)]
                     for question in self.dataset.questions]
        self.dictionary = gensim.corpora.Dictionary(questions)
        corpus = [self.dictionary.doc2bow(question) for question in questions]

        # tf-idf
        self.tf_idf = gensim.models.TfidfModel(corpus)
        self.sims = gensim.similarities.MatrixSimilarity(
            self.tf_idf[corpus], num_features=len(self.dictionary))

        # Load the models
        self.model_qq = SentenceTransformer(hparams.nearest_neighbor_model_qq)
        self.model_qa = SentenceTransformer(hparams.nearest_neighbor_model_qa)
        self.cross_encoder_qq = CrossEncoder(hparams.binary_classifier_model_qq)
        self.cross_encoder_qa = CrossEncoder(hparams.binary_classifier_model_qa)

        # Generate embeddings for the questions/answers
        self.embeddings_q = self.model_qq.encode(self.dataset.questions)
        self.embeddings_a = self.model_qa.encode(self.dataset.answers)

    def forward(self, query_batch: List[str]):
        # tf-idf scores
        query_docs = [[w.lower() for w in word_tokenize(query)]
                      for query in query_batch]
        query_bows = [
            self.dictionary.doc2bow(query_doc) for query_doc in query_docs
        ]
        scores_tf_idf = self.sims[self.tf_idf[query_bows]]

        # Question-question and question-answer bi-encoder scores, rescaled
        # from cosine similarity in [-1, 1] to [0, 1]
        embedding_qq = self.model_qq.encode(query_batch)
        embedding_qa = self.model_qa.encode(query_batch)
        cosine_scores_qq = util.pytorch_cos_sim(embedding_qq, self.embeddings_q)
        cosine_scores_qa = util.pytorch_cos_sim(embedding_qa, self.embeddings_a)
        scores_qq = (cosine_scores_qq + 1) / 2
        scores_qa = (cosine_scores_qa + 1) / 2

        # Blend bi-encoder and tf-idf scores, then take the top-k candidates
        scores = (1 - self.hparams.weight_knn) * scores_qq \
            + self.hparams.weight_knn * scores_qa
        scores = (1 - self.hparams.weight_tf_idf) * scores \
            + self.hparams.weight_tf_idf * scores_tf_idf
        topk_scores, topk_indices = torch.topk(scores, self.hparams.k)

        # Re-rank the top-k candidates with the two cross-encoders
        batch_qq = []
        batch_qa = []
        for i, query in enumerate(query_batch):
            for j in range(self.hparams.k):
                batch_qq.append(
                    (query, self.dataset.questions[topk_indices[i][j]]))
                batch_qa.append(
                    (query, self.dataset.answers[topk_indices[i][j]]))
        scores_qq = self.cross_encoder_qq.predict(batch_qq)
        scores_qa = self.cross_encoder_qa.predict(batch_qa)
        scores = (1 - self.hparams.weight_binary_classifier) * scores_qq \
            + self.hparams.weight_binary_classifier * scores_qa
        scores = np.reshape(scores, (len(query_batch), self.hparams.k))
        max_indices = np.argmax(scores, axis=1)

        # Callers can threshold scores[i][max_indices[i]] against
        # hparams.confidence_level to decide whether to return an answer.
        return topk_indices, scores, max_indices
from sentence_transformers import CrossEncoder


class CESemanticSimilarityMetric(MetricBase):
    """This metric computes the semantic similarity of two sentences using a
    Cross-Encoder model. By default we use the stsb-roberta-large model.

    See `https://github.com/UKPLab/sentence-transformers` for more information.
    """

    def __init__(self, ce_pretrained_model="stsb-roberta-large", ce_gpu_id=-1,
                 **kargs):
        """Initialize the CE model."""
        super(CESemanticSimilarityMetric, self).__init__()

        if ce_gpu_id == -1:
            logger.warning("CE metric is running on CPU.")
            device = "cpu"
        else:
            logger.info("CE metric is running on GPU %d.", ce_gpu_id)
            device = "cuda:%d" % ce_gpu_id

        logger.info("load ce model.")

        # TODO: use resources utils to manage model.
        self._model = CrossEncoder(
            resources.get_transformers(ce_pretrained_model), device=device)

    def _get_emb(self, sentences):
        """Compute the embedding of sentences.

        Note: cross-encoders score sentence pairs directly and do not expose
        standalone sentence embeddings, so this helper is not usable with the
        CrossEncoder loaded above.
        """
        return self._model.encode(sentences)

    def measure_batch(self, origin, paraphrase_list, data_record=None,
                      paraphrase_field="text0"):
        """Measure the metric on a batch of paraphrase_list.

        Args:
            origin (str): the original text.
            paraphrase_list (list): a set of paraphrases.
            data_record (dict): the corresponding data record of the original text.
            paraphrase_field (str): the field name to paraphrase.

        Returns:
            (list): a list containing the CE similarity for each paraphrase.
        """
        return [
            float(x)
            for x in self._model.predict([(origin, paraphrase)
                                          for paraphrase in paraphrase_list])
        ]

    def measure_example(self, origin, paraphrase, data_record=None,
                        paraphrase_field="text0"):
        """Compute the CE similarity of the original and paraphrased text.

        Args:
            origin (str): original text.
            paraphrase (str): paraphrased text.
            data_record: ignored.
            paraphrase_field: ignored.
        """
        return float(self._model.predict([(origin, paraphrase)])[0])
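# Hedged usage sketch for the metric above. It assumes the surrounding
# framework's MetricBase, logger and resources helpers are importable; the
# sentences are hypothetical.
metric = CESemanticSimilarityMetric(ce_gpu_id=-1)
print(metric.measure_example("The cat sat on the mat.",
                             "A cat was sitting on the mat."))
print(metric.measure_batch("The cat sat on the mat.",
                           ["A cat was sitting on the mat.",
                            "The weather is nice today."]))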