Code Example #1
# Assumed imports; Reranker, Query and Text are the pygaggle-style
# interfaces this snippet builds on.
from copy import deepcopy
from typing import List

import torch
from sentence_transformers import CrossEncoder


class SentenceTransformersReranker(Reranker):
    def __init__(
            self,
            pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-2-v2',
            max_length=512,
            device=None,
            use_amp=False):
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.use_amp = use_amp
        self.model = CrossEncoder(pretrained_model_name_or_path,
                                  max_length=max_length,
                                  device=device)

    def rescore(self, query: Query, texts: List[Text]) -> List[Text]:
        texts = deepcopy(texts)
        with torch.cuda.amp.autocast(enabled=self.use_amp):
            scores = self.model.predict(
                [(query.text, text.text) for text in texts],
                show_progress_bar=False,
            )

        for (text, score) in zip(texts, scores):
            text.score = score.item()

        return texts
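A minimal usage sketch for the class above, assuming the pygaggle-style Reranker, Query and Text classes it references are importable (hypothetical import path shown in the comment):

# Hypothetical usage; assumes e.g.:
#   from pygaggle.rerank.base import Reranker, Query, Text
reranker = SentenceTransformersReranker()
query = Query('what is a cross-encoder?')
texts = [Text('A cross-encoder scores both sentences of a pair jointly.'),
         Text('Bi-encoders embed each sentence independently.')]
for text in reranker.rescore(query, texts):
    print('{:.4f}\t{}'.format(text.score, text.text))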
Code Example #2
File: utils.py  Project: Youmna-H/chat-app
def get_sbert_mostsimilar_crossencoder(vecs, query, query_vec, num_responses):
    # First stage: retrieve the 50 nearest candidates by cosine similarity;
    # second stage: re-rank them with an STS-B cross-encoder.
    # (A sketch of most_sim_cos follows this example.)
    distances, most_similar = most_sim_cos(vecs, query_vec, 50)
    from sentence_transformers import CrossEncoder
    model = CrossEncoder('cross-encoder/stsb-roberta-base')
    cross_inp = [[query, i] for i in most_similar]
    cross_scores = model.predict(cross_inp)

    cross_scores, most_similar = zip(
        *sorted(zip(cross_scores, most_similar), reverse=True))

    return list(cross_scores[:num_responses]), list(
        most_similar[:num_responses])
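most_sim_cos is not defined in the snippet; a minimal sketch of such a helper, under the assumption that vecs maps candidate texts to their embedding vectors (the real project may differ):

import numpy as np


def most_sim_cos(vecs, query_vec, k):
    # Hypothetical helper: `vecs` is assumed to be a dict of
    # {candidate_text: embedding}. Returns the cosine similarities and
    # the k most similar candidate texts.
    texts = list(vecs.keys())
    mat = np.stack([vecs[t] for t in texts])
    sims = mat @ query_vec / (
        np.linalg.norm(mat, axis=1) * np.linalg.norm(query_vec) + 1e-9)
    top = np.argsort(-sims)[:k]
    return sims[top].tolist(), [texts[i] for i in top]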
Code Example #3
# Assumed imports; `check_for_url` is a project-internal helper.
import nltk
from sentence_transformers import CrossEncoder


def get_answers_from_query(request):
    """
    Uses infromational retrieval methods to get answers from user query about the inputed text. These queries are answered using the BERT NLP transformer.
    Input
    ----------
    request variable: Flask request variable containing the text for the article
    Returns
    ----------
    The answers to the query.
    """
    text = request.form['text']
    text = check_for_url(text)

    sentences = nltk.sent_tokenize(text)
    sentences = [s for s in sentences if s and s[-1] != "?"]
    query = request.form['query']
    '''
    This uses a Cross Encoder variant of a transformer.
    It is designed to return the most likely response given an input,
    i.e. it is designed for question answering.
    '''
    model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')

    model_inputs = [[query, passage] for passage in sentences]
    print(model_inputs)
    scores = model.predict(model_inputs)

    #Sort the scores in decreasing order
    results = [{
        'input': inp,
        'score': score
    } for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    answers = []
    print("Query:", query)
    for hit in results[0:3]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1],
              '\n')
        if hit['score'] > 0.0:
            answers.append(hit['input'][1])
    return answers
Code Example #4
# Assumed import: `util` from sentence_transformers provides semantic_search.
from sentence_transformers import CrossEncoder, util


def _get_relevant_comments_helper(comments, query, query_embedding,
                                  corpus_embeddings):
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=10)
    hits = hits[0]

    cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

    cross_inp = [[query, comments[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    # print top 10 hits
    # for hit in hits[:10]:
    #     print(hit['score'], comments[hit['corpus_id']])

    return hits[:10]
Code Example #5
# Assumed: `ranker` is a trained CrossEncoder; db, qrel, topics and
# print_cum_stats are project-internal.
import pickle

pickle.dump(
    ranker,
    open("ranker/constrastive_loss/ranker_contrastive_loss_20_epochs.pkl",
         "wb"))

from tqdm.notebook import tqdm

run = {}
for topic in tqdm(topics):
    number = topic["number"]
    query = topic["title"]

    extracted_ids = [k for k in qrel[number].keys()]

    doc_ids = []
    for doc_id in extracted_ids:
        try:
            db.lookup_docno(doc_id)
            doc_ids.append(doc_id)
        except Exception:  # skip ids that are missing from the document store
            continue

    texts = db.batch_docno_lookup(doc_ids)
    pairs = list(zip([query] * len(texts), texts))

    scores = ranker.predict(pairs)
    scores = scores.tolist()

    doc_scores = dict(zip(doc_ids, scores))
    run[number] = doc_scores
print_cum_stats(run)
Code Example #6
File: eval.py  Project: stmnk/haystack
# Assumed imports for this snippet:
from typing import List, Tuple

import numpy as np
from sentence_transformers import CrossEncoder, SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoConfig


def semantic_answer_similarity(
    predictions: List[List[str]],
    gold_labels: List[List[str]],
    sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
) -> Tuple[List[float], List[float]]:
    """
    Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
    Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels
                        b) the highest similarity of all predictions to gold labels

    :param predictions: Predicted answers as list of multiple preds per question
    :param gold_labels: Labels as list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual similarity model, should be a path or string
                                   pointing to a downloadable model.
    :return: top_1_sas, top_k_sas
    """
    assert len(predictions) == len(gold_labels)

    config = AutoConfig.from_pretrained(sas_model_name_or_path)
    cross_encoder_used = False
    if config.architectures is not None:
        cross_encoder_used = any([
            arch.endswith('ForSequenceClassification')
            for arch in config.architectures
        ])

    # Compute similarities
    top_1_sas = []
    top_k_sas = []

    # Based on the model string we load either a bi-encoder or a cross-encoder.
    # The similarity computation differs between the two approaches.
    if cross_encoder_used:
        model = CrossEncoder(sas_model_name_or_path)
        for preds, labels in zip(predictions, gold_labels):
            # TODO add efficient batch mode: put all texts and labels into grid and extract scores afterwards
            grid = []
            for p in preds:
                for l in labels:
                    grid.append((p, l))
            scores = model.predict(grid)
            top_1_sas.append(np.max(scores[:len(labels)]))
            top_k_sas.append(np.max(scores))
    else:
        # For Bi-encoders we can flatten predictions and labels into one list
        model = SentenceTransformer(sas_model_name_or_path)
        lengths: List[Tuple[int, int]] = []
        all_texts: List[str] = []
        for p, l in zip(predictions, gold_labels):  # type: ignore
            # TODO potentially exclude (near) exact matches from computations
            all_texts.extend(p)
            all_texts.extend(l)
            lengths.append((len(p), len(l)))
        # then compute embeddings
        embeddings = model.encode(all_texts)

        # then select which embeddings will be used for similarity computations
        current_position = 0
        for i, (len_p, len_l) in enumerate(lengths):
            pred_embeddings = embeddings[current_position:current_position +
                                         len_p, :]
            current_position += len_p
            label_embeddings = embeddings[current_position:current_position +
                                          len_l, :]
            current_position += len_l
            sims = cosine_similarity(pred_embeddings, label_embeddings)
            top_1_sas.append(np.max(sims[0, :]))
            top_k_sas.append(np.max(sims))

    return top_1_sas, top_k_sas
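A toy invocation of the function above (a sketch; the default SAS model is downloaded on first use):

predictions = [["Berlin is the capital of Germany."],
               ["The tower opened in 1889."]]
gold_labels = [["Berlin", "The capital of Germany is Berlin."],
               ["1889"]]
top_1, top_k = semantic_answer_similarity(predictions, gold_labels)
print(top_1)  # similarity of the top-1 prediction to its best gold label
print(top_k)  # best similarity over all predictions, per question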
Code Example #7
    # Snippet begins mid-call; the opening of this semantic_search call is
    # reconstructed from context:
    hits = util.semantic_search(question_embedding,
                                corpus_embeddings,
                                top_k=num_candidates)
    hits = hits[0]  #Get the hits for the first query

    print("Cosine-Similarity search took {:.3f} seconds".format(time.time() -
                                                                start_time))
    print("Top 5 hits with cosine-similarity:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['score'],
                                    corpus_sentences[hit['corpus_id']]))

    #Now, do the re-ranking with the cross-encoder
    start_time = time.time()
    sentence_pairs = [[inp_question, corpus_sentences[hit['corpus_id']]]
                      for hit in hits]
    ce_scores = cross_encoder_model.predict(sentence_pairs)

    for idx in range(len(hits)):
        hits[idx]['cross-encoder_score'] = ce_scores[idx]

    #Sort list by CrossEncoder scores
    hits = sorted(hits, key=lambda x: x['cross-encoder_score'], reverse=True)
    print("\nRe-ranking with Cross-Encoder took {:.3f} seconds".format(
        time.time() - start_time))
    print("Top 5 hits with CrossEncoder:")
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['cross-encoder_score'],
                                    corpus_sentences[hit['corpus_id']]))

    print("\n\n========\n")
Code Example #8
## Some queries we want to search for in the document
queries = [
    "How large is Europe?",
    "Is Europe a continent?",
    "What is the currency in EU?",
    "Fall Roman Empire when",  #We can also search for key word queries
    "Is Europa in the south part of the globe?"
]  #Europe is miss-spelled & the matching sentences does not mention any of the content words

#Search in a loop for the individual queries
for query in queries:
    start_time = time.time()

    #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(model_inputs)

    #Sort the scores in decreasing order
    results = [{
        'input': inp,
        'score': score
    } for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    print("Query:", query)
    print("Search took {:.2f} seconds".format(time.time() - start_time))
    for hit in results[0:5]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1])

    print("==========")
Code Example #9
# Snippet begins mid-statement; assumed import and a plausible opening,
# reconstructed from the standard retrieve & re-rank example:
import time

corpus_embeddings = bi_encoder.encode(passages,
                                      convert_to_tensor=True,
                                      show_progress_bar=True)

while True:
    query = input("Please enter a question: ")

    #Encode the query using the bi-encoder and find potentially relevant passages
    start_time = time.time()
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding,
                                corpus_embeddings,
                                top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    #Now, score all retrieved passages with the cross_encoder
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    #Sort results by the cross-encoder scores
    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    end_time = time.time()

    #Output of top-5 hits
    print("Input question:", query)
    print("Results (after {:.3f} seconds):".format(end_time - start_time))
    for hit in hits[0:5]:
        print("\t{:.3f}\t{}".format(hit['cross-score'],
                                    passages[hit['corpus_id']]))
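A minimal setup sketch for the names this loop assumes (bi_encoder, cross_encoder, passages and top_k are not defined in the snippet; the model names are typical choices, not necessarily the original ones):

from sentence_transformers import CrossEncoder, SentenceTransformer, util

bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
top_k = 32
passages = ['London is the capital of the United Kingdom.',
            'Paris is the capital of France.']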
Code Example #10
# Assumed imports; get_answering_run, get_turn_id and scorer are project-internal helpers.
from datasets import load_metric
from sentence_transformers import CrossEncoder
from tqdm import tqdm


def evaluate_answering(
        ground_truth,
        run,
        eval_missing_truth,
        sas_model_name_or_path="cross-encoder/stsb-roberta-large"):
    print("Evaluate: Question Answering")

    answering_run = get_answering_run(run)
    metric = load_metric("squad_v2")
    metric2 = load_metric("rouge")
    s = scorer.POSSCORE()  # init POSSCORE
    sas_model = CrossEncoder(sas_model_name_or_path)

    result = {}
    answers = 0
    posscores, sasscores = [], []

    for turn in tqdm(ground_truth, desc="  "):
        turn_id = get_turn_id(turn)
        gt = turn["Truth_answer"]
        if eval_missing_truth or gt != "":
            reference = {
                "id": turn_id,
                "answers": {
                    'answer_start': [0],
                    'text': [gt]
                }
            }
            prediction_text = ""
            if turn_id in answering_run:
                prediction_text = answering_run[turn_id]
                answers = answers + 1
            prediction = {
                "id": turn_id,
                "prediction_text": prediction_text,
                'no_answer_probability': 0.
            }
            metric.add(prediction=prediction, reference=reference)

            metric2.add(prediction=prediction_text, reference=gt)

            ps = s.get_posscore(gt, prediction_text)
            if ps:
                posscores.append(ps)
            else:
                posscores.append(0)

            sas = sas_model.predict([(prediction_text, gt)])
            sasscores.append(float(sas[0]))

    if answers > 0:
        print("    used %d answers" % answers)
        score = metric.compute()
        score2 = metric2.compute()

        result["EM"] = score['exact'] / 100
        result["F1"] = score['f1'] / 100
        result["ROUGE1-R"] = score2['rouge1'].mid.recall
        result["POSSCORE"] = sum(posscores) / len(
            posscores)  # average POSSCORE
        result["SAS"] = sum(sasscores) / len(sasscores)  # average POSSCORE
    else:
        print("    skipped for no answers")
    return result
Code Example #11
# Assumed imports for this snippet:
import sys

import pytrec_eval
import tqdm
from sentence_transformers import CrossEncoder

queries_result_list = []
run = {}
model = CrossEncoder(sys.argv[1], max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]

    cross_inp = [[query, sent] for sent in corpus_sentences]

    # Cross-encoders that predict more than one score: apply softmax and
    # take the positive-class column.
    if model.config.num_labels > 1:
        cross_scores = model.predict(cross_inp,
                                     apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()

    run[qid] = {
        pid: float(cross_scores[idx])
        for idx, pid in enumerate(pids)
    }

evaluator = pytrec_eval.RelevanceEvaluator(relevant_docs, {'ndcg_cut.10'})
scores = evaluator.evaluate(run)
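The num_labels branch above matters because some cross-encoders are fine-tuned with a two-class softmax head and emit two logits per pair. A sketch of what apply_softmax=True followed by [:, 1] computes, on hypothetical logits:

import numpy as np

logits = np.array([[1.2, 3.4],   # one row of logits per (query, passage) pair
                   [2.0, 0.1]])
probs = np.exp(logits) / np.exp(logits).sum(axis=1, keepdims=True)
relevance = probs[:, 1]  # probability of the "relevant" class
print(relevance)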
Code Example #12
# Assumed imports for this snippet:
import os
from itertools import chain

from nltk.tokenize import sent_tokenize
from sentence_transformers import CrossEncoder, SentenceTransformer, util
from transformers import pipeline


class MemNav:
    def __init__(self, root_dir='.'):
        """Load models, preprocess text, precompute embeddings."""
        self.root_dir = root_dir

        # Load language models
        self.qa = pipeline('question-answering')
        self.sum = pipeline('summarization')
        self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
        self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

        # Load list of entries
        self.entries = [
            open(self.root_dir + '/' + file).read()
            for file in sorted(os.listdir(root_dir))
        ]

        # Tokenize entries into sentences
        self.entries = [sent_tokenize(entry.strip()) for entry in self.entries]

        # Merge each 3 consecutive sentences into one passage
        self.entries = list(
            chain(*[[
                ' '.join(entry[start_idx:min(start_idx + 3, len(entry))])
                for start_idx in range(0, len(entry), 3)
            ] for entry in self.entries]))

        # Pre-compute passage embeddings
        self.passage_embeddings = self.text_encoder.encode(
            self.entries, show_progress_bar=True)

    def retrieval(self, query):
        """Utility for retrieving passages most relevant to a given query."""
        # First pass, find passages most similar to query
        question_embedding = self.text_encoder.encode(query,
                                                      convert_to_tensor=True)
        hits = util.semantic_search(question_embedding,
                                    self.passage_embeddings,
                                    top_k=100)[0]

        # Second pass, re-rank passages more thoroughly
        cross_scores = self.pair_encoder.predict(
            [[query, self.entries[hit['corpus_id']]] for hit in hits])

        for idx in range(len(cross_scores)):
            hits[idx]['cross-score'] = cross_scores[idx]

        # Select best few results
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

        results = []
        for hit in hits[:5]:
            if hit['cross-score'] > 1e-3:
                results += [self.entries[hit['corpus_id']]]

        return results

    def search(self, query):
        """Search knowledge base for passages most relevant to a given query."""
        print(*self.retrieval(query), sep='\n\n')

    def ask(self, question):
        """Obtain an answer to a question posed to the knowledge base. Provides retrieved passages as context for a question-answering pipeline."""
        return self.qa(question=question,
                       context=' '.join(self.retrieval(question)))['answer']

    def summarize(self, query):
        """Obtain a summary related to the query using the knowledge base. Provides retrieved passages as input for a summarization pipeline."""
        return self.sum(' '.join(self.retrieval(query)),
                        max_length=130,
                        min_length=30,
                        do_sample=False)[0]['summary_text']
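A hypothetical usage sketch, assuming a directory of plain-text journal entries:

nav = MemNav('./journal')  # hypothetical directory of .txt entries
nav.search('trips to the mountains')
print(nav.ask('When did I last go hiking?'))
print(nav.summarize('work projects'))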
Code Example #13
# Assumed imports for this snippet:
from typing import List

import pandas as pd
from sentence_transformers import CrossEncoder, SentenceTransformer, util


class Recommender:
    def __init__(self):
        self.encoder = SentenceTransformer("paraphrase-distilroberta-base-v1")
        self.cross_encoder = CrossEncoder(
            "cross-encoder/ms-marco-electra-base")

    def fit(self, **corpus: List[str]):
        """
        fit the corpora to be used for recommendations
        """
        self.corpus_dict = corpus
        self.corpus_embeddings_dict = {
            key: self.encoder.encode(value,
                                     convert_to_tensor=True,
                                     show_progress_bar=True)
            for key, value in corpus.items()
        }

    def search(self, question: str, corpus: str, top_k: int) -> pd.DataFrame:
        """
        semantic search
        """
        assert (
            corpus in self.corpus_dict
        ), "Corpus not found, please fit the corpus first using the .fit() call"
        question_embedding = self.encoder.encode(question,
                                                 convert_to_tensor=True)
        hits = util.semantic_search(question_embedding,
                                    self.corpus_embeddings_dict[corpus],
                                    top_k=top_k).pop()

        # now, score all retrieved passages with the cross_encoder
        cross_inp = [[question, self.corpus_dict[corpus][hit["corpus_id"]]]
                     for hit in hits]
        cross_scores = self.cross_encoder.predict(cross_inp)

        # sort results by the cross-encoder scores
        for idx in range(len(cross_scores)):
            hits[idx]["cross-score"] = cross_scores[idx]
            hits[idx]["snippet"] = self.corpus_dict[corpus][
                hits[idx]["corpus_id"]].replace("\n", " ")
        hits = sorted(hits, key=lambda x: x["cross-score"], reverse=True)
        return pd.DataFrame(hits)

    def explore(self, query: str, top_k: int) -> pd.DataFrame:
        raise NotImplementedError

    @staticmethod
    def format_for_frontend(df: pd.DataFrame,
                            hits: pd.DataFrame) -> pd.DataFrame:
        video_id_hits = df.iloc[hits.corpus_id].index.get_level_values(level=0)
        video_url_hits = [
            f"https://www.youtube.com/watch?v={video_id}"
            for video_id in video_id_hits
        ]
        start_time = df.start_time.iloc[hits.corpus_id]
        end_time = df.end_time.iloc[hits.corpus_id]
        recommendations = pd.DataFrame({
            "url": video_url_hits,
            "start": start_time,
            "end": end_time
        }).sort_values("start")
        recommendations = recommendations.groupby("url", as_index=False).agg(
            {"start": "min", "end": "max"})
        return recommendations
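A hypothetical usage sketch with a toy corpus:

rec = Recommender()
rec.fit(faq=['How do I reset my password?',
             'Where can I download the app?'])
hits = rec.search('password reset', corpus='faq', top_k=2)
print(hits[['corpus_id', 'cross-score', 'snippet']])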
Code Example #14
# Assumed imports; HParams and Dataset are project-internal types.
from typing import List

import gensim
import numpy as np
import torch
from nltk import word_tokenize
from sentence_transformers import CrossEncoder, SentenceTransformer, util


class Model(torch.nn.Module):
    def __init__(self, hparams: HParams, dataset: Dataset):
        super().__init__()
        self.hparams = hparams
        self.dataset = dataset

        # pre-process data for tf-idf
        questions = [[w.lower() for w in word_tokenize(question)]
                     for question in self.dataset.questions]
        self.dictionary = gensim.corpora.Dictionary(questions)
        corpus = [self.dictionary.doc2bow(question) for question in questions]

        # tf-idf
        self.tf_idf = gensim.models.TfidfModel(corpus)
        self.sims = gensim.similarities.MatrixSimilarity(self.tf_idf[corpus], num_features=len(self.dictionary))

        # load model
        self.model_qq = SentenceTransformer(hparams.nearest_neighbor_model_qq)
        self.model_qa = SentenceTransformer(hparams.nearest_neighbor_model_qa)
        self.cross_encoder_qq = CrossEncoder(hparams.binary_classifier_model_qq)
        self.cross_encoder_qa = CrossEncoder(hparams.binary_classifier_model_qa)

        # generate embeddings for questions/answers
        self.embeddings_q = self.model_qq.encode(self.dataset.questions)
        self.embeddings_a = self.model_qa.encode(self.dataset.answers)

    def forward(self, query_batch: List[str]):  # -> List[Answer]
        # tf-idf
        query_docs = [[w.lower() for w in word_tokenize(query)] for query in query_batch]
        query_bows = [self.dictionary.doc2bow(query_doc) for query_doc in query_docs]
        scores_tf_idf = self.sims[self.tf_idf[query_bows]]

        # q-q, q-a models
        embedding_qq = self.model_qq.encode(query_batch)
        embedding_qa = self.model_qa.encode(query_batch)

        cosine_scores_qq = util.pytorch_cos_sim(embedding_qq, self.embeddings_q)
        cosine_scores_qa = util.pytorch_cos_sim(embedding_qa, self.embeddings_a)
        scores_qq = (cosine_scores_qq + 1) / 2
        scores_qa = (cosine_scores_qa + 1) / 2
        #cosine_scores = (1-self.hparams.weight_knn) * cosine_scores_qq + self.hparams.weight_knn * cosine_scores_qa
        scores = (1-self.hparams.weight_knn) * scores_qq + self.hparams.weight_knn * scores_qa
        scores = (1-self.hparams.weight_tf_idf) * scores + self.hparams.weight_tf_idf * scores_tf_idf

        #topk_scores, topk_indices = torch.topk(cosine_scores, self.hparams.k)
        topk_scores, topk_indices = torch.topk(scores, self.hparams.k)

        batch_qq = []
        batch_qa = []
        for i, query in enumerate(query_batch):
            for j in range(self.hparams.k):
                batch_qq.append((query, self.dataset.questions[topk_indices[i][j]]))
                batch_qa.append((query, self.dataset.answers[topk_indices[i][j]]))
        scores_qq = self.cross_encoder_qq.predict(batch_qq)
        scores_qa = self.cross_encoder_qa.predict(batch_qa)
        scores = (1-self.hparams.weight_binary_classifier) * scores_qq + self.hparams.weight_binary_classifier * scores_qa
        scores = np.reshape(scores, (len(query_batch), self.hparams.k))

        #max_scores = np.max(scores, axis=1)
        max_indices = np.argmax(scores, axis=1)
        #answers = []
        #for i in range(len(query_batch)):
        #    max_score = max_scores[i]
        #    max_index = max_indices[i]
        #    if max_score > self.hparams.confidence_level:
        #        ans = self.dataset.answers[topk_indices[i][max_index]]
        #    else:
        #        ans = None
        #    answers.append(Answer(answer=ans, score=float(max_score)))

        return topk_indices, scores, max_indices
Code Example #15
class CESemanticSimilarityMetric(MetricBase):
    """This metric computes the semantic similarity of two sentences using Cross Encoder model.

    By default the we use stsb-roberta-large model.

    see `https://github.com/UKPLab/sentence-transformers` for more information.
    """
    def __init__(self,
                 ce_pretrained_model="stsb-roberta-large",
                 ce_gpu_id=-1,
                 **kargs):
        """Initialize ce model."""
        super(CESemanticSimilarityMetric, self).__init__()

        if ce_gpu_id == -1:
            logger.warning("CE metric is running on CPU.")
            device = "cpu"
        else:
            logger.info("CE metric is running on GPU %d.", ce_gpu_id)
            device = "cuda:%d" % ce_gpu_id

        logger.info("load ce model.")

        # TODO: use resources utils to manage model.

        self._model = CrossEncoder(
            resources.get_transformers(ce_pretrained_model), device=device)

    def _get_emb(self, sentences):
        """Compute the embedding of sentences.

        Note: ``CrossEncoder`` does not expose an ``encode`` method, so this
        helper would raise an AttributeError if called.
        """
        return self._model.encode(sentences)

    def measure_batch(self,
                      origin,
                      paraphrase_list,
                      data_record=None,
                      paraphrase_field="text0"):
        """Measure the metric on a batch of paraphrase_list.

        Args:
            origin (str): the original text.
            paraphrase_list (list): a set of paraphrase_list.
            data_record (dict): the corresponding data record of original text.
            paraphrase_field (str): the field name to paraphrase.

        Returns:
            (list): a list containing the CE similarity score for each paraphrase.
        """
        return [
            float(x)
            for x in self._model.predict([(origin, paraphrase)
                                          for paraphrase in paraphrase_list])
        ]

    def measure_example(self,
                        origin,
                        paraphrase,
                        data_record=None,
                        paraphrase_field="text0"):
        """Compute the perplexity ratio.

        Args:
            origin (str): original text.
            paraphrase (str): paraphrased text.
            data_record: ignored.
            paraphrase_field: ignored.
        """
        return float(self._model.predict([(origin, paraphrase)])[0])
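For reference, the scoring this class wraps can be sketched directly with sentence-transformers, assuming the cross-encoder/stsb-roberta-large checkpoint on the Hugging Face Hub (the snippet above resolves the model path through its own resources utility instead):

from sentence_transformers import CrossEncoder

model = CrossEncoder('cross-encoder/stsb-roberta-large', device='cpu')
score = float(model.predict([('A man is eating food.',
                              'A man is having a meal.')])[0])
print(score)  # STS-B-style similarity, roughly in [0, 1]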