Example no. 1
1
from sentence_transformers import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import (
    CEBinaryAccuracyEvaluator, CEBinaryClassificationEvaluator)


def evaluate_ce(num_labels, dataset_name, evalset_name, evaluator, threshold):
    model = CrossEncoder('ssce_save/fsce/' + dataset_name,
                         num_labels=num_labels)

    with open(evalset_name, 'r') as r:
        OOD_data = r.readlines()

    OOD_sentence_pairs = []
    OOD_labels = []
    for line in OOD_data:
        pair = line.strip('\n').split('\t')
        # Skip malformed lines that lack two sentences or an integer label.
        try:
            sentences = [pair[0], pair[1]]
            label = int(pair[2])
        except (IndexError, ValueError):
            continue
        OOD_sentence_pairs.append(sentences)
        OOD_labels.append(label)

    if evaluator == 'accuracy':
        OOD_evaluator = CEBinaryAccuracyEvaluator(OOD_sentence_pairs,
                                                  OOD_labels,
                                                  threshold=threshold)
    elif evaluator == 'classification':
        OOD_evaluator = CEBinaryClassificationEvaluator(
            OOD_sentence_pairs, OOD_labels)
    else:
        raise ValueError(f"Unknown evaluator: {evaluator!r}")

    OOD_evaluator(model=model, output_path='ssce_save/fsce/' + dataset_name)
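
A hypothetical invocation of evaluate_ce (the dataset and file names below are placeholders, not from the original code):

evaluate_ce(num_labels=1,
            dataset_name='fewrel',
            evalset_name='fewrel_tag/pairwise_test.tsv',
            evaluator='accuracy',
            threshold=0.5)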
Example no. 2
0
    def __init__(self, root_dir='.'):
        """Load models, preprocess text, precompute embeddings."""
        self.root_dir = root_dir

        # Load language models
        self.qa = pipeline('question-answering')
        self.sum = pipeline('summarization')
        self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
        self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

        # Load list of entries
        self.entries = [
            open(self.root_dir + '/' + file).read()
            for file in sorted(os.listdir(root_dir))
        ]

        # Tokenize entries into sentences
        self.entries = [sent_tokenize(entry.strip()) for entry in self.entries]

        # Merge every 3 consecutive sentences into one passage
        self.entries = list(
            chain(*[[
                ' '.join(entry[start_idx:min(start_idx + 3, len(entry))])
                for start_idx in range(0, len(entry), 3)
            ] for entry in self.entries]))

        # Pre-compute passage embeddings
        self.passage_embeddings = self.text_encoder.encode(
            self.entries, show_progress_bar=True)
 def test_train_stsb(self):
     model = CrossEncoder('distilroberta-base', num_labels=1)
     train_dataloader = DataLoader(self.stsb_train_samples, shuffle=True, batch_size=16)
     model.fit(train_dataloader=train_dataloader,
               epochs=1,
               warmup_steps=int(len(train_dataloader)*0.1))
     self.evaluate_stsb_test(model, 75)
def stratifiedkfoldtest(data):
	data = data.sample(frac=1,random_state=1).reset_index(drop=True)
	skf = StratifiedKFold(n_splits=10)
	splits=[(x,y) for x,y in skf.split(data, data['label'])]
	f1list=[]
	acclist=[]
	import torch
	torch.cuda.empty_cache()
	t = torch.cuda.get_device_properties(0).total_memory
	r = torch.cuda.memory_reserved(0) 
	a = torch.cuda.memory_allocated(0)
	f = r - a  # free memory inside the reserved pool
	print(f"Total: {t/1e9:.2f} GB, Reserved: {r/1e9:.2f} GB, Allocated: {a/1e9:.2f} GB, Free: {f/1e9:.2f} GB")
	for b in [24]:
	  for l in [2e-5]:
	    for e in [4]:
	      for train_index, test_index in splits:
	        #resetting the model for every fold
	        model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
	        #train split
	        train=data.loc[train_index]
	        #test split
	        test=data.loc[test_index]
	        #data loaders
	        train_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in train.iterrows()],model)
	        test_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in test.iterrows()],model)
	        train_=DataLoader(train_,batch_size=b)
	        test_=DataLoader(test_)
	        #training
	        model.fit(train_,epochs=e,optimizer_params={'lr':l})
	        #predictions using encoder similarity
	        y=test['label']
	        dlist=list(test.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
	        yh=sts_sim(dlist,model)
	        #f1
	        f1scores,thresholds=f1_macro(y,yh)
	        print("any NaN in F1 scores:", np.isnan(np.asarray(f1scores)).any())
	        f1=max(f1scores)
	        f1list.append(f1)
	        print(f1)
	        #accuracy
	        mthres=thresholds[np.nanargmax(f1scores)]
	        yh1=np.zeros(len(yh))
	        yh1[yh>=mthres]=1
	        f12=metrics.f1_score(y,yh1,average='macro')
	        if f12!=f1:
	        	import pdb
	        	pdb.set_trace()
	        acc=metrics.accuracy_score(y, yh1)
	        print(acc)
	        acclist.append(acc)
	      print(b,l,e)
	      print("Average Macro F1 across folds:",np.mean(f1list))
	      print("Average Acc across folds:",np.mean(acclist))
def kfoldtest(data):
	data = data.sample(frac=1,random_state=1).reset_index(drop=True)
	kf = KFold(n_splits=100)
	splits=[(x,y) for x,y in kf.split(data)]
	f1list=[]
	acclist=[]
	import torch
	print(torch.cuda.is_available())
	for b in [20]:
		for l in [2e-5]:
			for e in [4]:
				yh=np.array([])
				y=np.array([])
				i=0
				for train_index, test_index in splits:
					i+=1
					print(f"Fold {i}")
					#resetting the model for every fold
					model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
					#train split
					train=data.loc[train_index]
					#test split
					test=data.loc[test_index]
					#data loaders
					train_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in train.iterrows()],model)
					test_=SentencesDataset([InputExample(texts=[d['query_p'],d['citation_p']],label=int(d['label'])) for i,d in test.iterrows()],model)
					train_=DataLoader(train_,batch_size=b)
					test_=DataLoader(test_)
					#training
					model.fit(train_,epochs=e,optimizer_params={'lr':l})
					#predictions using cos_similarity
					y=np.append(y,test['label'])
					dlist=list(test.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
					yh=np.append(yh,sts_sim(dlist,model))
				#f1
				f1scores,thresholds=f1_macro(y,yh)
				print("any NaN in F1 scores:", np.isnan(np.asarray(f1scores)).any())
				f1=max(f1scores)
				f1list.append(f1)
				print(f1)
				#accuracy
				mthres=thresholds[np.nanargmax(f1scores)]
				yh1=np.zeros(len(yh))
				yh1[yh>=mthres]=1
				f12=metrics.f1_score(y,yh1,average='macro')
				if f12!=f1:
					import pdb
					pdb.set_trace()
				acc=metrics.accuracy_score(y, yh1)
				print(acc)
				acclist.append(acc)
				print(b,l,e)
				print("BERT Fine-Tuned: Average F1 across folds:",np.mean(f1list))
				print("BERT Fine-Tuned: Average Acc across folds:",np.mean(acclist))
Example no. 6
0
 def __init__(
         self,
         pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-2-v2',
         max_length=512,
         device=None,
         use_amp=False):
     device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
     self.use_amp = use_amp
     self.model = CrossEncoder(pretrained_model_name_or_path,
                               max_length=max_length,
                               device=device)
Example no. 7
0
def get_sbert_mostsimilar_crossencoder(vecs, query, query_vec, num_responses):
    distances, most_similar = most_sim_cos(vecs, query_vec, 50)
    from sentence_transformers import CrossEncoder
    model = CrossEncoder('cross-encoder/stsb-roberta-base')
    cross_inp = [[query, sent] for sent in most_similar]
    cross_scores = model.predict(cross_inp)

    cross_scores, most_similar = zip(
        *sorted(zip(cross_scores, most_similar), reverse=True))

    return list(cross_scores[:num_responses]), list(
        most_similar[:num_responses])
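
most_sim_cos is referenced above but not defined in this snippet; a plausible sketch, assuming a module-level corpus list of candidate texts aligned row-by-row with vecs:

import numpy as np

def most_sim_cos(vecs, query_vec, k):
    # Cosine similarity between the query embedding and every corpus embedding.
    sims = vecs @ query_vec / (np.linalg.norm(vecs, axis=1) * np.linalg.norm(query_vec))
    top = np.argsort(-sims)[:k]
    return sims[top], [corpus[i] for i in top]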
Example no. 8
0
class SentenceTransformersReranker(Reranker):
    def __init__(
            self,
            pretrained_model_name_or_path='cross-encoder/ms-marco-MiniLM-L-2-v2',
            max_length=512,
            device=None,
            use_amp=False):
        device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.use_amp = use_amp
        self.model = CrossEncoder(pretrained_model_name_or_path,
                                  max_length=max_length,
                                  device=device)

    def rescore(self, query: Query, texts: List[Text]) -> List[Text]:
        texts = deepcopy(texts)
        with torch.cuda.amp.autocast(enabled=self.use_amp):
            scores = self.model.predict(
                [(query.text, text.text) for text in texts],
                show_progress_bar=False,
            )

        for (text, score) in zip(texts, scores):
            text.score = score.item()

        return texts
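
A hypothetical usage sketch of the reranker above (assuming pygaggle-style Query(text) / Text(text) constructors):

reranker = SentenceTransformersReranker()
reranked = reranker.rescore(Query('what is a cross-encoder?'),
                            [Text('A cross-encoder scores both texts of a pair jointly.'),
                             Text('An unrelated passage about cooking.')])
for text in sorted(reranked, key=lambda t: t.score, reverse=True):
    print(text.score, text.text)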
Example no. 9
0
def get_answers_from_query(request):
    """
    Uses infromational retrieval methods to get answers from user query about the inputed text. These queries are answered using the BERT NLP transformer.
    Input
    ----------
    request variable: Flask request variable containing the text for the article
    Returns
    ----------
    The answers to the query.
    """
    text = request.form['text']
    text = check_for_url(text)

    sentences = nltk.sent_tokenize(text)
    sentences = [s for s in sentences if s and not s.endswith("?")]
    query = request.form['query']
    '''
    This uses a Cross-Encoder variant of a transformer.
    It is designed to return the most likely response given an input,
    i.e., it is designed for question answering.
    '''
    model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')

    model_inputs = [[query, passage] for passage in sentences]
    print(model_inputs)
    scores = model.predict(model_inputs)

    #Sort the scores in decreasing order
    results = [{
        'input': inp,
        'score': score
    } for inp, score in zip(model_inputs, scores)]
    results = sorted(results, key=lambda x: x['score'], reverse=True)

    answers = []
    print("Query:", query)
    for hit in results[0:3]:
        print("Score: {:.2f}".format(hit['score']), "\t", hit['input'][1],
              '\n')
        if hit['score'] > 0.0:
            answers.append(hit['input'][1])
    return answers
Example no. 10
0
    def __init__(self,
                 ce_pretrained_model="stsb-roberta-large",
                 ce_gpu_id=-1,
                 **kargs):
        """Initialize ce model."""
        super(CESemanticSimilarityMetric, self).__init__()

        if ce_gpu_id == -1:
            logger.warning("CE metric is running on CPU.")
            device = "cpu"
        else:
            logger.info("CE metric is running on GPU %d.", ce_gpu_id)
            device = "cuda:%d" % ce_gpu_id

        logger.info("load ce model.")

        # TODO: use resources utils to manage model.

        self._model = CrossEncoder(
            resources.get_transformers(ce_pretrained_model), device=device)
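
    # A plausible scoring method for this metric (a sketch, not the original code):
    def measure_example(self, origin, paraphrase, **kwargs):
        # The cross-encoder returns one similarity score per (origin, paraphrase) pair.
        return float(self._model.predict([(origin, paraphrase)])[0])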
Example no. 11
0
def _get_relevant_comments_helper(comments, query, query_embedding,
                                  corpus_embeddings):
    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=10)
    hits = hits[0]

    cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

    cross_inp = [[query, comments[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)

    for idx in range(len(cross_scores)):
        hits[idx]['cross-score'] = cross_scores[idx]

    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

    #print top 10 hits
    # for hit in hits[:10]:
    #print(hit['score'], comments[hit['corpus_id']])

    return hits[:10]
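
The query_embedding and corpus_embeddings passed in above would typically be precomputed with a bi-encoder; a minimal sketch (the model name is an assumption):

from sentence_transformers import SentenceTransformer

bi_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
corpus_embeddings = bi_encoder.encode(comments, convert_to_tensor=True)
query_embedding = bi_encoder.encode(query, convert_to_tensor=True)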
Example no. 12
0
    def __init__(self, hparams: HParams, dataset: Dataset):
        super().__init__()
        self.hparams = hparams
        self.dataset = dataset

        # pre-process data for tf-idf
        questions = [[w.lower() for w in word_tokenize(question)]
                                    for question in self.dataset.questions]
        self.dictionary = gensim.corpora.Dictionary(questions)
        corpus = [self.dictionary.doc2bow(question) for question in questions]

        # tf-idf
        self.tf_idf = gensim.models.TfidfModel(corpus)
        self.sims = gensim.similarities.MatrixSimilarity(self.tf_idf[corpus], num_features=len(self.dictionary))

        # load model
        self.model_qq = SentenceTransformer(hparams.nearest_neighbor_model_qq)
        self.model_qa = SentenceTransformer(hparams.nearest_neighbor_model_qa)
        self.cross_encoder_qq = CrossEncoder(hparams.binary_classifier_model_qq)
        self.cross_encoder_qa = CrossEncoder(hparams.binary_classifier_model_qa)

        # generate embeddings for questions/answers
        self.embeddings_q = self.model_qq.encode(self.dataset.questions)
        self.embeddings_a = self.model_qa.encode(self.dataset.answers)
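
    # A hypothetical lookup built on the tf-idf index above (a sketch, not part of the original class):
    def most_similar_tfidf(self, query, top_k=5):
        # Convert the query to bag-of-words, score it against the corpus, return the top indices.
        bow = self.dictionary.doc2bow([w.lower() for w in word_tokenize(query)])
        sims = self.sims[self.tf_idf[bow]]
        return sims.argsort()[::-1][:top_k]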
def bert(data):
	model=CrossEncoder('cross-encoder/stsb-roberta-base',num_labels=1)
	dlist=list(data.apply(lambda d:(d['query_p'],d['citation_p']), axis=1))
	y=data['label']
	yh=sts_sim(dlist,model)
	#f1
	f1scores,thresholds=f1_macro(y,yh)
	print("any NaN in F1 scores:", np.isnan(np.asarray(f1scores)).any())
	f1=max(f1scores)
	#accuracy
	mthres=thresholds[np.nanargmax(f1scores)]
	yh1=np.zeros(len(yh))
	yh1[yh>=mthres]=1
	f12=metrics.f1_score(y,yh1,average='macro')
	if f12!=f1:
		import pdb
		pdb.set_trace()
	acc=metrics.accuracy_score(y, yh1)
	print("BERT: Macro F1:",f1)
	print("BERT: Accuracy:",acc)
def print_cum_stats(run):
    run_results = evaluator.evaluate(run)

    map_scores = [v["map"] for k, v in run_results.items()]
    p_scores = [v["P_5"] for k, v in run_results.items()]
    ndcg_scores = [v['ndcg'] for k, v in run_results.items()]

    print("Aggregate results")
    print("Average MAP: ", np.mean(map_scores))
    print("Average P_5: ", np.mean(p_scores))
    print("Average NDCG: ", np.mean(ndcg_scores))


from sentence_transformers import CrossEncoder
ranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

# Base loss
from sentence_transformers import SentencesDataset, losses
from sentence_transformers.readers import InputExample

examples = []

for topic in topics:
    gold = qrel[topic["number"]].items()
    query = topic["title"].strip()

    for item in gold:
        try:
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except KeyError:
            # Assumed failure mode: skip qrel entries whose document is missing.
            continue
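
A typical continuation (a sketch; batch size and warmup are illustrative) wraps the collected examples in a DataLoader and fine-tunes the ranker:

from torch.utils.data import DataLoader

train_dataloader = DataLoader(examples, shuffle=True, batch_size=16)
ranker.fit(train_dataloader=train_dataloader,
           epochs=1,
           warmup_steps=int(len(train_dataloader) * 0.1))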
 def test_pretrained_stsb(self):
     model = CrossEncoder("cross-encoder/stsb-distilroberta-base")
     self.evaluate_stsb_test(model, 87.92)
def topic_modelling(df, model):
    for aspect in aspects.keys():
        df[aspect] = df.full_message.apply(
            lambda x: score_topic_sentence(x, aspect=aspect))

    df["best_aspect"] = "None"
    df["max_score"] = df[aspects.keys()].max(axis=1)
    for aspect in aspects.keys():
        df.loc[(df[aspect] == df["max_score"]) & (df["max_score"] > 0.1),
               "best_aspect"] = aspect
    df = df.drop(columns="max_score")
    return df


def score_topic_sentence(sentence, aspect="food"):
    aspect_description = aspects[aspect]
    score = model.predict((sentence, aspect_description))
    return score


if __name__ == "__main__":
    path = "data/evaluation/"
    file_name = "text_data.txt"
    df = read_dirty_test_file(path + file_name)

    # Scoring Reviews against each topic
    model = CrossEncoder('cross-encoder/stsb-roberta-base')
    df = topic_modelling(df, model)

    df.to_csv(path + "TEST_data_with_Topics.csv")
Example no. 17
0
from typing import List, Tuple

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import AutoConfig


def semantic_answer_similarity(
    predictions: List[List[str]],
    gold_labels: List[List[str]],
    sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
) -> Tuple[List[float], List[float]]:
    """
    Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1.
    Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels
                        b) the highest similarity of all predictions to gold labels

    :param predictions: Predicted answers as list of multiple preds per question
    :param gold_labels: Labels as list of multiple possible answers per question
    :param sas_model_name_or_path: SentenceTransformers semantic textual similarity model, should be a path or string
                                   pointing to a downloadable model.
    :return: top_1_sas, top_k_sas
    """
    assert len(predictions) == len(gold_labels)

    config = AutoConfig.from_pretrained(sas_model_name_or_path)
    cross_encoder_used = False
    if config.architectures is not None:
        cross_encoder_used = any([
            arch.endswith('ForSequenceClassification')
            for arch in config.architectures
        ])

    # Compute similarities
    top_1_sas = []
    top_k_sas = []

    # Based on the model string we load either a Bi-Encoder or a Cross-Encoder;
    # the similarity computation differs between the two approaches.
    if cross_encoder_used:
        model = CrossEncoder(sas_model_name_or_path)
        for preds, labels in zip(predictions, gold_labels):
            # TODO add efficient batch mode: put all texts and labels into grid and extract scores afterwards
            grid = []
            for p in preds:
                for l in labels:
                    grid.append((p, l))
            scores = model.predict(grid)
            top_1_sas.append(np.max(scores[:len(labels)]))
            top_k_sas.append(np.max(scores))
    else:
        # For Bi-encoders we can flatten predictions and labels into one list
        model = SentenceTransformer(sas_model_name_or_path)
        lengths: List[Tuple[int, int]] = []
        all_texts: List[str] = []
        for p, l in zip(predictions, gold_labels):  # type: ignore
            # TODO potentially exclude (near) exact matches from computations
            all_texts.extend(p)
            all_texts.extend(l)
            lengths.append((len(p), len(l)))
        # then compute embeddings
        embeddings = model.encode(all_texts)

        # then select which embeddings will be used for similarity computations
        current_position = 0
        for i, (len_p, len_l) in enumerate(lengths):
            pred_embeddings = embeddings[current_position:current_position +
                                         len_p, :]
            current_position += len_p
            label_embeddings = embeddings[current_position:current_position +
                                          len_l, :]
            current_position += len_l
            sims = cosine_similarity(pred_embeddings, label_embeddings)
            top_1_sas.append(np.max(sims[0, :]))
            top_k_sas.append(np.max(sims))

    return top_1_sas, top_k_sas
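
A toy invocation of the function above (illustrative strings; the default model is downloaded on first use):

preds = [["Berlin", "the capital of Germany"]]
golds = [["Berlin is the capital of Germany"]]
top_1, top_k = semantic_answer_similarity(preds, golds)
print(top_1, top_k)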
Example no. 18
0
#Larger values: More context from the paragraph remains, but results are longer
window_size = 3
passages = []
for paragraph in paragraphs:
    for start_idx in range(0, len(paragraph), window_size):
        end_idx = min(start_idx+window_size, len(paragraph))
        passages.append(" ".join(paragraph[start_idx:end_idx]))


print("Paragraphs: ", len(paragraphs))
print("Sentences: ", sum([len(p) for p in paragraphs]))
print("Passages: ", len(passages))


## Load our cross-encoder. Use fast tokenizer to speed up the tokenization
model = CrossEncoder('sentence-transformers/ce-ms-marco-TinyBERT-L-2')

## Some queries we want to search for in the document
queries = ["How large is Europe?",
           "Is Europe a continent?",
           "What is the currency in EU?",
           "Fall Roman Empire when",                    #We can also search for key word queries
           "Is Europa in the south part of the globe?"]   #Europe is miss-spelled & the matching sentences does not mention any of the content words

#Search in a loop for the individual queries
for query in queries:
    start_time = time.time()

    #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
    model_inputs = [[query, passage] for passage in passages]
    scores = model.predict(model_inputs)
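    # The original snippet is truncated here; a typical continuation (sketch)
    # ranks the passages by score and prints the best hits:
    results = sorted(zip(scores, passages), key=lambda x: x[0], reverse=True)
    print("Query:", query, "({:.2f}s)".format(time.time() - start_time))
    for score, passage in results[:3]:
        print("{:.2f}\t{}".format(score, passage))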
Example no. 19
0
from sentence_transformers import CrossEncoder, SentenceTransformer
import os
import csv
import pickle
import time
import sys

# We use a BiEncoder (SentenceTransformer) that produces embeddings for questions.
# We then search for similar questions using cosine similarity and identify the most similar questions
model_name = 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer(model_name)
num_candidates = 500

# To refine the results, we use a CrossEncoder. A CrossEncoder gets both inputs (input_question, retrieved_question)
# and outputs a score 0...1 indicating the similarity.
cross_encoder_model = CrossEncoder('cross-encoder/stsb-roberta-base')

# Dataset we want to use
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 20000

# Some local file to cache computed embeddings
embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(
    model_name.replace('/', '_'), max_corpus_size)

#Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download and extract
    # Download dataset if needed
    if not os.path.exists(dataset_path):
from sentence_transformers.readers import InputExample
from sentence_transformers.cross_encoder.evaluation import CEBinaryClassificationEvaluator
from torch.utils.data import DataLoader
from torch import nn

import tensorflow as tf  # the original fragment used tf.* without importing it

NEAR_0 = 1e-10  # assumed small constant to avoid log(0)

def sigmoid_cond_loss(pred):
    # Entropy-style loss on sigmoid outputs; restructured from the original
    # fragment, which was not syntactically valid (tf.log is tf.math.log in TF2).
    return -tf.reduce_mean(pred * tf.math.log(pred + NEAR_0)
                           + (1 - pred) * tf.math.log(1 - pred + NEAR_0))

def virtual_adversarial_loss(pred):
    # Truncated in the original fragment.
    raise NotImplementedError


roberta = CrossEncoder('cross-encoder/stsb-roberta-base', num_labels = 1)
tokens = ["<e1>", "<e2>"]
roberta.tokenizer.add_tokens(tokens, special_tokens=True)
roberta.model.resize_token_embeddings(len(roberta.tokenizer))

with open('fewrel_tag/pairwise_labeled_train.tsv','r') as r:
    labeled_data = r.readlines()

with open('fewrel_tag/pairwise_test.tsv', 'r') as r:
	test_data = r.readlines()

train_examples = [] 

for line in labeled_data:
	pair = line.strip('\n').split('\t')
	try:
"""
Google Colab Example: https://colab.research.google.com/drive/1l6stpYdRMmeDBK_vw0L5NitdiAuhdsAr?usp=sharing
"""
import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import time
import gzip
import os
import torch

#We use the Bi-Encoder to encode all passages, so that we can run semantic search over them
model_name = 'msmarco-distilbert-base-v2'
bi_encoder = SentenceTransformer(model_name)
top_k = 100  #Number of passages we want to retrieve with the bi-encoder

#The bi-encoder will retrieve 100 documents. We use a cross-encoder to re-rank this result list and improve the quality
cross_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

# As dataset, we use Simple English Wikipedia. Compared to the full English wikipedia, it has only
# about 170k articles. We split these articles into paragraphs and encode them with the bi-encoder

wikipedia_filepath = 'data/simplewiki-2020-11-01.jsonl.gz'

if not os.path.exists(wikipedia_filepath):
    util.http_get('http://sbert.net/datasets/simplewiki-2020-11-01.jsonl.gz',
                  wikipedia_filepath)

passages = []
with gzip.open(wikipedia_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        data = json.loads(line.strip())
        passages.extend(data['paragraphs'])
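
The snippet is truncated here; the standard continuation encodes the corpus with the bi-encoder (a sketch):

corpus_embeddings = bi_encoder.encode(passages, convert_to_tensor=True,
                                      show_progress_bar=True)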
Example no. 22
0
formatter = logging.Formatter(config.get_string('logging.pattern', default='%(asctime)s [%(levelname)s] %(message)s'))
if config.get_bool('logging.appenders.console.enabled', True):
    ch = logging.StreamHandler()
    ch.setFormatter(formatter)
    rootLogger.addHandler(ch)
if config.get_bool('logging.appenders.file.enabled', True):
    fh = logging.FileHandler(config.get_string("logging.appenders.file.file-name"))
    fh.setFormatter(formatter)
    rootLogger.addHandler(fh)

app = Quart(__name__)
app = cors(app, allow_origin="*")

t0 = perf_counter()
model = SentenceTransformer(config.get_string('ss_search.bi-encoder-model'))
cross_encoder = CrossEncoder(config.get_string('ss_search.cross-encoder-model'))


def suggest_question():
    return random.choice(all_questions)


@app.route('/api/suggest')
async def suggest():
    return {'question': suggest_question()}


@app.route('/api/compare')
async def compare():
    if 'q1' not in request.args or 'q2' not in request.args:
        return abort(400, description='Missing required parameters')
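    # Plausible completion (a sketch): score the question pair with the cross-encoder.
    q1, q2 = request.args['q1'], request.args['q2']
    score = float(cross_encoder.predict([(q1, q2)])[0])
    return {'score': score}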
Example no. 23
0
 def __init__(self, model=None):
     self.model = model
     if (self.model is None):
         self.model = CrossEncoder(
             'sentence-transformers/ce-ms-marco-electra-base',
             max_length=512)
 def test_pretrained_stsb(self):
     model = CrossEncoder(
         "sentence-transformers/ce-distilroberta-base-stsb")
     self.evaluate_stsb_test(model, 87.92)
Example no. 25
0
#We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
#Smaller value: Context from other sentences might get lost
#Larger values: More context from the paragraph remains, but results are longer
window_size = 3
passages = []
for paragraph in paragraphs:
    for start_idx in range(0, len(paragraph), window_size):
        end_idx = min(start_idx + window_size, len(paragraph))
        passages.append(" ".join(paragraph[start_idx:end_idx]))

print("Paragraphs: ", len(paragraphs))
print("Sentences: ", sum([len(p) for p in paragraphs]))
print("Passages: ", len(passages))

## Load our cross-encoder. Use fast tokenizer to speed up the tokenization
model = CrossEncoder('sentence-transformers/ce-ms-marco-TinyBERT-L-2',
                     use_fast_tokenizer=True)

## Some queries we want to search for in the document
queries = [
    "How large is Europe?",
    "Is Europe a continent?",
    "What is the currency in EU?",
    "Fall Roman Empire when",  #We can also search for key word queries
    "Is Europa in the south part of the globe?"
]  #Europe is miss-spelled & the matching sentences does not mention any of the content words

#Search in a loop for the individual queries
for query in queries:
    start_time = time.time()

    #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
Example no. 26
0
        passage_filepath)

passage_cand = {}
with gzip.open(passage_filepath, 'rt', encoding='utf8') as fIn:
    for line in fIn:
        qid, pid, query, passage = line.strip().split("\t")
        if qid not in passage_cand:
            passage_cand[qid] = []

        passage_cand[qid].append([pid, passage])

logging.info("Queries: {}".format(len(queries)))

queries_result_list = []
run = {}
model = CrossEncoder(sys.argv[1], max_length=512)

for qid in tqdm.tqdm(relevant_qid):
    query = queries[qid]

    cand = passage_cand[qid]
    pids = [c[0] for c in cand]
    corpus_sentences = [c[1] for c in cand]

    cross_inp = [[query, sent] for sent in corpus_sentences]

    if model.config.num_labels > 1:  #Cross-Encoders that predict more than one label: apply softmax and take the positive-class score
        cross_scores = model.predict(cross_inp,
                                     apply_softmax=True)[:, 1].tolist()
    else:
        cross_scores = model.predict(cross_inp).tolist()
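    # Typical continuation (a sketch): keep per-query scores for trec-style evaluation.
    run[qid] = {pid: float(score) for pid, score in zip(pids, cross_scores)}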
Example no. 27
0
class MemNav:
    def __init__(self, root_dir='.'):
        """Load models, preprocess text, precompute embeddings."""
        self.root_dir = root_dir

        # Load language models
        self.qa = pipeline('question-answering')
        self.sum = pipeline('summarization')
        self.text_encoder = SentenceTransformer('msmarco-distilbert-base-v2')
        self.pair_encoder = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-6')

        # Load list of entries
        self.entries = [
            open(self.root_dir + '/' + file).read()
            for file in sorted(os.listdir(root_dir))
        ]

        # Tokenize entries into sentences
        self.entries = [sent_tokenize(entry.strip()) for entry in self.entries]

        # Merge every 3 consecutive sentences into one passage
        self.entries = list(
            chain(*[[
                ' '.join(entry[start_idx:min(start_idx + 3, len(entry))])
                for start_idx in range(0, len(entry), 3)
            ] for entry in self.entries]))

        # Pre-compute passage embeddings
        self.passage_embeddings = self.text_encoder.encode(
            self.entries, show_progress_bar=True)

    def retrieval(self, query):
        """Utility for retrieving passages most relevant to a given query."""
        # First pass, find passages most similar to query
        question_embedding = self.text_encoder.encode(query,
                                                      convert_to_tensor=True)
        hits = util.semantic_search(question_embedding,
                                    self.passage_embeddings,
                                    top_k=100)[0]

        # Second pass, re-rank passages more thoroughly
        cross_scores = self.pair_encoder.predict(
            [[query, self.entries[hit['corpus_id']]] for hit in hits])

        for idx in range(len(cross_scores)):
            hits[idx]['cross-score'] = cross_scores[idx]

        # Select best few results
        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)

        results = []
        for hit in hits[:5]:
            if hit['cross-score'] > 1e-3:
                results += [self.entries[hit['corpus_id']]]

        return results

    def search(self, query):
        """Search knowledge base for passages most relevant to a given query."""
        print(*self.retrieval(query), sep='\n\n')

    def ask(self, question):
        """Obtain an answer to a question posed to the knowledge base. Provides retrieved passages as context for a question-answering pipeline."""
        return self.qa(question, ' '.join(self.retrieval(question)))['answer']

    def summarize(self, query):
        """Obtain a summary related to the query using the knowledge base. Provides retrieved passages as input for a summarization pipeline."""
        return self.sum(' '.join(self.retrieval(query)), max_length=130,
                        min_length=30, do_sample=False)[0]['summary_text']
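
A hypothetical usage of the class above, assuming root_dir holds plain-text notes:

nav = MemNav('journal/')
nav.search('trip to the mountains')
print(nav.ask('Where did I go hiking last summer?'))
print(nav.summarize('hiking'))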
#We combine up to 3 sentences into a passage. You can choose smaller or larger values for window_size
#Smaller value: Context from other sentences might get lost
#Larger values: More context from the paragraph remains, but results are longer
window_size = 3
passages = []
for paragraph in paragraphs:
    for start_idx in range(0, len(paragraph), window_size):
        end_idx = min(start_idx + window_size, len(paragraph))
        passages.append(" ".join(paragraph[start_idx:end_idx]))

print("Paragraphs: ", len(paragraphs))
print("Sentences: ", sum([len(p) for p in paragraphs]))
print("Passages: ", len(passages))

## Load our cross-encoder. Use fast tokenizer to speed up the tokenization
model = CrossEncoder('cross-encoder/ms-marco-TinyBERT-L-2')

## Some queries we want to search for in the document
queries = [
    "How large is Europe?",
    "Is Europe a continent?",
    "What is the currency in EU?",
    "Fall Roman Empire when",  #We can also search for key word queries
    "Is Europa in the south part of the globe?"
]  #Europe is miss-spelled & the matching sentences does not mention any of the content words

#Search in a loop for the individual queries
for query in queries:
    start_time = time.time()

    #Concatenate the query and all passages and predict the scores for the pairs [query, passage]
def evaluate_answering(
        ground_truth,
        run,
        eval_missing_truth,
        sas_model_name_or_path="cross-encoder/stsb-roberta-large"):
    print("Evaluate: Question Answering")

    answering_run = get_answering_run(run)
    metric = load_metric("squad_v2")
    metric2 = load_metric("rouge")
    s = scorer.POSSCORE()  # init POSSCORE
    sas_model = CrossEncoder(sas_model_name_or_path)

    result = {}
    answers = 0
    posscores, sasscores = [], []

    for turn in tqdm(ground_truth, desc="  "):
        turn_id = get_turn_id(turn)
        gt = turn["Truth_answer"]
        if eval_missing_truth or gt != "":
            reference = {
                "id": turn_id,
                "answers": {
                    'answer_start': [0],
                    'text': [gt]
                }
            }
            prediction_text = ""
            if turn_id in answering_run:
                prediction_text = answering_run[turn_id]
                answers = answers + 1
            prediction = {
                "id": turn_id,
                "prediction_text": prediction_text,
                'no_answer_probability': 0.
            }
            metric.add(prediction=prediction, reference=reference)

            metric2.add(prediction=prediction_text, reference=gt)

            ps = s.get_posscore(gt, prediction_text)
            if ps:
                posscores.append(ps)
            else:
                posscores.append(0)

            sas = float(sas_model.predict([(prediction_text, gt)])[0])
            sasscores.append(sas)

    if answers > 0:
        print("    used %d answers" % answers)
        score = metric.compute()
        score2 = metric2.compute()

        result["EM"] = score['exact'] / 100
        result["F1"] = score['f1'] / 100
        result["ROUGE1-R"] = score2['rouge1'].mid.recall
        result["POSSCORE"] = sum(posscores) / len(
            posscores)  # average POSSCORE
        result["SAS"] = sum(sasscores) / len(sasscores)  # average POSSCORE
    else:
        print("    skipped for no answers")
    return result
from sentence_transformers import CrossEncoder, SentenceTransformer
import os
import csv
import pickle
import time
import sys

# We use a BiEncoder (SentenceTransformer) that produces embeddings for questions.
# We then search for similar questions using cosine similarity and identify the most similar questions
model_name = 'distilbert-multilingual-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)
num_candidates = 500

# To refine the results, we use a CrossEncoder. A CrossEncoder gets both inputs (input_question, retrieved_question)
# and outputs a score 0...1 indicating the similarity.
cross_encoder_model = CrossEncoder('sentence-transformers/ce-roberta-base-stsb')

# Dataset we want to use
url = "http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv"
dataset_path = "quora_duplicate_questions.tsv"
max_corpus_size = 20000

# Some local file to cache computed embeddings
embedding_cache_path = 'quora-embeddings-{}-size-{}.pkl'.format(model_name.replace('/', '_'), max_corpus_size)

#Check if embedding cache path exists
if not os.path.exists(embedding_cache_path):
    # Check if the dataset exists. If not, download and extract
    # Download dataset if needed
    if not os.path.exists(dataset_path):
        print("Download dataset")