def construct_model(base_model, encoder_style):
    # word_embedding_model = models.Transformer(base_model, max_seq_length=256)
    # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    if encoder_style == BIENCODER:
        model = SentenceTransformer(base_model)
        train_loss = losses.CosineSimilarityLoss(model)
    elif encoder_style == CROSSENCODER:
        model = CrossEncoder(base_model, num_labels=1, max_length=512)
        train_loss = None
    else:
        # Guard against unknown encoder styles instead of failing with UnboundLocalError
        raise ValueError("Unknown encoder_style: {}".format(encoder_style))
    return model, train_loss
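# A minimal usage sketch, assuming BIENCODER and CROSSENCODER are constants defined
# elsewhere in this module and that 'distilroberta-base' is an acceptable base model.
bi_model, bi_loss = construct_model('distilroberta-base', BIENCODER)
ce_model, _ = construct_model('distilroberta-base', CROSSENCODER)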
        dev_samples.append(
            InputExample(texts=[row['sentence_1'], row['sentence_2']],
                         label=clazz))
    elif row['split'] == 'test':
        test_samples.append(
            InputExample(texts=[row['sentence_1'], row['sentence_2']],
                         label=clazz))

train_batch_size = 16
num_epochs = 100
model_name = 'distilroberta-base'  # distilbert-base-uncased
model_save_path = 'output/training_unfolding_structure-' + datetime.now(
).strftime("%Y-%m-%d_%H-%M-%S") + '_' + model_name

# Define our CrossEncoder model. We use distilroberta-base as base model and set
# num_labels=1, so it predicts a single score per sentence pair
model = CrossEncoder(model_name, num_labels=1)

# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=False,
                              batch_size=train_batch_size)
evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    dev_samples, name='UnfoldingStructure-dev')
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
config = yaml.safe_load(open('config.yml', 'r'))
os.environ["TORCH_HOME"] = config['base_model_dir']

num_labels = 2
logging.info("Processing Data ...")
if config['use_hypernym']:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_hyp_file']))
    num_labels = 3
else:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_flat_file']))
logging.info("Done Processing Data ...")

model = CrossEncoder(config['crossencoder_base_model'], num_labels=num_labels)

batch_size = config['batch_size']
num_epochs = config['num_epochs']
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = get_loss(config['loss_type'], model)
evaluator = CEBinaryAccuracyEvaluator.from_input_examples(dev_samples)
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))
model_dir = os.path.join(config['saved_model_dir'], config['checkpoint_path'])
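# A hedged continuation sketch (not in the original snippet): the usual CrossEncoder
# fine-tuning call given the objects built above; saving to model_dir is an assumption.
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          loss_fct=train_loss,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_dir)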
# Check if dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

# Define our Cross-Encoder
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# We use distilroberta-base as base model and set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder('distilroberta-base', num_labels=1)

# Read STSb dataset
logger.info("Read STSbenchmark train dataset")
train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        if row['split'] == 'dev':
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
# First, we define the transformer model we want to fine-tune
model_name = 'google/electra-small-discriminator'
train_batch_size = 32
num_epochs = 1
model_save_path = 'output/training_ms-marco_cross-encoder-' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# We train the network as a binary label task:
# given [query, passage], is the label 0 = irrelevant or 1 = relevant?
# We use a positive-to-negative ratio: for 1 positive sample (label 1) we include 4 negative samples (label 0)
# in our training setup. For the negative samples, we use the triplets provided by MS MARCO that
# specify (query, positive sample, negative sample).
pos_neg_ration = 4

# We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)

### Now we read the MS MARCO dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus files that contain all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get(
            'https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz',
            tar_filepath)
""" This example computes the score between a query and all possible sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS). It output then the most similar sentences for the given query. """ from sentence_transformers.cross_encoder import CrossEncoder import numpy as np # Pre-trained cross encoder model = CrossEncoder('sentence-transformers/ce-distilroberta-base-stsb') # We want to compute the similarity between the query sentence query = 'A man is eating pasta.' # With all sentences in the corpus corpus = [ 'A man is eating food.', 'A man is eating a piece of bread.', 'The girl is carrying a baby.', 'A man is riding a horse.', 'A woman is playing violin.', 'Two men pushed carts through the woods.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'A cheetah is running behind its prey.' ] # So we create the respective sentence combinations sentence_combinations = [[query, corpus_sentence] for corpus_sentence in corpus] # Compute the similarity scores for these combinations similarity_scores = model.predict(sentence_combinations) # Sort the scores in decreasing order
                    handlers=[LoggingHandler()])

config = yaml.safe_load(open('config.yml', 'r'))

num_labels = 2
if config['use_hypernym']:
    num_labels = 3
test_sets = config['test_files']
batch_size = config['batch_size']
results_dir = config['results_dir']
eval_dir = config['eval_dir']

logging.info("Loading Model ...")
model = CrossEncoder(os.path.join(config['saved_model_dir'],
                                  config['eval_base']),
                     num_labels=num_labels)
logging.info("Done Loading Model ...")

for test_set in test_sets:
    test_name = test_set.split('.')[0]
    logging.info("Reading " + test_name + " Data")
    test_data, all_sentences, all_definitions = get_test_data(
        os.path.join(eval_dir, test_set), True)
    logging.info("Computing and Writing " + test_name + " Scores")
    scores = get_crossencoder_scores(all_sentences, all_definitions,
                                     batch_size, model)
    populate_scores(test_data, scores)
    scores_dict = compute_test_metrics(test_data, False)
    out_dir = os.path.join(results_dir, config['eval_base'])
    if not os.path.exists(out_dir):
        depth=depth,
        batch_size=batch_size)
    return all_scores, psg_indices


if __name__ == "__main__":
    args = parser.parse_args()
    all_scores, psg_indices = do_first_stage_retrieval(
        args.biencoder_query_reps,
        args.biencoder_passage_reps,
        depth=args.first_stage_depth,
        batch_size=args.first_stage_batch_size)
    model = CrossEncoder(args.crossencoder_model_directory,
                         num_labels=1,
                         max_length=512)
    query_texts = []
    for row in jsonlines.open(args.test_queries):
        query_texts.append(row["text"])
    dataset_texts, dataset_ids = read_dataset_collection(
        args.search_collection)

    all_query_dataset_pairs = []
    for i, query in enumerate(query_texts):
        first_stage_doc_idxs = [int(docidx) for docidx in psg_indices[i]]
        first_stage_dataset_texts = [
            dataset_texts[docidx] for docidx in first_stage_doc_idxs
        ]
class PathwayGenerator():
    def __init__(self,
                 file_path,
                 pilot,
                 service,
                 use_cuda=False,
                 cuda_device=-1,
                 annotation_model=None,
                 section_split_model=None):
        '''PathwayGenerator object constructor.

        Args:
            file_path (str): path of the file from which the pathway is generated.
            pilot (str): name of the pilot.
            service (str): name of the service considered.
            use_cuda (bool): flag to use gpu model.
            cuda_device (int, optional): id of the gpu device to use. Defaults to -1.
            annotation_model (str, optional): language code or path of the NER model.
            section_split_model (str, optional): path of the section-split CrossEncoder.
        '''
        assert file_path is not None, "A file path is required"

        languages = {
            'Larissa': 'el',
            'Birmingham': 'en',
            'Malaga': 'es',
            'Palermo': 'it'
        }

        self.path = file_path
        if os.path.splitext(self.path)[-1] == '.txt':
            self.converted_file = doc2txt.purge_urls(
                open(self.path, 'r').read(),
                os.path.splitext(self.path)[0])
        self.use_cuda = use_cuda
        self.cuda_device = cuda_device
        self.language = languages[pilot]  # TODO: language detection param?

        # A two-character value is treated as a language code; anything else is a model path
        if len(annotation_model) != 2:
            self.annotation_model = Transner(
                pretrained_model=annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})
        else:
            self.annotation_model = Transner(
                pretrained_model='bert_uncased_' + annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})

        self.section_split_model = CrossEncoder(section_split_model,
                                                num_labels=1)
        self.annotation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path)
        #self.generation_metadata = {
        #    'where': pilot + ' - ' + service + ' - ' + 'Where - ' + os.path.basename(self.path) + ' - ',
        #    'when': pilot + ' - ' + service + ' - ' + 'When - ' + os.path.basename(self.path) + ' - ',
        #    'how': pilot + ' - ' + service + ' - ' + 'How - ' + os.path.basename(self.path) + ' - '
        #}
        self.generation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path) + ' - '

    def to_list(self):
        element_list = []  # Make an empty list
        for element in re.split('\n', self.converted_file):
            stripped_element = element.strip()
            if stripped_element != '':
                element_list.append(stripped_element)  # Append the stripped element to the list
        return element_list

    def do_convert(self):
        self.converted_file = doc2txt.convert_to_txt(self.path)
        return self.converted_file

    def do_split(self, threshold=0.5):
        sentence_list = self.to_list()
        scores = []
        for i in range(0, len(sentence_list) - 1):
            current_sentence = sentence_list[i]
            next_sentence = sentence_list[i + 1]
            score = self.section_split_model.predict(
                [current_sentence, next_sentence])
            scores.append(score)

        sections = []  # sections = [['section1'], ['section2'], ..., ['sectionN']]
        section_text = []
        section_text.append(sentence_list[0])
        for i in range(0, len(scores)):
            if scores[i] >= threshold:
                section_text.append(sentence_list[i + 1])
            else:
                sections.append(section_text)
                section_text = []
                section_text.append(sentence_list[i + 1])
        sections.append(section_text)
        return sections

    def do_annotate(self, sentence_list):
        self.ner_dict = self.annotation_model.ner(sentence_list,
                                                  apply_regex=True)
        if self.language in ['es', 'en']:
            self.ner_dict = self.annotate_sutime(self.ner_dict)
        else:
            self.ner_dict = self.annotation_model.find_dates(self.ner_dict)
        self.ner_dict = annotator.aggregate_dict(self.ner_dict)
        self.ner_dict['entities'] = sorted(self.ner_dict['entities'],
                                           key=lambda ner: ner['start_offset'])
        self.ner_dict = annotator.resolve_uri_entities(self.ner_dict,
                                                       self.path)
        return self.ner_dict

    def do_generate(self):
        if os.path.splitext(self.path)[-1] == '.json':
            self.ner_dict = json.load(open(self.path, 'r'))
        aggregated_ner_dict = aggregator.aggregate_entities(self.ner_dict)
        print(aggregated_ner_dict)
        #aggregated_ner_dict = self.ner_dict = {'text': 'test 1 of the section 1.\ntest 2 of the section 1.\ntest 3 of the section 1.\n', 'entities': {'LOCATION': [{'value': 'test', 'confidence': 0.9737, 'start_offset': 0, 'end_offset': 4}], 'ORGANIZATION': [{'value': 'test', 'confidence': 0.9676, 'start_offset': 25, 'end_offset': 29}], 'TIME': [{'value': 'test', 'confidence': 0.9573, 'start_offset': 50, 'end_offset': 54}]}}
        json_pathway = generator.generate(aggregated_ner_dict)
        mapped_entities = json.loads(json_pathway)
        dict_pathway = json.load(open("tools/dict_pathway.json", 'r'))

        self.pathway = {}  # {'physical_office': [{'start', 'end'}...]}
        for key, sub_types in dict_pathway.items():
            self.pathway[key] = {}
            for sub_type in sub_types:
                self.pathway[key][sub_type] = []

        for entity in mapped_entities:
            self.pathway[self.keys_of_value(
                dict_pathway, entity['step'])][entity['step']].append(entity)
        # {'dove': [], 'come': [], 'quando': []}
        # TODO: remove return because we can read the value in the pgr object
        return self.pathway

    def export_annotation_to_doccano(self, add_confidence=False):
        filename = os.path.splitext(self.path)[0]
        doccano_dict = {}
        doccano_dict['text'] = self.ner_dict['text']
        doccano_dict['labels'] = []
        doccano_dict['meta'] = self.annotation_metadata
        for item in self.ner_dict['entities']:
            if add_confidence:
                doccano_dict['labels'].append([
                    item['start_offset'], item['end_offset'], item['type'],
                    item['confidence']
                ])
            else:
                doccano_dict['labels'].append(
                    [item['start_offset'], item['end_offset'], item['type']])

        file_out = open(filename + '_ner.jsonl', 'w', encoding='utf-8')
        file_out.write(json.dumps(doccano_dict))
        file_out.write('\n')
        return doccano_dict, filename + '_ner.jsonl'

    def export_generation_to_doccano(self, pathway=None):
        dict_translations = json.load(open("tools/dict_translations.json",
                                           'r'))
        filename = os.path.splitext(self.path)[0]
        pathway_jsonl = []
        for key in pathway:
            tmp_dict = {"text": '', "labels": [], "meta": ''}
            tmp_dict["text"] = key
            for step, step_dict in pathway[key].items():
                tmp_dict["meta"] = self.generation_metadata + key
                for sub_type, entities in step_dict.items():
                    label = dict_translations[
                        self.language][step] + ' - ' + dict_translations[
                            self.language][sub_type] + ': '
                    if len(entities) == 0:
                        label = label + '-'
                        tmp_dict['labels'].append(label)
                    else:
                        for entity in entities:
                            label = label + entity['entity'].strip() + ' , '
                        tmp_dict['labels'].append(label[:-2].strip())
            pathway_jsonl.append(tmp_dict)

        file_out = open(filename +
                        '_pathway.jsonl', 'w', encoding='utf-8')
        return_string = ''
        for element in pathway_jsonl:
            string_element = str(json.dumps(element, ensure_ascii=False))
            file_out.write(string_element)
            file_out.write('\n')
            return_string = return_string + string_element + '\n'
        return return_string, filename + '_pathway.jsonl'

    def keys_of_value(self, dct, value):
        for k in dct:
            if isinstance(dct[k], list):
                if value in dct[k]:
                    return k
            else:
                if value == dct[k]:
                    return k

    def annotate_sutime(self, ner_dict):
        for item in ner_dict:
            text = item['sentence']
            jar_files = os.path.join('python-sutime/', 'jars')
            sutime = sutime_mod.SUTime(jars=jar_files, mark_time_ranges=True)
            # Renamed from `json` to avoid shadowing the json module
            sutime_json = sutime.parse(text)
            time_type = self.annotation_model.check_opening_time(
                item['entities'])
            for item_sutime in sutime_json:
                if not self.annotation_model.find_overlap(
                        item['entities'], item_sutime['start'],
                        item_sutime['end']):
                    item['entities'].append({
                        'type': time_type,
                        'value': item_sutime['text'],
                        'confidence': 0.85,
                        'offset': item_sutime['start']
                    })
        return ner_dict

    def sections_to_doccano(self, sections):
        count, step = 0, 1
        doccano_dict = {'text': '', 'labels': []}
        for section in sections:
            initial_count, final_count = count, 0
            for sentence in section:
                doccano_dict['text'] = doccano_dict['text'] + sentence + '.\n'
                final_count = final_count + len(sentence) + 2
            doccano_dict['labels'].append([
                initial_count, initial_count + final_count - 1,
                'Step' + str(step)
            ])
            step = step + 1
            count = initial_count + final_count
        return doccano_dict
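# A minimal usage sketch (not part of the original class); the file name and model
# identifiers below are placeholders, not values taken from the project.
if __name__ == '__main__':
    pgr = PathwayGenerator('service_description.txt',
                           pilot='Birmingham',
                           service='example-service',
                           annotation_model='path/to/transner_model',
                           section_split_model='path/to/section_split_crossencoder')
    sections = pgr.do_split(threshold=0.5)
    annotations = pgr.do_annotate(pgr.to_list())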
                                     SentenceTransformer, util)
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import DataLoader

pd.set_option("display.max_rows", 200)
es_host = 'localhost:9200'

bi_model_path = os.path.join(os.path.dirname(__file__), os.path.pardir,
                             "bi_encoder_save/")
bi_model = SentenceTransformer(bi_model_path, device="cpu")
cross_model_path = "output/training_ms-marco_cross-encoder-xlm-roberta-base-2021-01-17_14-43-23_map-train-eval"
cross_model = CrossEncoder(cross_model_path,
                           num_labels=1,
                           max_length=512,
                           device="cpu")


class es_pandas_edit(es_pandas):
    @staticmethod
    def serialize(row, columns, use_pandas_json, iso_dates):
        if use_pandas_json:
            return json.dumps(dict(zip(columns, row)), iso_dates=iso_dates)
        return dict(
            zip(columns, [
                None if
                (all(pd.isna(r)) if
                 (hasattr(r, "__len__") and type(r) != type("")) else
                 pd.isna(r)) else r for r in row
            ]))
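# A brief usage sketch, not from the original file: score a query against a few candidate
# passages with the loaded bi-encoder and cross-encoder (the candidate list is made up).
candidates = ['first candidate passage ...', 'second candidate passage ...']
query = 'example search query'
query_emb = bi_model.encode(query, convert_to_tensor=True)
cand_embs = bi_model.encode(candidates, convert_to_tensor=True)
bi_scores = util.pytorch_cos_sim(query_emb, cand_embs)
cross_scores = cross_model.predict([[query, c] for c in candidates])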
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# First, we define the transformer model we want to fine-tune
model_name = 'microsoft/MiniLM-L12-H384-uncased'
train_batch_size = 32
num_epochs = 1
model_save_path = 'output/training_ms-marco_cross-encoder-v2-' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# We set num_labels=1 and the activation function to Identity, so that we get the raw logits
model = CrossEncoder(model_name,
                     num_labels=1,
                     max_length=512,
                     default_activation_function=torch.nn.Identity())

### Now we read the MS MARCO dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus files that contain all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get(
            'https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz',
            tar_filepath)
""" This example computes the score between a query and all possible sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS). It output then the most similar sentences for the given query. """ from sentence_transformers.cross_encoder import CrossEncoder import numpy as np # Pre-trained cross encoder model = CrossEncoder('cross-encoder/distilroberta-base-stsb') # We want to compute the similarity between the query sentence query = 'A man is eating pasta.' # With all sentences in the corpus corpus = [ 'A man is eating food.', 'A man is eating a piece of bread.', 'The girl is carrying a baby.', 'A man is riding a horse.', 'A woman is playing violin.', 'Two men pushed carts through the woods.', 'A man is riding a white horse on an enclosed ground.', 'A monkey is playing drums.', 'A cheetah is running behind its prey.' ] # So we create the respective sentence combinations sentence_combinations = [[query, corpus_sentence] for corpus_sentence in corpus] # Compute the similarity scores for these combinations similarity_scores = model.predict(sentence_combinations) # Sort the scores in decreasing order
        label_id = int(row['label'])
        train_samples.append(
            InputExample(texts=[row['premise'], row['hypothesis']],
                         label=label_id))

train_batch_size = 16
num_epochs = 10
model_save_path = 'output/training_allnli-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# Define our CrossEncoder model: we use a pre-trained XNLI cross-encoder as basis and set it
# up to predict len(label2int) labels
# model = CrossEncoder('sentence-transformers/distilbert-base-nli-stsb-mean-tokens', num_labels=len(label2int))
# model = CrossEncoder('sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking',
#                      num_labels=len(label2int))
# model = CrossEncoder('sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens', num_labels=len(label2int))
model = CrossEncoder('joeddav/xlm-roberta-large-xnli',
                     num_labels=len(label2int))

# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

# During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples,
                                                           name='AllNLI-dev')
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
# Check if dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'
if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

cross_encoder_path = 'output/cross-encoder/stsb_indomain_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder_path = 'output/bi-encoder/stsb_augsbert_BM25_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

###### Cross-encoder (sentence-transformers) ######
logging.info("Loading cross-encoder model: {}".format(model_name))
# Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) as cross-encoder model
cross_encoder = CrossEncoder(model_name, num_labels=1)

###### Bi-encoder (sentence-transformers) ######
logging.info("Loading bi-encoder model: {}".format(model_name))
# Use a Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name,
                                          max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())

bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])

#####################################################################
#
class rerankPassages:
    def __init__(self, nlp):
        self.bm25_ranking = bm25(nlp)
        self.tfidf_ranking = tfidf(nlp)
        self.sbert_ranking = sbert()
        self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")
        self.kg = KnowledgeGraph('chatbot', 'password')
        self.document = None

    def fit(self, document):
        self.document = document
        self.bm25_ranking.fit(document)
        self.tfidf_ranking.preprocessDocument(document)
        self.sbert_ranking.fit(document)

    def matchParaSent(self, s, p):
        # Return True if more than 90% of the sentence tokens appear in the paragraph
        sList = s.split()
        if len(sList) < 1:
            return False
        count = 0
        for i in sList:
            if i in p:
                count += 1
        if count / len(sList) > 0.9:
            return True
        else:
            return False

    def getSentences(self, query, n):
        return self.kg.retrieveSentences(query, n)

    def withKg(self, query, paras, t):
        sentences = self.kg.retrieveSentences(query, 10)
        for i in paras:
            avgScore = 0
            sentencesMatched = 0
            for s in sentences:
                sentence = s['sentence']
                score = s['score']
                if self.matchParaSent(sentence, i[0]):
                    if sentence not in i[0]:
                        print(sentence, i[0])
                    sentencesMatched += 1
                    avgScore += score
            # if sentencesMatched == 0: sentencesMatched = 1
            i[1] = 1 / (t + i[1]) + 1 / (t + sentencesMatched)
        paras.sort(key=lambda x: x[1])
        return [i[0] for i in paras]

    def withCrossEncoder(self, query, paras):
        para_combination = [[query, p] for p in paras]
        score = self.cross_encoder.predict(para_combination)
        sim_scores_argsort = reversed(np.argsort(score))
        reranked_passages = list()
        for idx in sim_scores_argsort:
            reranked_passages.append(paras[idx])
        return reranked_passages

    def rankDocuments(self, query, mu, k):
        bm25_scores = self.bm25_ranking.rankDocuments(query)
        tfidf_scores = self.tfidf_ranking.rankDocuments(query)
        sbert_scores = self.sbert_ranking.rankDocuments(query)
        # Combined scoring
        # mu = 0.7
        # k = 10
        rrf = mu * sbert_scores + (1 - mu) * tfidf_scores
        # rrf = 1/(k+c) + 1/(k + bm25_scores)
        # print(rrf)
        # print(np.shape(rrf))
        # Retrieve top k passages
        scores = rrf.tolist()
        score_passage = [(s, i) for i, s in enumerate(scores[0])]
        score_passage.sort(reverse=True)
        # return self.withKg(query, [[self.document[i[1]], i[0]] for i in score_passage[:4]], k)
        return self.withCrossEncoder(
            query, [self.document[i[1]] for i in score_passage[:5]])
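# A minimal usage sketch, not part of the original class: it assumes `nlp` is whatever
# pipeline object the project's bm25/tfidf wrappers expect and `passages` is a list of
# passage strings.
reranker = rerankPassages(nlp)
reranker.fit(passages)
top_passages = reranker.rankDocuments('what are the opening hours?', mu=0.7, k=10)
print(top_passages[0])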