Example #1
	def __init__(self, nlp):
		self.bm25_ranking = bm25(nlp)
		self.tfidf_ranking = tfidf(nlp)
		self.sbert_ranking = sbert()
		self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")
		self.kg = KnowledgeGraph('chatbot', 'password')
		self.document = None
Example #2
def construct_model(base_model, encoder_style):
    # word_embedding_model = models.Transformer(base_model, max_seq_length=256)
    # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    if encoder_style == BIENCODER:
        model = SentenceTransformer(base_model)
        train_loss = losses.CosineSimilarityLoss(model)
    elif encoder_style == CROSSENCODER:
        model = CrossEncoder(base_model, num_labels=1, max_length=512)
        train_loss = None
    return model, train_loss
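A minimal usage sketch for construct_model, assuming the usual sentence_transformers imports from the original module; BIENCODER and CROSSENCODER are treated as simple string constants here, their real values are defined outside this snippet:

# Hypothetical constants standing in for the ones defined elsewhere in the module.
BIENCODER = 'bi-encoder'
CROSSENCODER = 'cross-encoder'

model, train_loss = construct_model('distilroberta-base', CROSSENCODER)
# For the cross-encoder branch train_loss is None: CrossEncoder.fit() falls back
# to its default loss (BCEWithLogitsLoss when num_labels=1).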
Example #3
            dev_samples.append(
                InputExample(texts=[row['sentence_1'], row['sentence_2']],
                             label=clazz))
        elif row['split'] == 'test':
            test_samples.append(
                InputExample(texts=[row['sentence_1'], row['sentence_2']],
                             label=clazz))

train_batch_size = 16
num_epochs = 100
model_name = 'distilroberta-base'  # alternative: distilbert-base-uncased
model_save_path = 'output/training_unfolding_structure-' + datetime.now(
).strftime("%Y-%m-%d_%H-%M-%S") + '_' + model_name

# Define our CrossEncoder model. We use distilroberta-base as the base and set it up to predict a single score
model = CrossEncoder(model_name, num_labels=1)

# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=False,
                              batch_size=train_batch_size)

evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    dev_samples, name='UnfoldingStructure-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
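Once training finishes, the cross-encoder saved under model_save_path can be reloaded for inference; a minimal sketch (the sentence pair is illustrative):

trained_model = CrossEncoder(model_save_path)
# With num_labels=1 the default activation is a sigmoid, so each pair gets a
# score between 0 and 1.
print(trained_model.predict([['first sentence', 'second sentence']]))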
Example #4
config = yaml.safe_load(open('config.yml', 'r'))
os.environ["TORCH_HOME"] = config['base_model_dir']

num_labels = 2

logging.info("Processing Data ...")
if config['use_hypernym']:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_hyp_file']))
    num_labels = 3
else:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_flat_file']))
logging.info("Done Processing Data ...")

model = CrossEncoder(config['crossencoder_base_model'], num_labels=num_labels)

batch_size = config['batch_size']
num_epochs = config['num_epochs']

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = get_loss(config['loss_type'], model)
evaluator = CEBinaryAccuracyEvaluator.from_input_examples(dev_samples)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

model_dir = os.path.join(config['saved_model_dir'], config['checkpoint_path'])
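The snippet is cut off here; a sketch of the usual continuation, reusing the variables defined above (not part of the original example):

model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          loss_fct=train_loss,
          warmup_steps=warmup_steps,
          output_path=model_dir)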
Example #5
# Check if the dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

#Define our Cross-Encoder
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# We use distilroberta-base as the base model and set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder('distilroberta-base', num_labels=1)

# Read STSb dataset
logger.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1

        if row['split'] == 'dev':
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
Example #6
# First, we define the transformer model we want to fine-tune
model_name = 'google/electra-small-discriminator'
train_batch_size = 32
num_epochs = 1
model_save_path = 'output/training_ms-marco_cross-encoder-' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# We train the network on a binary label task:
# Given [query, passage] is the label 0 = irrelevant or 1 = relevant?
# We use a positive-to-negative ratio: For 1 positive sample (label 1) we include 4 negative samples (label 0)
# in our training setup. For the negative samples, we use the triplets provided by MS Marco that
# specify (query, positive sample, negative sample).
pos_neg_ration = 4
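# (Sketch, not from the original snippet.) Downstream the ratio is typically
# applied when building training pairs: one InputExample with label 1 per
# positive passage, followed by `pos_neg_ration` negatives with label 0, e.g.
#   train_samples.append(InputExample(texts=[query, pos_passage], label=1))
#   for neg_passage in neg_passages[:pos_neg_ration]:
#       train_samples.append(InputExample(texts=[query, neg_passage], label=0))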

# We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)

### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus file that contains all the passages and store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get(
            'https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz',
            tar_filepath)
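The window ends before the archive is unpacked; a sketch of the standard continuation (not part of the original snippet):

import tarfile

# Extract collection.tsv from the downloaded archive into data_folder.
with tarfile.open(tar_filepath, "r:gz") as tar:
    tar.extractall(path=data_folder)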
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Pre-trained cross encoder
model = CrossEncoder('sentence-transformers/ce-distilroberta-base-stsb')

# We want to compute the similarity between the query sentence
query = 'A man is eating pasta.'

# With all sentences in the corpus
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]

# So we create the respective sentence combinations
sentence_combinations = [[query, corpus_sentence]
                         for corpus_sentence in corpus]

# Compute the similarity scores for these combinations
similarity_scores = model.predict(sentence_combinations)

# Sort the scores in decreasing order and print the ranked sentences
sim_scores_argsort = reversed(np.argsort(similarity_scores))
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
Example #8
                    handlers=[LoggingHandler()])

config = yaml.safe_load(open('config.yml', 'r'))

num_labels = 2
if config['use_hypernym']:
    num_labels = 3

test_sets = config['test_files']
batch_size = config['batch_size']
results_dir = config['results_dir']
eval_dir = config['eval_dir']

logging.info("Loading Model ...")
model = CrossEncoder(os.path.join(config['saved_model_dir'],
                                  config['eval_base']),
                     num_labels=num_labels)
logging.info("Done Loading Model ...")

for test_set in test_sets:
    test_name = test_set.split('.')[0]
    logging.info("Reading " + test_name + " Data")
    test_data, all_sentences, all_definitions = get_test_data(
        os.path.join(eval_dir, test_set), True)
    logging.info("Computing and Writing " + test_name + " Scores")
    scores = get_crossencoder_scores(all_sentences, all_definitions,
                                     batch_size, model)
    populate_scores(test_data, scores)
    scores_dict = compute_test_metrics(test_data, False)
    out_dir = os.path.join(results_dir, config['eval_base'])
    if not os.path.exists(out_dir):
Example #9
    def __init__(self,
                 file_path,
                 pilot,
                 service,
                 use_cuda=False,
                 cuda_device=-1,
                 annotation_model=None,
                 section_split_model=None):
        ''' PathwayGenerator object constructor

        Args:
            file_path (str): path of the file from which the pathway is generated.
            pilot (str): name of the pilot.
            service (str): name of the service considered.
            use_cuda (bool): flag to run the models on GPU.
            cuda_device (int, optional): id of the GPU device to use. Defaults to -1.
            annotation_model (str, optional): name or path of the Transner annotation model.
            section_split_model (str, optional): name or path of the CrossEncoder used to split sections.
        '''

        assert file_path is not None, "A file path is required"

        languages = {
            'Larissa': 'el',
            'Birmingham': 'en',
            'Malaga': 'es',
            'Palermo': 'it'
        }

        self.path = file_path
        if os.path.splitext(self.path)[-1] == '.txt':
            self.converted_file = doc2txt.purge_urls(
                open(self.path, 'r').read(),
                os.path.splitext(self.path)[0])
        self.use_cuda = use_cuda
        self.cuda_device = cuda_device
        self.language = languages[pilot]
        # TODO: language detection param?
        if len(annotation_model) != 2:
            self.annotation_model = Transner(
                pretrained_model=annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})
        else:
            self.annotation_model = Transner(
                pretrained_model='bert_uncased_' + annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})

        self.section_split_model = CrossEncoder(section_split_model,
                                                num_labels=1)

        self.annotation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path)
        #self.generation_metadata = {
        #    'where': pilot + ' - ' + service + ' - ' + 'Where - ' + os.path.basename(self.path) + ' - ',
        #    'when': pilot + ' - ' + service + ' - ' + 'When - ' + os.path.basename(self.path) + ' - ',
        #    'how': pilot + ' - ' + service + ' - ' + 'How - ' + os.path.basename(self.path) + ' - '
        #}

        self.generation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path) + ' - '
Example #10
                                                       depth=depth,
                                                       batch_size=batch_size)
    return all_scores, psg_indices


if __name__ == "__main__":
    args = parser.parse_args()

    all_scores, psg_indices = do_first_stage_retrieval(
        args.biencoder_query_reps,
        args.biencoder_passage_reps,
        depth=args.first_stage_depth,
        batch_size=args.first_stage_batch_size)

    model = CrossEncoder(args.crossencoder_model_directory,
                         num_labels=1,
                         max_length=512)

    query_texts = []
    for row in jsonlines.open(args.test_queries):
        query_texts.append(row["text"])

    dataset_texts, dataset_ids = read_dataset_collection(
        args.search_collection)

    all_query_dataset_pairs = []
    for i, query in enumerate(query_texts):
        first_stage_doc_idxs = [int(docidx) for docidx in psg_indices[i]]
        first_stage_dataset_texts = [
            dataset_texts[docidx] for docidx in first_stage_doc_idxs
        ]
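The window ends mid-loop; a sketch of the usual continuation (not from the original), which pairs each query with its first-stage candidates and rescores the pairs with the cross-encoder:

        # Inside the loop: pair the current query with each candidate passage.
        all_query_dataset_pairs.extend(
            [[query, passage] for passage in first_stage_dataset_texts])

    # After the loop: rescore every [query, passage] pair with the cross-encoder.
    rerank_scores = model.predict(all_query_dataset_pairs)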
Example #11
class PathwayGenerator():
    def __init__(self,
                 file_path,
                 pilot,
                 service,
                 use_cuda=False,
                 cuda_device=-1,
                 annotation_model=None,
                 section_split_model=None):
        ''' PathwayGenerator object constructor

        Args:
            file_path (str): path of the file from which the pathway is generated.
            pilot (str): name of the pilot.
            service (str): name of the service considered.
            use_cuda (bool): flag to run the models on GPU.
            cuda_device (int, optional): id of the GPU device to use. Defaults to -1.
            annotation_model (str, optional): name or path of the Transner annotation model.
            section_split_model (str, optional): name or path of the CrossEncoder used to split sections.
        '''

        assert file_path is not None, "A file path is required"

        languages = {
            'Larissa': 'el',
            'Birmingham': 'en',
            'Malaga': 'es',
            'Palermo': 'it'
        }

        self.path = file_path
        if os.path.splitext(self.path)[-1] == '.txt':
            self.converted_file = doc2txt.purge_urls(
                open(self.path, 'r').read(),
                os.path.splitext(self.path)[0])
        self.use_cuda = use_cuda
        self.cuda_device = cuda_device
        self.language = languages[pilot]
        # TODO: language detection param?
        if len(annotation_model) != 2:
            self.annotation_model = Transner(
                pretrained_model=annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})
        else:
            self.annotation_model = Transner(
                pretrained_model='bert_uncased_' + annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})

        self.section_split_model = CrossEncoder(section_split_model,
                                                num_labels=1)

        self.annotation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path)
        #self.generation_metadata = {
        #    'where': pilot + ' - ' + service + ' - ' + 'Where - ' + os.path.basename(self.path) + ' - ',
        #    'when': pilot + ' - ' + service + ' - ' + 'When - ' + os.path.basename(self.path) + ' - ',
        #    'how': pilot + ' - ' + service + ' - ' + 'How - ' + os.path.basename(self.path) + ' - '
        #}

        self.generation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path) + ' - '

    def to_list(self):
        element_list = []  # Make an empty list

        for element in re.split('\n', self.converted_file):
            stripped_element = element.strip()
            if stripped_element != '':
                element_list.append(
                    stripped_element)  # Append the stripped element to the list

        return element_list

    def do_convert(self):
        self.converted_file = doc2txt.convert_to_txt(self.path)
        return self.converted_file

    def do_split(self, threshold=0.5):
        sentence_list = self.to_list()

        scores = []
        for i in range(0, len(sentence_list) - 1):
            current_sentence = sentence_list[i]
            next_sentence = sentence_list[i + 1]

            score = self.section_split_model.predict(
                [current_sentence, next_sentence])
            scores.append(score)

        sections = []  # sections = [['section1'], ['section2'], ..., ['sectionN']]
        section_text = []
        section_text.append(sentence_list[0])
        for i in range(0, len(scores)):
            if scores[i] >= threshold:
                section_text.append(sentence_list[i + 1])
            else:
                sections.append(section_text)
                section_text = []
                section_text.append(sentence_list[i + 1])
        sections.append(section_text)

        return sections
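    # Usage sketch (not part of the original class): given a CrossEncoder trained
    # to score whether two adjacent sentences belong to the same section,
    # do_split groups consecutive sentences whose pairwise score stays at or
    # above `threshold` into one section, e.g.
    #   sections = pathway_generator.do_split(threshold=0.5)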

    def do_annotate(self, sentence_list):
        self.ner_dict = self.annotation_model.ner(sentence_list,
                                                  apply_regex=True)
        if self.language in ['es', 'en']:
            self.ner_dict = self.annotate_sutime(self.ner_dict)
        else:
            self.ner_dict = self.annotation_model.find_dates(self.ner_dict)

        self.ner_dict = annotator.aggregate_dict(self.ner_dict)

        self.ner_dict['entities'] = sorted(self.ner_dict['entities'],
                                           key=lambda ner: ner['start_offset'])

        self.ner_dict = annotator.resolve_uri_entities(self.ner_dict,
                                                       self.path)

        return self.ner_dict

    def do_generate(self):
        if os.path.splitext(self.path)[-1] == '.json':
            self.ner_dict = json.load(open(self.path, 'r'))
        aggregated_ner_dict = aggregator.aggregate_entities(self.ner_dict)
        print(aggregated_ner_dict)
        #aggregated_ner_dict = self.ner_dict = {'text': 'test 1 of the section 1.\ntest 2 of the section 1.\ntest 3 of the section 1.\n', 'entities': {'LOCATION': [{'value': 'test', 'confidence': 0.9737, 'start_offset': 0, 'end_offset': 4}], 'ORGANIZATION': [{'value': 'test', 'confidence': 0.9676, 'start_offset': 25, 'end_offset': 29}], 'TIME': [{'value': 'test', 'confidence': 0.9573, 'start_offset': 50, 'end_offset': 54}]}}
        json_pathway = generator.generate(aggregated_ner_dict)
        mapped_entities = json.loads(json_pathway)

        dict_pathway = json.load(open("tools/dict_pathway.json", 'r'))

        self.pathway = {}

        #{'physical_office': [{'start', 'end'}...]}
        for key, sub_types in dict_pathway.items():
            self.pathway[key] = {}
            for sub_type in sub_types:
                self.pathway[key][sub_type] = []

        for entity in mapped_entities:
            self.pathway[self.keys_of_value(
                dict_pathway, entity['step'])][entity['step']].append(entity)

        # {'dove': [], 'come': [], 'quando': []}

        # TODO: remove this return, since the value can also be read from the PathwayGenerator object
        return self.pathway

    def export_annotation_to_doccano(self, add_confidence=False):
        filename = os.path.splitext(self.path)[0]

        doccano_dict = {}
        doccano_dict['text'] = self.ner_dict['text']
        doccano_dict['labels'] = []

        doccano_dict['meta'] = self.annotation_metadata

        for item in self.ner_dict['entities']:
            if add_confidence:
                doccano_dict['labels'].append([
                    item['start_offset'], item['end_offset'], item['type'],
                    item['confidence']
                ])
            else:
                doccano_dict['labels'].append(
                    [item['start_offset'], item['end_offset'], item['type']])

        file_out = open(filename + '_ner.jsonl', 'w', encoding='utf-8')
        file_out.write(json.dumps(doccano_dict))
        file_out.write('\n')
        file_out.close()

        return doccano_dict, filename + '_ner.jsonl'

    def export_generation_to_doccano(self, pathway=None):
        dict_translations = json.load(open("tools/dict_translations.json",
                                           'r'))

        filename = os.path.splitext(self.path)[0]
        pathway_jsonl = []

        for key in pathway:
            tmp_dict = {"text": '', "labels": [], "meta": ''}
            tmp_dict["text"] = key

            for step, step_dict in pathway[key].items():
                tmp_dict["meta"] = self.generation_metadata + key
                for sub_type, entities in step_dict.items():
                    label = dict_translations[
                        self.language][step] + ' - ' + dict_translations[
                            self.language][sub_type] + ': '
                    if len(entities) == 0:
                        label = label + '-'
                        tmp_dict['labels'].append(label)
                    else:
                        for entity in entities:
                            label = label + entity['entity'].strip() + ' , '

                        tmp_dict['labels'].append(label[:-2].strip())

            pathway_jsonl.append(tmp_dict)

        file_out = open(filename + '_pathway.jsonl', 'w', encoding='utf-8')

        return_string = ''

        for element in pathway_jsonl:
            string_element = str(json.dumps(element, ensure_ascii=False))
            file_out.write(string_element)
            file_out.write('\n')

            return_string = return_string + string_element + '\n'

        file_out.close()

        return return_string, filename + '_pathway.jsonl'

    def keys_of_value(self, dct, value):
        for k in dct:
            if isinstance(dct[k], list):
                if value in dct[k]:
                    return k
            else:
                if value == dct[k]:
                    return k

    def annotate_sutime(self, ner_dict):
        for item in ner_dict:
            text = item['sentence']
            jar_files = os.path.join('python-sutime/', 'jars')
            sutime = sutime_mod.SUTime(jars=jar_files, mark_time_ranges=True)

            sutime_json = sutime.parse(text)  # renamed to avoid shadowing the json module

            time_type = self.annotation_model.check_opening_time(
                item['entities'])

            for item_sutime in sutime_json:
                if not self.annotation_model.find_overlap(
                        item['entities'], item_sutime['start'],
                        item_sutime['end']):
                    item['entities'].append({
                        'type': time_type,
                        'value': item_sutime['text'],
                        'confidence': 0.85,
                        'offset': item_sutime['start']
                    })

        return ner_dict

    def sections_to_doccano(self, sections):
        count, step = 0, 1
        doccano_dict = {'text': '', 'labels': []}

        for section in sections:
            initial_count, final_count = count, 0

            for sentence in section:
                doccano_dict['text'] = doccano_dict['text'] + sentence + '.\n'
                final_count = final_count + len(sentence) + 2

            doccano_dict['labels'].append([
                initial_count, initial_count + final_count - 1,
                'Step' + str(step)
            ])
            step = step + 1
            count = initial_count + final_count

        return doccano_dict
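An end-to-end usage sketch for the class above; the file name, pilot/service strings and model paths are illustrative, and Transner, doc2txt, the SUTime jars and the tools/*.json files come from the surrounding project:

pg = PathwayGenerator('service_description.txt',
                      pilot='Palermo',
                      service='residence-registration',
                      annotation_model='multilang_uncased',
                      section_split_model='output/section-split-crossencoder')
sections = pg.do_split(threshold=0.5)        # CrossEncoder-based section segmentation
ner_dict = pg.do_annotate(pg.to_list())      # Transner NER (+ date handling)
pathway = pg.do_generate()                   # map entities onto the pathway structure
pg.export_generation_to_doccano(pathway)     # writes <file>_pathway.jsonl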
Example #12
                                   SentenceTransformer, util)
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import DataLoader

pd.set_option("display.max_rows", 200)
es_host = 'localhost:9200'

bi_model_path = os.path.join(os.path.dirname("__file__"), os.path.pardir,
                             "bi_encoder_save/")
bi_model = SentenceTransformer(bi_model_path, device="cpu")

cross_model_path = "output/training_ms-marco_cross-encoder-xlm-roberta-base-2021-01-17_14-43-23_map-train-eval"
cross_model = CrossEncoder(cross_model_path,
                           num_labels=1,
                           max_length=512,
                           device="cpu")


class es_pandas_edit(es_pandas):
    @staticmethod
    def serialize(row, columns, use_pandas_json, iso_dates):
        if use_pandas_json:
            return json.dumps(dict(zip(columns, row)), iso_dates=iso_dates)
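        # Replace NaN cells (scalars, or array-like cells that are entirely NaN)
        # with None so the row serializes cleanly for Elasticsearch.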
        return dict(
            zip(columns, [
                None if (all(pd.isna(r)) if
                         (hasattr(r, "__len__")
                          and type(r) != type("")) else pd.isna(r)) else r
                for r in row
            ]))
Example #13
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout


#First, we define the transformer model we want to fine-tune
model_name = 'microsoft/MiniLM-L12-H384-uncased'
train_batch_size = 32
num_epochs = 1
model_save_path = 'output/training_ms-marco_cross-encoder-v2-'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



# We set num_labels=1 and the activation function to Identity, so that we get the raw logits
model = CrossEncoder(model_name, num_labels=1, max_length=512, default_activation_function=torch.nn.Identity())


### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)


#### Read the corpus file that contains all the passages and store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)
Example #14
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Pre-trained cross encoder
model = CrossEncoder('cross-encoder/distilroberta-base-stsb')

# We want to compute the similarity between the query sentence
query = 'A man is eating pasta.'

# With all sentences in the corpus
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]

# So we create the respective sentence combinations
sentence_combinations = [[query, corpus_sentence]
                         for corpus_sentence in corpus]

# Compute the similarity scores for these combinations
similarity_scores = model.predict(sentence_combinations)

# Sort the scores in decreasing order and print the ranked sentences
sim_scores_argsort = reversed(np.argsort(similarity_scores))
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
Example #15
    label_id = int(row['label'])
    train_samples.append(
        InputExample(texts=[row['premise'], row['hypothesis']],
                     label=label_id))

train_batch_size = 16
num_epochs = 10
model_save_path = 'output/training_allnli-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# Define our CrossEncoder model and set it up to predict len(label2int) labels
# model = CrossEncoder('sentence-transformers/distilbert-base-nli-stsb-mean-tokens', num_labels=len(label2int))
# model = CrossEncoder('sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking',
#                      num_labels=len(label2int))
# model = CrossEncoder('sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens', num_labels=len(label2int))
model = CrossEncoder('joeddav/xlm-roberta-large-xnli',
                     num_labels=len(label2int))

# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

# During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples,
                                                           name='AllNLI-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
Example #16
# Check if the dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

cross_encoder_path = 'output/cross-encoder/stsb_indomain_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder_path = 'output/bi-encoder/stsb_augsbert_BM25_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

###### Cross-encoder (sentence-transformers) ######
logging.info("Loading sentence-transformers model: {}".format(model_name))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for cross-encoder model
cross_encoder = CrossEncoder(model_name, num_labels=1)

###### Bi-encoder (sentence-transformers) ######
logging.info("Loading bi-encoder model: {}".format(model_name))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name,
                                          max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())

bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
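# Sketch (not part of the original script): the bi-encoder can now embed
# sentences independently, e.g.
#   embeddings = bi_encoder.encode(['sentence a', 'sentence b'], convert_to_tensor=True)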

#####################################################################
#
Example #17
class rerankPassages:

	def __init__(self, nlp):
		self.bm25_ranking = bm25(nlp)
		self.tfidf_ranking = tfidf(nlp)
		self.sbert_ranking = sbert()
		self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")
		self.kg = KnowledgeGraph('chatbot', 'password')
		self.document = None
	
	def fit(self, document):
		self.document = document
		self.bm25_ranking.fit(document)
		self.tfidf_ranking.preprocessDocument(document)
		self.sbert_ranking.fit(document)
	
	def matchParaSent(self, s, p):
		
		sList = s.split()
		if len(sList) < 1:
			return False
		count = 0
		for i in sList:
			if i in p:
				count += 1

		return count / len(sList) > 0.9

	def getSentences(self, query, n):
		return self.kg.retrieveSentences(query, n)

	def withKg(self, query, paras, t):
		sentences = self.kg.retrieveSentences(query, 10)

		for i in paras:
			avgScore = 0
			sentencesMatched = 0
			for s in sentences:
				sentence = s['sentence']
				score = s['score']
				if self.matchParaSent(sentence, i[0]):
					if sentence not in i[0]: print(sentence, i[0])
					# print(sentence, i[0])
					sentencesMatched += 1
					avgScore += score
			# if sentencesMatched == 0: sentencesMatched = 1
			i[1] = 1/(t + i[1]) + 1/(t + sentencesMatched)

		paras.sort(key=lambda x: x[1])
		return [i[0] for i in paras]

	def withCrossEncoder(self, query, paras):
		para_combination = [[query, p] for p in paras]

		score = self.cross_encoder.predict(para_combination)
		sim_scores_argsort = reversed(np.argsort(score))
		
		reranked_passages = list()
		for idx in sim_scores_argsort:
			reranked_passages.append(paras[idx])
		return reranked_passages

	def rankDocuments(self, query, mu, k):
		bm25_scores = self.bm25_ranking.rankDocuments(query)
		tfidf_scores = self.tfidf_ranking.rankDocuments(query)
		sbert_scores = self.sbert_ranking.rankDocuments(query)
		#Combined scoring
		# mu = 0.7
		# k = 10
		rrf = mu*sbert_scores + (1-mu)*tfidf_scores
		# rrf = 1/(k+c) + 1/(k + bm25_scores)
		# print(rrf)
		# print(np.shape(rrf))
		# retrieve top k passages
		scores = rrf.tolist()
		score_passage = [(s,i) for i, s in enumerate(scores[0])]
		score_passage.sort(reverse = True)
		# return self.withKg(query, [[self.document[i[1]], i[0]] for i in score_passage[:4]], k)
		return self.withCrossEncoder(query, [self.document[i[1]] for i in score_passage[:5]])
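
	# Usage sketch (illustrative; the bm25/tfidf/sbert helpers, KnowledgeGraph and
	# the nlp object come from the surrounding project):
	#   reranker = rerankPassages(nlp)
	#   reranker.fit(passages)
	#   top_passages = reranker.rankDocuments(query, mu=0.7, k=10)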