Example #1
	def __init__(self, nlp):
		self.bm25_ranking = bm25(nlp)
		self.tfidf_ranking = tfidf(nlp)
		self.sbert_ranking = sbert()
		self.cross_encoder = CrossEncoder("cross-encoder/ms-marco-TinyBERT-L-6")
		self.kg = KnowledgeGraph('chatbot', 'password')
		self.document = None
Example #2
def construct_model(base_model, encoder_style):
    # word_embedding_model = models.Transformer(base_model, max_seq_length=256)
    # pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    # model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    if encoder_style == BIENCODER:
        model = SentenceTransformer(base_model)
        train_loss = losses.CosineSimilarityLoss(model)
    elif encoder_style == CROSSENCODER:
        model = CrossEncoder(base_model, num_labels=1, max_length=512)
        train_loss = None
    else:
        raise ValueError('Unknown encoder_style: {}'.format(encoder_style))
    return model, train_loss
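A minimal usage sketch (not part of the original snippet): BIENCODER and CROSSENCODER are not defined above, so they are assumed here to be simple module-level string constants.

# Assumed constants and an illustrative call; these names are hypothetical.
BIENCODER = 'bi-encoder'
CROSSENCODER = 'cross-encoder'

model, train_loss = construct_model('distilroberta-base', BIENCODER)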
Example #3
        if row['split'] == 'dev':
            dev_samples.append(
                InputExample(texts=[row['sentence_1'], row['sentence_2']],
                             label=clazz))
        elif row['split'] == 'test':
            test_samples.append(
                InputExample(texts=[row['sentence_1'], row['sentence_2']],
                             label=clazz))

train_batch_size = 16
num_epochs = 100
model_name = 'distilroberta-base'  #distilbert-base-uncased
model_save_path = 'output/training_unfolding_structure-' + datetime.now(
).strftime("%Y-%m-%d_%H-%M-%S") + '_' + model_name

#Define our CrossEncoder model. We use distilroberta-base as base model and set it up to predict a single score
model = CrossEncoder(model_name, num_labels=1)

#We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

evaluator = CEBinaryClassificationEvaluator.from_input_examples(
    dev_samples, name='UnfoldingStructure-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
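# e.g. with 500 train batches per epoch and num_epochs = 100:
# warmup_steps = ceil(500 * 100 * 0.1) = 5000, i.e. 10% of all optimizer steps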
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
Example #4
with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)
os.environ["TORCH_HOME"] = config['base_model_dir']

num_labels = 2

logging.info("Processing Data ...")
if config['use_hypernym']:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_hyp_file']))
    num_labels = 3
else:
    train_samples, dev_samples = get_train_dev_data(
        config, os.path.join(config['train_dir'], config['train_flat_file']))
logging.info("Done Processing Data ...")

model = CrossEncoder(config['crossencoder_base_model'], num_labels=num_labels)

batch_size = config['batch_size']
num_epochs = config['num_epochs']

train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = get_loss(config['loss_type'], model)
evaluator = CEBinaryAccuracyEvaluator.from_input_examples(dev_samples)

warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

model_dir = os.path.join(config['saved_model_dir'], config['checkpoint_path'])
Example #5
#Check if dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

#Define our Cross-Encoder
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_stsbenchmark-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

#We use distilroberta-base as base model and set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder('distilroberta-base', num_labels=1)

# Read STSb dataset
logger.info("Read STSbenchmark train dataset")

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1

        if row['split'] == 'dev':
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))
Example #6
#First, we define the transformer model we want to fine-tune
model_name = 'google/electra-small-discriminator'
train_batch_size = 32
num_epochs = 1
model_save_path = 'output/training_ms-marco_cross-encoder-' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# We train the network as a binary label task:
# Given [query, passage], is the label 0 = irrelevant or 1 = relevant?
# We use a positive-to-negative ratio: for 1 positive sample (label 1) we include 4 negative samples (label 0)
# in our training setup. For the negative samples, we use the triplets provided by MS MARCO that
# specify (query, positive sample, negative sample).
pos_neg_ratio = 4
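A hedged sketch of how such a ratio could be applied when expanding MS MARCO triplets into binary InputExamples; `triplets` and `build_samples` are hypothetical names, not part of the original script.

from sentence_transformers import InputExample

def build_samples(triplets, pos_neg_ratio=4):
    # triplets: iterable of (query, positive_passage, list_of_negative_passages)
    samples = []
    for query, positive, negatives in triplets:
        samples.append(InputExample(texts=[query, positive], label=1))
        for negative in negatives[:pos_neg_ratio]:
            samples.append(InputExample(texts=[query, negative], label=0))
    return samples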

#We set num_labels=1, which predicts a continuous score between 0 and 1
model = CrossEncoder(model_name, num_labels=1, max_length=512)

### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)

#### Read the corpus file that contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get(
            'https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz',
            tar_filepath)
Example #7
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Pre-trained cross encoder
model = CrossEncoder('cross-encoder/stsb-distilroberta-base')

# We want to compute the similarity between the query sentence
query = 'A man is eating pasta.'

# With all sentences in the corpus
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]

# So we create the respective sentence combinations
sentence_combinations = [[query, corpus_sentence]
                         for corpus_sentence in corpus]

# Compute the similarity scores for these combinations
similarity_scores = model.predict(sentence_combinations)

# Sort the scores in decreasing order
sim_scores_argsort = reversed(np.argsort(similarity_scores))

# Print the scores
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
Example #8
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout


#First, we define the transformer model we want to fine-tune
model_name = 'microsoft/MiniLM-L12-H384-uncased'
train_batch_size = 32
num_epochs = 1
model_save_path = 'output/training_ms-marco_cross-encoder-v2-'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")



#We set num_labels=1 and set the activation function to Identity, so that we get the raw logits
model = CrossEncoder(model_name, num_labels=1, max_length=512, default_activation_function=torch.nn.Identity())
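Since the activation is the identity, model.predict returns raw logits rather than 0..1 scores; a minimal sketch (the query/passage pair is illustrative) of mapping them to probabilities:

logits = model.predict([['what is python', 'Python is a programming language.']])
probabilities = torch.sigmoid(torch.tensor(logits))  # map raw logits to 0..1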


### Now we read the MS Marco dataset
data_folder = 'msmarco-data'
os.makedirs(data_folder, exist_ok=True)


#### Read the corpus file that contains all the passages. Store them in the corpus dict
corpus = {}
collection_filepath = os.path.join(data_folder, 'collection.tsv')
if not os.path.exists(collection_filepath):
    tar_filepath = os.path.join(data_folder, 'collection.tar.gz')
    if not os.path.exists(tar_filepath):
        logging.info("Download collection.tar.gz")
        util.http_get('https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz', tar_filepath)
Example #9
        if row['split'] == 'train':
            train_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label_id))
        else:
            dev_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label_id))

train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_allnli-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

#Define our CrossEncoder model. We use distilroberta-base as base model and set it up to predict 3 labels
model = CrossEncoder('distilroberta-base', num_labels=len(label2int))

#We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

#During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples,
                                                           name='AllNLI-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  #10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
Example #10
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Pre-trained cross encoder
model = CrossEncoder('sentence-transformers/ce-distilroberta-base-stsb')

# We want to compute the similarity between the query sentence
query = 'A man is eating pasta.'

# With all sentences in the corpus
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]

# So we create the respective sentence combinations
sentence_combinations = [[query, corpus_sentence]
                         for corpus_sentence in corpus]

# Compute the similarity scores for these combinations
similarity_scores = model.predict(sentence_combinations)

# Sort the scores in decreasing order
sim_scores_argsort = reversed(np.argsort(similarity_scores))

# Print the scores
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
Example #11
    label_id = int(row['label'])
    train_samples.append(
        InputExample(texts=[row['premise'], row['hypothesis']],
                     label=label_id))

train_batch_size = 16
num_epochs = 10
model_save_path = 'output/training_allnli-' + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

# Define our CrossEncoder model and set it up to predict 3 labels
# model = CrossEncoder('sentence-transformers/distilbert-base-nli-stsb-mean-tokens', num_labels=len(label2int))
# model = CrossEncoder('sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking',
#                      num_labels=len(label2int))
# model = CrossEncoder('sentence-transformers/xlm-r-100langs-bert-base-nli-mean-tokens', num_labels=len(label2int))
model = CrossEncoder('joeddav/xlm-roberta-large-xnli',
                     num_labels=len(label2int))

# We wrap train_samples, which is a list of InputExample, in a PyTorch DataLoader
train_dataloader = DataLoader(train_samples,
                              shuffle=True,
                              batch_size=train_batch_size)

# During training, we use CESoftmaxAccuracyEvaluator to measure the accuracy on the dev set.
evaluator = CESoftmaxAccuracyEvaluator.from_input_examples(dev_samples,
                                                           name='AllNLI-dev')

warmup_steps = math.ceil(len(train_dataloader) * num_epochs *
                         0.1)  # 10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
Example #12
#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])

with open('config.yml', 'r') as f:
    config = yaml.safe_load(f)

num_labels = 2
if config['use_hypernym']:
    num_labels = 3

test_sets = config['test_files']
batch_size = config['batch_size']
results_dir = config['results_dir']
eval_dir = config['eval_dir']

logging.info("Loading Model ...")
model = CrossEncoder(os.path.join(config['saved_model_dir'],
                                  config['eval_base']),
                     num_labels=num_labels)
logging.info("Done Loading Model ...")

for test_set in test_sets:
    test_name = test_set.split('.')[0]
    logging.info("Reading " + test_name + " Data")
    test_data, all_sentences, all_definitions = get_test_data(
        os.path.join(eval_dir, test_set), True)
    logging.info("Computing and Writing " + test_name + " Scores")
    scores = get_crossencoder_scores(all_sentences, all_definitions,
                                     batch_size, model)
    populate_scores(test_data, scores)
    scores_dict = compute_test_metrics(test_data, False)
    out_dir = os.path.join(results_dir, config['eval_base'])
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
Example #13
                                                       depth=depth,
                                                       batch_size=batch_size)
    return all_scores, psg_indices


if __name__ == "__main__":
    args = parser.parse_args()

    all_scores, psg_indices = do_first_stage_retrieval(
        args.biencoder_query_reps,
        args.biencoder_passage_reps,
        depth=args.first_stage_depth,
        batch_size=args.first_stage_batch_size)

    model = CrossEncoder(args.crossencoder_model_directory,
                         num_labels=1,
                         max_length=512)

    query_texts = []
    for row in jsonlines.open(args.test_queries):
        query_texts.append(row["text"])

    dataset_texts, dataset_ids = read_dataset_collection(
        args.search_collection)

    all_query_dataset_pairs = []
    for i, query in enumerate(query_texts):
        first_stage_doc_idxs = [int(docidx) for docidx in psg_indices[i]]
        first_stage_dataset_texts = [
            dataset_texts[docidx] for docidx in first_stage_doc_idxs
        ]
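The snippet breaks off mid-loop; a hedged sketch of the likely continuation, pairing each query with its first-stage candidates and re-scoring the pairs with the cross-encoder (variable names follow the snippet, the batch size is illustrative):

        for text in first_stage_dataset_texts:
            all_query_dataset_pairs.append([query, text])

    # Score every (query, candidate) pair with the cross-encoder
    rerank_scores = model.predict(all_query_dataset_pairs, batch_size=64)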
Example #14
    def __init__(self,
                 file_path,
                 pilot,
                 service,
                 use_cuda=False,
                 cuda_device=-1,
                 annotation_model=None,
                 section_split_model=None):
        ''' PathwayGenerator object constructor

        Args:
            file_path (str): path of the file from which the pathway is generated.
            pilot (str): name of the pilot.
            service (str): name of the service considered.
            use_cuda (bool): flag to run the models on GPU.
            cuda_device (int, optional): id of the GPU device to use. Defaults to -1.
            annotation_model (str, optional): pretrained Transner model name, or a two-letter language code.
            section_split_model (str, optional): pretrained CrossEncoder model used for section splitting.
        '''

        assert file_path is not None, "A file path is required"

        languages = {
            'Larissa': 'el',
            'Birmingham': 'en',
            'Malaga': 'es',
            'Palermo': 'it'
        }

        self.path = file_path
        if os.path.splitext(self.path)[-1] == '.txt':
            with open(self.path, 'r') as f:
                self.converted_file = doc2txt.purge_urls(
                    f.read(), os.path.splitext(self.path)[0])
        self.use_cuda = use_cuda
        self.cuda_device = cuda_device
        self.language = languages[pilot]
        # TODO: language detection param?
        if len(annotation_model) != 2:
            self.annotation_model = Transner(
                pretrained_model=annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})
        else:
            self.annotation_model = Transner(
                pretrained_model='bert_uncased_' + annotation_model,
                use_cuda=use_cuda,
                cuda_device=cuda_device,
                language_detection=True,
                threshold=0.85,
                args={"use_multiprocessing": False})

        self.section_split_model = CrossEncoder(section_split_model,
                                                num_labels=1)

        self.annotation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path)
        #self.generation_metadata = {
        #    'where': pilot + ' - ' + service + ' - ' + 'Where - ' + os.path.basename(self.path) + ' - ',
        #    'when': pilot + ' - ' + service + ' - ' + 'When - ' + os.path.basename(self.path) + ' - ',
        #    'how': pilot + ' - ' + service + ' - ' + 'How - ' + os.path.basename(self.path) + ' - '
        #}

        self.generation_metadata = pilot + ' - ' + service + ' - ' + os.path.basename(
            self.path) + ' - '
Example #15
import json
import os

import pandas as pd
from es_pandas import es_pandas
from sentence_transformers import SentenceTransformer, util
from sentence_transformers.cross_encoder import CrossEncoder
from sentence_transformers.cross_encoder.evaluation import CERerankingEvaluator
from sklearn.metrics import balanced_accuracy_score
from torch.utils.data import DataLoader

pd.set_option("display.max_rows", 200)
es_host = 'localhost:9200'

bi_model_path = os.path.join(os.path.dirname("__file__"), os.path.pardir,
                             "bi_encoder_save/")
bi_model = SentenceTransformer(bi_model_path, device="cpu")

cross_model_path = "output/training_ms-marco_cross-encoder-xlm-roberta-base-2021-01-17_14-43-23_map-train-eval"
cross_model = CrossEncoder(cross_model_path,
                           num_labels=1,
                           max_length=512,
                           device="cpu")


class es_pandas_edit(es_pandas):
    @staticmethod
    def serialize(row, columns, use_pandas_json, iso_dates):
        if use_pandas_json:
            return json.dumps(dict(zip(columns, row)), iso_dates=iso_dates)
        # Replace missing values with None so they serialize as JSON null;
        # non-string sequences count as missing only when every element is NaN.
        return dict(
            zip(columns, [
                None if (all(pd.isna(r)) if
                         (hasattr(r, "__len__")
                          and type(r) != type("")) else pd.isna(r)) else r
                for r in row
            ]))
Example #16
"""
This example computes the score between a query and all possible
sentences in a corpus using a Cross-Encoder for semantic textual similarity (STS).
It then outputs the most similar sentences for the given query.
"""
from sentence_transformers.cross_encoder import CrossEncoder
import numpy as np

# Pre-trained cross encoder
model = CrossEncoder('cross-encoder/distilroberta-base-stsb')

# We want to compute the similarity between the query sentence
query = 'A man is eating pasta.'

# With all sentences in the corpus
corpus = [
    'A man is eating food.', 'A man is eating a piece of bread.',
    'The girl is carrying a baby.', 'A man is riding a horse.',
    'A woman is playing violin.', 'Two men pushed carts through the woods.',
    'A man is riding a white horse on an enclosed ground.',
    'A monkey is playing drums.', 'A cheetah is running behind its prey.'
]

# So we create the respective sentence combinations
sentence_combinations = [[query, corpus_sentence]
                         for corpus_sentence in corpus]

# Compute the similarity scores for these combinations
similarity_scores = model.predict(sentence_combinations)

# Sort the scores in decreasing order
sim_scores_argsort = reversed(np.argsort(similarity_scores))

# Print the scores
print("Query:", query)
for idx in sim_scores_argsort:
    print("{:.2f}\t{}".format(similarity_scores[idx], corpus[idx]))
Example #17
#Check if dataset exists. If not, download and extract it
sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

cross_encoder_path = 'output/cross-encoder/stsb_indomain_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
bi_encoder_path = 'output/bi-encoder/stsb_augsbert_BM25_' + model_name.replace(
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

###### Cross-encoder (sentence-transformers) ######
logging.info("Loading cross-encoder model: {}".format(model_name))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for cross-encoder model
cross_encoder = CrossEncoder(model_name, num_labels=1)

###### Bi-encoder (sentence-transformers) ######
logging.info("Loading bi-encoder model: {}".format(model_name))
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name,
                                          max_seq_length=max_seq_length)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())

bi_encoder = SentenceTransformer(modules=[word_embedding_model, pooling_model])
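A minimal usage sketch (the sentences are illustrative, and util is assumed to be imported from sentence_transformers): the composed bi-encoder maps each sentence to a single fixed-size vector via mean pooling, which can then be compared with cosine similarity.

embeddings = bi_encoder.encode(['A man is eating pasta.', 'A man is eating food.'])
print(util.cos_sim(embeddings[0], embeddings[1]))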

Example #18
logger.info("Read dev dataset")
dev_samples = []
with open(os.path.join(dataset_path, 'classification', 'dev_pairs.tsv'), 'r', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        dev_samples.append(InputExample(texts=[row['question1'], row['question2']], label=int(row['is_duplicate'])))


#Configuration
train_batch_size = 16
num_epochs = 4
model_save_path = 'output/training_quora-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


#We use distilroberta-base with a single label, i.e., it will output a value between 0 and 1 indicating the similarity of the two questions
model = CrossEncoder('distilroberta-base', num_labels=1)

# We wrap train_samples (which is a List[InputExample]) into a PyTorch DataLoader
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)


# We add an evaluator, which evaluates the performance during training
evaluator = CEBinaryClassificationEvaluator.from_input_examples(dev_samples, name='Quora-dev')


# Configure the training
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logger.info("Warmup-steps: {}".format(warmup_steps))


# Train the model
model.fit(train_dataloader=train_dataloader,
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)