# Build one MSE evaluator and one translation-accuracy evaluator per dev file.
for dev_file in dev_files:
    logging.info("Create evaluator for %s", dev_file)  # lazy %-args per logging best practice
    src_sentences = []
    trg_sentences = []
    # Each line is expected to be "source<TAB>target"; collect only complete pairs.
    with gzip.open(dev_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            # Check len(splits) first: a line without a tab yields a single-element
            # list and splits[1] would raise IndexError.
            if len(splits) >= 2 and splits[0] != "" and splits[1] != "":
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])

    # Mean Squared Error (MSE) measures the (euclidean) distance between teacher and student embeddings
    dev_mse = evaluation.MSEEvaluator(src_sentences,
                                      trg_sentences,
                                      name=os.path.basename(dev_file),
                                      teacher_model=teacher_model,
                                      batch_size=inference_batch_size)
    evaluators.append(dev_mse)

    # TranslationEvaluator computes the embeddings for all parallel sentences. It then checks
    # whether the embedding of source[i] is the closest to target[i] out of all available target sentences.
    dev_trans_acc = evaluation.TranslationEvaluator(
        src_sentences,
        trg_sentences,
        name=os.path.basename(dev_file),
        batch_size=inference_batch_size)
    evaluators.append(dev_trans_acc)

##### Read cross-lingual Semantic Textual Similarity (STS) data ####
# Every language that appears on either the source or the target side.
all_languages = list(set(source_languages) | set(target_languages))
sts_data = {}
# Esempio n. 2 (Example 2 — snippet separator and "0" vote count left over from the scraped source)
logging.info("Read STS2017.en-de dataset")
evaluators = []

# STS file layout: column 0 and 1 hold the sentence pair, column 2 the gold similarity score.
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
sts_examples = sts_reader.get_examples('STS2017.en-de.txt.gz')
dev_data = SentencesDataset(examples=sts_examples, model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)

# Correlates predicted embedding similarities with the gold STS scores.
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de')
evaluators.append(evaluator_sts)


# Use XLNI.en-de dataset with MSE evaluation
logging.info("Read XNLI.en-de dataset")
xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
xnli_reader.load_data('../datasets/xnli-en-de.txt.gz')

xnli_dataloader = DataLoader(xnli_reader, shuffle=False, batch_size=train_batch_size)
xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de')
evaluators.append(xnli_mse)



# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=20,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
          save_best_model=True,
          optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
          )
    logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
    dev_evaluator_sts(teacher_model)



# Knowledge distillation (same approach as https://arxiv.org/abs/2004.09813):
# a large sentence collection is embedded with the teacher model, and the
# student is trained to reproduce those embeddings.
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=False)
for corpus in (train_sentences_nli, train_sentences_wikipedia):
    train_data.add_dataset([[sentence] for sentence in corpus], max_sentence_length=256)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)

# Evaluator: Mean Squared Error between teacher and student embeddings on held-out sentences.
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(dev_sentences, dev_sentences, teacher_model=teacher_model)

# Train the student model to imitate the teacher, tracking both the STS
# correlation and the teacher/student MSE during evaluation.
distill_evaluator = evaluation.SequentialEvaluator([dev_evaluator_sts, dev_evaluator_mse])
student_model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=distill_evaluator,
    epochs=1,
    warmup_steps=1000,
    evaluation_steps=5000,
    output_path=output_path,
    save_best_model=True,
    optimizer_params={'lr': 1e-4, 'eps': 1e-6, 'correct_bias': False},
    use_amp=True,
)

# Esempio n. 4 (Example 4 — snippet separator and "0" vote count left over from the scraped source)
                         max_sentences=None,
                         max_sentence_length=train_conf.getint('MaxSentLen'))
    # Batches of parallel sentences for the distillation loss.
    train_dataloader = DataLoader(train_data,
                                  batch_size=train_conf.getint('BatchSize'))

    #train_loss = CosineSimilarityLoss(model=student_model)
    # Student is trained to minimize MSE against the teacher's embeddings.
    train_loss = losses.MSELoss(model=student)

    logging.info('Assembling evaluator')
    # Dev set is a headerless TSV; quoting=3 is csv.QUOTE_NONE, so embedded
    # quote characters are kept verbatim.
    df = pd.read_csv(data_path / datasets['DevSet'],
                     sep='\t',
                     header=None,
                     quoting=3)
    # Columns 0 and 1 feed the evaluator as source/target sentence lists —
    # presumably a parallel sentence pair per row; verify against the data file.
    dev_mse_evaluator = evaluation.MSEEvaluator(
        df.iloc[:, 0],
        df.iloc[:, 1],
        name='Dev-MSE-evaluator',
        teacher_model=teacher,
        batch_size=eval_conf.getint('BatchSize'))

    logging.info('Fitting..')
    # Timestamp (fixed UTC+2 offset) appended to output_path so repeated
    # runs write to distinct directories.
    dt = datetime.datetime.now(tz=datetime.timezone(datetime.timedelta(
        hours=2))).strftime("%Y-%m-%d-%H:%M:%S")
    output_path = output_path / dt

    student.fit(train_objectives=[(train_dataloader, train_loss)],
                evaluator=dev_mse_evaluator,
                epochs=train_conf.getint('Epochs'),
                steps_per_epoch=train_conf.getint('Steps'),
                scheduler=config['SCHEDULER']['Scheduler'],
                warmup_steps=train_conf.getint('WarmUp'),
                evaluation_steps=eval_conf.getint('EvalSteps'),