######## Start the extension of the teacher model to multiple languages ########
logging.info("Load teacher model")
teacher_model = SentenceTransformer(teacher_model_name)

logging.info("Create student model from scratch")
word_embedding_model = models.Transformer(student_model_name,
                                          max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model])
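# (Added note) Mean pooling averages the token embeddings, weighted by the attention mask,
# into a single fixed-size sentence vector, so the student outputs one embedding per
# sentence just like the teacher does.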

###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(student_model=student_model,
                                      teacher_model=teacher_model,
                                      batch_size=inference_batch_size,
                                      use_embedding_cache=True)
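# (Added note) use_embedding_cache=True caches the teacher embeddings, so a sentence that
# appears in several of the parallel files is only embedded once by the teacher.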
for train_file in train_files:
    train_data.load_data(train_file,
                         max_sentences=max_sentences_per_language,
                         max_sentence_length=train_max_sentence_length)
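# (Added note) Each train_file is expected to be a tab-separated parallel file (plain text
# or .gz), one pair per line: the source sentence first, followed by one or more
# translations of it.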

train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)
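# (Added note) MSELoss regresses the student's embedding of every sentence, source and
# translation alike, onto the teacher's embedding of the source sentence; this is the
# multilingual knowledge-distillation objective of Reimers & Gurevych (2020).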

##### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators holds the different evaluators that we call periodically during training
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# logging.info("Loading previously trained student-teacher model")
# model = SentenceTransformer('models/hindi-sxlmr-stmodel')

output_path = 'models/se-asian-sbert'

logging.info("Create dataset reader")

###### Read Dataset ######
train_file_path = 'train_southeast_asian_parallel_corpus.txt'
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
train_data.load_data(train_file_path)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######

# Build a dev set from Hindi claim pairs
logging.info("Read dev dataset")
evaluators = []
claim_pair_reader = ClaimPairDataReader()
dev_data = SentencesDataset(examples=claim_pair_reader.get_examples(split='train', language='hi'), model=model)
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
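# (Hedged sketch, not in the original fragment) Assuming the claim pairs carry similarity
# scores, the dev data can be wrapped in a dataloader and a similarity evaluator and
# registered in `evaluators` so it is run periodically during fit(); the evaluator name
# is made up here for illustration.
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluators.append(evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='claim-pairs-hi'))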

######## Example #3 ########

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

output_path = "output/make-multilingual-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logging.info("Create dataset reader")


###### Read Dataset ######
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)


###### Load dev sets ######

# Test on STS 2017.en-de dataset using Spearman rank correlation
logging.info("Read STS2017.en-de dataset")
evaluators = []
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
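
# (Hedged sketch, not in the original fragment) With the older sentence-transformers API
# used above, the dev dataloader is typically wrapped in an EmbeddingSimilarityEvaluator
# (Spearman correlation on the STS scores) and passed to fit() alongside the MSE loss;
# num_epochs and num_warmup_steps are assumed names.
evaluators.append(evaluation.EmbeddingSimilarityEvaluator(dev_dataloader))
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators),
          epochs=num_epochs,
          warmup_steps=num_warmup_steps,
          output_path=output_path)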

######## Example #4 ########
# (Reconstructed opening, hedged) The source fragment starts mid-block; the assumed context
# is a PCA fitted on teacher embeddings of a sample of the training sentences, so the teacher
# can be projected down to the student's embedding size (assumes `from sklearn.decomposition import PCA`).
if student_model.get_sentence_embedding_dimension() < teacher_model.get_sentence_embedding_dimension():
    pca_embeddings = teacher_model.encode(train_sentences_nli[0:20000] + train_sentences_wikipedia[0:20000], convert_to_numpy=True)
    pca = PCA(n_components=student_model.get_sentence_embedding_dimension())
    pca.fit(pca_embeddings)

    #Add Dense layer to teacher that projects the embeddings down to the student embedding size
    dense = models.Dense(in_features=teacher_model.get_sentence_embedding_dimension(), out_features=student_model.get_sentence_embedding_dimension(), bias=False, activation_function=torch.nn.Identity())
    dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_))
    teacher_model.add_module('dense', dense)

    logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension()))
    dev_evaluator_sts(teacher_model)
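    # (Added note) add_module appends the Dense down-projection to the teacher's module
    # list, so every subsequent teacher_model.encode() call already returns embeddings in
    # the student's dimensionality.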



# We train the student_model such that it creates sentence embeddings similar to the embeddings from the teacher_model
# For this, we need a large set of sentences. These sentences are embedded using the teacher model,
# and the student tries to mimic these embeddings. It is the same approach as used in: https://arxiv.org/abs/2004.09813
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=False)
train_data.add_dataset([[sent] for sent in train_sentences_nli], max_sentence_length=256)
train_data.add_dataset([[sent] for sent in train_sentences_wikipedia], max_sentence_length=256)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)

# We create an evaluator that measures the Mean Squared Error (MSE) between the teacher and the student embeddings
dev_sentences = dev_sentences_nli + dev_sentences_wikipedia
dev_evaluator_mse = evaluation.MSEEvaluator(dev_sentences, dev_sentences, teacher_model=teacher_model)
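# (Added note) MSEEvaluator embeds the first argument with the teacher and the second with
# the student and reports the mean squared error between them (negated, so that a higher
# score is better for model selection); here both are the same monolingual dev sentences.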

# Train the student model to imitate the teacher
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator([dev_evaluator_sts, dev_evaluator_mse]),
                  epochs=1,
                  warmup_steps=1000)

######## Example #5 ########
    logger.info('Downloading required datasets...')
    download_datasets(datasets)

    # print(config.sections())

    logger.info('Loading teacher model...')
    teacher = SentenceTransformer(model_names['Teacher'])

    logger.info('Loading student model...')
    embedding_model = models.Transformer(
        model_names['Student'], max_seq_length=model_conf.getint('MaxSeqLen'))
    pooling = models.Pooling(embedding_model.get_word_embedding_dimension())
    student = SentenceTransformer(modules=[embedding_model, pooling])

    logger.info('Loading training set...')
    train_data = ParallelSentencesDataset(student_model=student,
                                          teacher_model=teacher)
    # ParallelSentencesDataset can't handle PosixPaths, therefore cast to string
    train_set_path = str(data_path) + '/' + datasets['TrainSet']
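    # (Added note) Equivalently, and without manual separator handling: str(data_path / datasets['TrainSet'])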
    train_data.load_data(train_set_path,
                         max_sentences=None,
                         max_sentence_length=train_conf.getint('MaxSentLen'))
    train_dataloader = DataLoader(train_data,
                                  batch_size=train_conf.getint('BatchSize'))

    # train_loss = losses.CosineSimilarityLoss(model=student)
    train_loss = losses.MSELoss(model=student)
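    # (Added note) MSELoss fits the student directly to the teacher's embedding vectors
    # produced by ParallelSentencesDataset; the commented-out CosineSimilarityLoss would
    # instead require sentence pairs with similarity scores, which this setup does not provide.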

    logger.info('Assembling evaluator')
    df = pd.read_csv(data_path / datasets['DevSet'],
                     sep='\t',
                     header=None,