word_embedding_model = models.Transformer(student_model_name,
                                          max_seq_length=max_seq_length)
# Apply mean pooling to get one fixed-size sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(
    modules=[word_embedding_model, pooling_model])
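
# teacher_model is used below but not defined in this snippet; a minimal sketch,
# assuming teacher_model_name (a placeholder, parallel to student_model_name above)
# points at a pretrained SentenceTransformer checkpoint:
teacher_model = SentenceTransformer(teacher_model_name)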

###### Read Parallel Sentences Dataset ######
train_data = ParallelSentencesDataset(student_model=student_model,
                                      teacher_model=teacher_model,
                                      batch_size=inference_batch_size,
                                      use_embedding_cache=True)
for train_file in train_files:
    train_data.load_data(train_file,
                         max_sentences=max_sentences_per_language,
                         max_sentence_length=train_max_sentence_length)

train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MSELoss(model=student_model)

#### Evaluate cross-lingual performance on different tasks #####
evaluators = []  # evaluators holds the evaluator objects that are called periodically during training

for dev_file in dev_files:
    logging.info("Create evaluator for " + dev_file)
    src_sentences = []
    trg_sentences = []
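    # The snippet is cut here; a hedged sketch of how the loop typically continues,
    # assuming each dev_file is a tab-separated source/target parallel text file:
    with open(dev_file, encoding='utf8') as f_in:
        for line in f_in:
            splits = line.strip().split('\t')
            if len(splits) >= 2 and splits[0] != '' and splits[1] != '':
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])

    # Compare student embeddings of the target sentences against teacher embeddings
    # of the source sentences and track the MSE during training.
    evaluators.append(evaluation.MSEEvaluator(src_sentences,
                                              trg_sentences,
                                              name=dev_file,
                                              teacher_model=teacher_model,
                                              batch_size=inference_batch_size))
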
Example #2
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# logging.info("Loading previously trained student-teacher model")
# model = SentenceTransformer('models/hindi-sxlmr-stmodel')

output_path = 'models/se-asian-sbert'

logging.info("Create dataset reader")

###### Read Dataset ######
train_file_path = 'train_southeast_asian_parallel_corpus.txt'
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
train_data.load_data(train_file_path)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######

# Evaluate on Hindi claim pairs from ClaimPairDataReader (the commented-out lines below show the parallel-corpus alternative)
logging.info("Read dev dataset")
evaluators = []
claim_pair_reader = ClaimPairDataReader()
dev_data = SentencesDataset(examples=claim_pair_reader.get_examples(split='train', language='hi'), model=model)
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
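
# The listing stops here; a hedged sketch of wiring the dev set in as an evaluator,
# assuming the claim pairs carry similarity labels (the evaluator name is a placeholder):
evaluator_claims = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='hi-claim-pairs')
evaluators.append(evaluator_claims)
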
Example #3
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

output_path = "output/make-multilingual-" + datetime.now().strftime(
    "%Y-%m-%d_%H-%M-%S")

logging.info("Create dataset reader")

###### Read Dataset ######
train_data = ParallelSentencesDataset(student_model=model,
                                      teacher_model=teacher_model)
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

# The evaluation below follows the original en-de example; to evaluate other
# language pairs, swap in your own STS-formatted dev file (here: a Hindi STS file).
###### Load dev sets ######

# Evaluate using Spearman rank correlation
logging.info("Read data/hindi_sbert_sts_train.csv dataset")
evaluators = []
sts_reader = readers.STSDataReader('./data/',
                                   s1_col_idx=0,
                                   s2_col_idx=1,
                                   score_col_idx=2)
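
# The listing is cut here; a hedged continuation that mirrors Example #4, assuming
# the dev file named in the log message above and a placeholder evaluator name:
dev_data = SentencesDataset(examples=sts_reader.get_examples('hindi_sbert_sts_train.csv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluators.append(evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='hindi-sts'))
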
Example #4
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

output_path = "output/make-multilingual-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

logging.info("Create dataset reader")


###### Read Dataset ######
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)


###### Load dev sets ######

# Test on STS 2017.en-de dataset using Spearman rank correlation
logging.info("Read STS2017.en-de dataset")
evaluators = []
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de')
evaluators.append(evaluator_sts)
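
# The listing ends here; a minimal sketch of the usual training call, assuming
# num_epochs, num_warmup_steps and evaluation_steps are defined elsewhere:
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators),
          epochs=num_epochs,
          warmup_steps=num_warmup_steps,
          evaluation_steps=evaluation_steps,
          output_path=output_path)
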
Example #5
    logger.info('Loading teacher model...')
    teacher = SentenceTransformer(model_names['Teacher'])

    logger.info('Loading student model...')
    embedding_model = models.Transformer(
        model_names['Student'], max_seq_length=model_conf.getint('MaxSeqLen'))
    pooling = models.Pooling(embedding_model.get_word_embedding_dimension())
    student = SentenceTransformer(modules=[embedding_model, pooling])

    logger.info('Loading training set...')
    train_data = ParallelSentencesDataset(student_model=student,
                                          teacher_model=teacher)
    # ParallelSentencesDataset can't handle PosixPaths, therefore cast to string
    train_set_path = str(data_path) + '/' + datasets['TrainSet']
    train_data.load_data(train_set_path,
                         max_sentences=None,
                         max_sentence_length=train_conf.getint('MaxSentLen'))
    train_dataloader = DataLoader(train_data,
                                  batch_size=train_conf.getint('BatchSize'))

    # train_loss = losses.CosineSimilarityLoss(model=student)
    train_loss = losses.MSELoss(model=student)

    logger.info('Assembling evaluator')
    df = pd.read_csv(data_path / datasets['DevSet'],
                     sep='\t',
                     header=None,
                     quoting=3)
    dev_mse_evaluator = evaluation.MSEEvaluator(
        df.iloc[:, 0],
        df.iloc[:, 1],