Code example #1
# Imports assumed by this excerpt; teacher_model, word_embedding_model,
# pooling_model and the batch-size / file-list variables are defined earlier
# in the full script.
import gzip
import logging
import os

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, evaluation, losses
from sentence_transformers.datasets import ParallelSentencesDataset

student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

###### Read Parallel Sentences Dataset ######
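# ParallelSentencesDataset pairs each student input sentence with the teacher
# model's embedding of the source sentence; use_embedding_cache=True caches
# teacher embeddings so repeated sentences are only encoded once.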
train_data = ParallelSentencesDataset(student_model=student_model,
                                      teacher_model=teacher_model,
                                      batch_size=inference_batch_size,
                                      use_embedding_cache=True)
for train_file in train_files:
    train_data.load_data(train_file,
                         max_sentences=max_sentences_per_language,
                         max_sentence_length=train_max_sentence_length)

train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
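# MSELoss regresses the student's sentence embeddings onto the teacher's:
# the dataset above supplies the teacher embeddings as the training labels.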
train_loss = losses.MSELoss(model=student_model)

###### Evaluate cross-lingual performance on different tasks ######
evaluators = []  # evaluators collects the evaluator objects that are run periodically during training

for dev_file in dev_files:
    logging.info("Create evaluator for " + dev_file)
    src_sentences = []
    trg_sentences = []
    with gzip.open(dev_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            # Guard against malformed lines with fewer than two columns
            if len(splits) >= 2 and splits[0] != "" and splits[1] != "":
                src_sentences.append(splits[0])
                trg_sentences.append(splits[1])
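    # Hypothetical continuation (not in the original excerpt): following the
    # pattern of sentence-transformers' multilingual distillation example,
    # build an MSE evaluator (how closely student embeddings match the
    # teacher's) and a translation evaluator (how often the nearest target
    # embedding is the correct translation), and register both.
    evaluators.append(evaluation.MSEEvaluator(src_sentences, trg_sentences,
                                              name=os.path.basename(dev_file),
                                              teacher_model=teacher_model,
                                              batch_size=inference_batch_size))
    evaluators.append(evaluation.TranslationEvaluator(src_sentences, trg_sentences,
                                                      name=os.path.basename(dev_file),
                                                      batch_size=inference_batch_size))

# Training-call sketch (assumption, not in the original excerpt); the epoch,
# warmup and output-path values are illustrative only.
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator(
                      evaluators,
                      main_score_function=lambda scores: sum(scores) / len(scores)),
                  epochs=5,
                  warmup_steps=1000,
                  output_path='models/multilingual-student')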
Code example #2
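# Assumes the same imports as code example #1, plus SentencesDataset (from
# sentence_transformers) and the project-specific ClaimPairDataReader, whose
# import path depends on your repository layout. This snippet appears to
# target the pre-1.0 sentence-transformers API, where evaluators accept a
# DataLoader.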
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
# logging.info("Loading previously trained student-teacher model")
# model = SentenceTransformer('models/hindi-sxlmr-stmodel')

output_path = 'models/se-asian-sbert'

logging.info("Create dataset reader")

###### Read Dataset ######
train_file_path = 'train_southeast_asian_parallel_corpus.txt'
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
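# load_data expects one tab-separated parallel sentence pair (optionally with
# additional translations) per line of the training file.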
train_data.load_data(train_file_path)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######

# Evaluate embedding similarity on the dev set using Spearman rank correlation
logging.info("Read dev dataset")
evaluators = []
claim_pair_reader = ClaimPairDataReader()
dev_data = SentencesDataset(examples=claim_pair_reader.get_examples(split='train', language='hi'), model=model)
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='SE Asian Test Data')
evaluators.append(evaluator_sts)
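# Training-call sketch (assumption, not part of the original excerpt);
# the epoch and warmup values are illustrative only.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator_sts,
          epochs=5,
          warmup_steps=1000,
          output_path=output_path)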