######## Start the extension of the teacher model to multiple languages ######## logging.info("Load teacher model") teacher_model = SentenceTransformer(teacher_model_name) logging.info("Create student model from scratch") word_embedding_model = models.Transformer(student_model_name, max_seq_length=max_seq_length) # Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling( word_embedding_model.get_word_embedding_dimension()) student_model = SentenceTransformer( modules=[word_embedding_model, pooling_model]) ###### Read Parallel Sentences Dataset ###### train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=True) for train_file in train_files: train_data.load_data(train_file, max_sentences=max_sentences_per_language, max_sentence_length=train_max_sentence_length) train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) train_loss = losses.MSELoss(model=student_model) #### Evaluate cross-lingual performance on different tasks ##### evaluators = [ ] #evaluators has a list of different evaluator classes we call periodically
# Mean-pooling head over the word-embedding dimension: exactly one pooling mode
# is enabled, so each sentence maps to a single fixed-size vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Alternative: resume from a previously distilled checkpoint instead of training fresh.
# logging.info("Loading previously trained student-teacher model")
# model = SentenceTransformer('models/hindi-sxlmr-stmodel')

output_path = 'models/se-asian-sbert'

logging.info("Create dataset reader")

###### Read Dataset ######
# Parallel corpus: teacher embeds the source side, student mimics it (MSE below).
train_file_path = 'train_southeast_asian_parallel_corpus.txt'
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
train_data.load_data(train_file_path)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######
# Dev data comes from ClaimPairDataReader (Hindi claim pairs), not an STS file.
# NOTE(review): split='train' is used as the dev set here — confirm this is intentional.
logging.info("Read dev dataset")
evaluators = []
claim_pair_reader = ClaimPairDataReader()
dev_data = SentencesDataset(examples=claim_pair_reader.get_examples(split='train', language='hi'), model=model)
# Alternative dev set: held-out slice of the parallel corpus.
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
# Apply mean pooling to get one fixed sized sentence vector pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), pooling_mode_mean_tokens=True, pooling_mode_cls_token=False, pooling_mode_max_tokens=False) model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) output_path = "output/make-multilingual-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") logging.info("Create dataset reader") ###### Read Dataset ###### train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model) for train_file in train_files: train_data.load_data(train_file) train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) train_loss = losses.MSELoss(model=model) ###### Load dev sets ###### # Test on STS 2017.en-de dataset using Spearman rank correlation logging.info("Read STS2017.en-de dataset") evaluators = [] sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2) dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model) dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
pca.fit(pca_embeddings) #Add Dense layer to teacher that projects the embeddings down to the student embedding size dense = models.Dense(in_features=teacher_model.get_sentence_embedding_dimension(), out_features=student_model.get_sentence_embedding_dimension(), bias=False, activation_function=torch.nn.Identity()) dense.linear.weight = torch.nn.Parameter(torch.tensor(pca.components_)) teacher_model.add_module('dense', dense) logging.info("Teacher Performance with {} dimensions:".format(teacher_model.get_sentence_embedding_dimension())) dev_evaluator_sts(teacher_model) # We train the student_model such that it creates sentence embeddings similar to the embeddings from the teacher_model # For this, we need a large set of sentences. These sentences are embedded using the teacher model, # and the student tries to mimic these embeddings. It is the same approach as used in: https://arxiv.org/abs/2004.09813 train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=inference_batch_size, use_embedding_cache=False) train_data.add_dataset([[sent] for sent in train_sentences_nli], max_sentence_length=256) train_data.add_dataset([[sent] for sent in train_sentences_wikipedia], max_sentence_length=256) train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) train_loss = losses.MSELoss(model=student_model) # We create an evaluator, that measure the Mean Squared Error (MSE) between the teacher and the student embeddings dev_sentences = dev_sentences_nli + dev_sentences_wikipedia dev_evaluator_mse = evaluation.MSEEvaluator(dev_sentences, dev_sentences, teacher_model=teacher_model) # Train the student model to imitate the teacher student_model.fit(train_objectives=[(train_dataloader, train_loss)], evaluator=evaluation.SequentialEvaluator([dev_evaluator_sts, dev_evaluator_mse]), epochs=1, warmup_steps=1000,
logger.info('Downloading required datasets..') download_datasets(datasets) # print(config.sections()) logger.info('Loading teacher model...') teacher = SentenceTransformer(model_names['Teacher']) logger.info('Loading student model...') embedding_model = models.Transformer( model_names['Student'], max_seq_length=model_conf.getint('MaxSeqLen')) pooling = models.Pooling(embedding_model.get_word_embedding_dimension()) student = SentenceTransformer(modules=[embedding_model, pooling]) logger.info('Loading training set...') train_data = ParallelSentencesDataset(student_model=student, teacher_model=teacher) # ParallelSentencesDataset can't handle PosixPaths, therefore cast to string train_set_path = str(data_path) + '/' + datasets['TrainSet'] train_data.load_data(train_set_path, max_sentences=None, max_sentence_length=train_conf.getint('MaxSentLen')) train_dataloader = DataLoader(train_data, batch_size=train_conf.getint('BatchSize')) #train_loss = CosineSimilarityLoss(model=student_model) train_loss = losses.MSELoss(model=student) logging.info('Assembling evaluator') df = pd.read_csv(data_path / datasets['DevSet'], sep='\t', header=None,