Beispiel #1
0
    def train(self, train_df, eval_df):
        """Fine-tune ``self.model`` on sentence pairs with a cosine-similarity loss.

        Both dataframes are validated before any processing starts, so a
        missing column is reported before either dataframe is modified and
        before any training object is built.

        When ``self.args.do_lower_case`` is set, the 'text_a' and 'text_b'
        columns of both dataframes are lower-cased in place.

        :param train_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :param eval_df: dataframe with columns 'text_a', 'text_b', 'labels'
        :return: None
        :raises KeyError: if a required column is missing from ``train_df``
            or ``eval_df``
        """
        required_columns = ("text_a", "text_b", "labels")

        # Validate each dataframe against its own columns; the evaluation
        # check must look at eval_df, not train_df.
        if not all(column in train_df.columns for column in required_columns):
            raise KeyError(
                'Training data processing - Required columns not found!')
        if not all(column in eval_df.columns for column in required_columns):
            raise KeyError(
                'Evaluation data processing - Required columns not found!')

        # format training data
        if self.args.do_lower_case:
            train_df.loc[:, 'text_a'] = train_df['text_a'].str.lower()
            train_df.loc[:, 'text_b'] = train_df['text_b'].str.lower()

        # One InputExample per row: the row position is the id, the two texts
        # are cast to str and the label to float.
        train_examples = [
            InputExample(str(i), [text_a, text_b], label)
            for i, (text_a, text_b, label) in enumerate(
                zip(
                    train_df["text_a"].astype(str),
                    train_df["text_b"].astype(str),
                    train_df["labels"].astype(float),
                ))
        ]

        # format evaluation data
        if self.args.do_lower_case:
            eval_df.loc[:, 'text_a'] = eval_df['text_a'].str.lower()
            eval_df.loc[:, 'text_b'] = eval_df['text_b'].str.lower()

        evaluator = evaluation.EmbeddingSimilarityEvaluator(
            list(eval_df["text_a"]),
            list(eval_df["text_b"]),
            list(eval_df["labels"]),
            batch_size=self.args.eval_batch_size)

        # Define train dataset, the dataloader and the train loss
        train_dataloader = DataLoader(train_examples,
                                      shuffle=True,
                                      batch_size=self.args.train_batch_size)
        train_loss = losses.CosineSimilarityLoss(self.model)

        # Tune the model; every hyper-parameter is read from self.args.
        self.model.fit(
            train_objectives=[(train_dataloader, train_loss)],
            epochs=self.args.num_train_epochs,
            warmup_steps=self.args.warmup_steps,
            optimizer_params={'lr': self.args.learning_rate},
            weight_decay=self.args.weight_decay,
            evaluator=evaluator,
            evaluation_steps=self.args.evaluate_during_training_steps,
            max_grad_norm=self.args.max_grad_norm,
            output_path=self.args.best_model_dir,
            show_progress_bar=self.args.show_progress_bar)
                    'scores': []
                }

                fIn = zip.open(filepath)
                for line in io.TextIOWrapper(fIn, 'utf8'):
                    sent1, sent2, score = line.strip().split("\t")
                    score = float(score)
                    sts_data[filename]['sentences1'].append(sent1)
                    sts_data[filename]['sentences2'].append(sent2)
                    sts_data[filename]['scores'].append(score)

# Wrap every entry of sts_data in its own similarity evaluator, named after
# the entry's key, and register it in the evaluators list.
for filename, data in sts_data.items():
    test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
        data['sentences1'], data['sentences2'], data['scores'],
        name=filename,
        batch_size=inference_batch_size,
        show_progress_bar=False,
    )
    evaluators.append(test_evaluator)

# Train the model
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluation.SequentialEvaluator(
                      evaluators,
                      main_score_function=lambda scores: np.mean(scores)),
                  epochs=num_epochs,
                  warmup_steps=num_warmup_steps,
                  evaluation_steps=num_evaluation_steps,
                  output_path=output_path,
                  save_best_model=True,
Beispiel #3
0
# Load the CDS train/test splits (tab-separated) and divide the relatedness
# scores by 5 (presumably mapping a 0-5 scale onto [0, 1] -- confirm against
# the corpus description).
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in
# 2.0; on newer pandas the equivalent argument is `on_bad_lines='skip'`.
corpus = pd.read_csv('CDSCorpus/CDS_train.csv', sep='\t', error_bad_lines=False, encoding='utf-8')  # ,nrows=1000
corpus['relatedness_score'] = corpus['relatedness_score'].div(5)
corpus_test = pd.read_csv('CDSCorpus/CDS_test.csv', sep='\t', error_bad_lines=False, encoding='utf-8')
corpus_test['relatedness_score'] = corpus_test['relatedness_score'].div(5)
# label2int = {"CONTRADICTION": 0, "ENTAILMENT": 1, "NEUTRAL": 2}

# Evaluation inputs are taken column-wise from the test split; this yields
# the same lists as appending row by row, without materialising every row.
s1 = corpus_test['sentence_A'].tolist()
s2 = corpus_test['sentence_B'].tolist()
sc = corpus_test['relatedness_score'].tolist()
# sc = [label2int[label] for label in corpus_test['entailment_judgment']]

evaluator = evaluation.EmbeddingSimilarityEvaluator(s1, s2, sc)


# roberta_large requires more gpu memory
# NOTE(review): the Hugging Face hub id is 'roberta-base' (hyphen);
# 'roberta_base' presumably only resolves as a local directory -- confirm.
word_embedding_model = models.Transformer('roberta_base', max_seq_length=512)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# One training example per row of the train split. s3 collects every
# training sentence, interleaved as A, B, A, B, ...
train_examples = []
test_examples = []
s3 = []
for sentence_a, sentence_b, score in zip(
        corpus['sentence_A'].tolist(),
        corpus['sentence_B'].tolist(),
        corpus['relatedness_score'].tolist()):
    train_examples.append(InputExample(texts=[sentence_a, sentence_b], label=score))
    s3.extend((sentence_a, sentence_b))

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=8)
Beispiel #4
0
    print(f"Dataset length: {len(dataset)}")
    return dataset


#%%
# Generate one dataset per grouping (train and test).
train_cluster_dataset = generate_dataset(grouped)
test_cluster_dataset = generate_dataset(grouped_test)

# train_dataset, test_dataset = train_test_split(dataset, train_size=0.8)
# No random split is applied: the generated datasets are used as they are.
train_dataset = train_cluster_dataset
test_dataset = test_cluster_dataset

train_dataloader = DataLoader(train_cluster_dataset, batch_size=16, shuffle=True)

# The evaluator receives three parallel lists taken from the test examples:
# first text, second text and label.
eval_first_texts = [example.texts[0] for example in test_dataset]
eval_second_texts = [example.texts[1] for example in test_dataset]
eval_labels = [example.label for example in test_dataset]
evaluator = evaluation.EmbeddingSimilarityEvaluator(
    eval_first_texts, eval_second_texts, eval_labels)

# %%
train_loss = losses.CosineSimilarityLoss(model)

#%%
output_dir = args.output
# Make sure the output directory exists. `exist_ok=True` tolerates a
# directory left over from an earlier run, and missing parent directories are
# created as well. Real failures (permissions, the path being a regular file)
# now surface here instead of being hidden by a bare `except`.
os.makedirs(output_dir, exist_ok=True)

try:
    os.mkdir(output_dir + '/best')
except:
# Load the training pairs from file and wrap them for MSE-loss training.
train_data.load_data(train_file_path)

train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
train_loss = losses.MSELoss(model=model)

###### Load dev sets ######

# The dev examples come from the claim-pair reader.
# NOTE(review): the dev set is read with split='train' -- confirm this is intended.
logging.info("Read dev dataset")
claim_pair_reader = ClaimPairDataReader()
dev_examples = claim_pair_reader.get_examples(split='train', language='hi')
dev_data = SentencesDataset(examples=dev_examples, model=model)
# dev_file_path = 'test_southeast_asian_parallel_corpus.txt'
# dev_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
# dev_data.load_data(dev_file_path)
dev_dataloader = DataLoader(dev_data, batch_size=train_batch_size, shuffle=False)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='SE Asian Test Data')
evaluators = [evaluator_sts]

# Train the model. The sequential evaluator reports the score of the last
# evaluator in the list as the main score.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluation.SequentialEvaluator(
        evaluators,
        main_score_function=lambda scores: scores[-1],
    ),
    epochs=2,
    warmup_steps=10000,
    evaluation_steps=1000,
    scheduler='warmupconstant',
    optimizer_params={'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False},
    output_path=output_path,
    save_best_model=True,
)
            t1 = load_dict['images'][i]['sentences'][j]['raw'].lower()
            t2 = load_dict['images'][i]['sentences'][k]['raw'].lower()
            train_examples.append(InputExample(texts=[t1, t2], label=1))
#     for k in range(i + 1, len(load_dict['images'])):
#         t1 = load_dict['images'][i]['sentences'][0]['raw'].lower()
#         t2 = load_dict['images'][k]['sentences'][0]['raw'].lower()
#         train_examples.append(InputExample(texts=[t1, t2], label=0))

# Wrap the collected examples for training with a contrastive loss.
train_dataset = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
train_loss = losses.ContrastiveLoss(model=model)

# Small hand-written evaluation set: sentences1[i] is paired with
# sentences2[i] and scores[i] is the score given for that pair.
sentences1 = [
    'This list contains the first column',
    'With your sentences',
    'You want your model to evaluate on',
]
sentences2 = [
    'Sentences contains the other column',
    'The evaluator matches sentences1[i] with sentences2[i]',
    'Compute the cosine similarity and compares it to scores[i]',
]
scores = [0.3, 0.6, 0.2]
evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)

# Tune the model, running the evaluator every 5 steps.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=evaluator,
    evaluation_steps=5,
    epochs=10,
    warmup_steps=100,
    output_path="./flickerbertmodel/",
)
Beispiel #7
0
# Load every training file into the shared training dataset.
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data, batch_size=train_batch_size, shuffle=True)
train_loss = losses.MSELoss(model=model)


###### Load dev sets ######

# First dev evaluator: embedding similarity on the STS2017.en-de file
# (columns 0 and 1 hold the sentences, column 2 the score).
logging.info("Read STS2017.en-de dataset")
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
sts_examples = sts_reader.get_examples('STS2017.en-de.txt.gz')
dev_data = SentencesDataset(examples=sts_examples, model=model)
dev_dataloader = DataLoader(dev_data, batch_size=train_batch_size, shuffle=False)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de')
evaluators = [evaluator_sts]


# Second dev evaluator: MSE evaluation on the XNLI en-de parallel file,
# loaded through the student/teacher parallel-sentences dataset.
logging.info("Read XNLI.en-de dataset")
xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
xnli_reader.load_data('../datasets/xnli-en-de.txt.gz')

xnli_dataloader = DataLoader(xnli_reader, batch_size=train_batch_size, shuffle=False)
xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de')
evaluators.append(xnli_mse)



# Train the model
# The below all apply to the de example - how does one evaluate the model outside this single example???
###### Load dev sets ######

# Dev evaluator: embedding similarity on the Hindi STS csv under ./data/
# (columns 0 and 1 hold the sentences, column 2 the score).
logging.info("Read data/hindi_sbert_sts_train.csv dataset")
sts_reader = readers.STSDataReader(
    './data/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
hindi_examples = sts_reader.get_examples('hindi_sbert_sts_train.csv')
dev_data = SentencesDataset(examples=hindi_examples, model=model)
dev_dataloader = DataLoader(
    dev_data, batch_size=train_batch_size, shuffle=False)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    dev_dataloader, name='Hindi_Headlines_en_hi_sbert')
evaluators = [evaluator_sts]

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(
              evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=20,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
          save_best_model=True,
          optimizer_params={
              'lr': 2e-5,
              'eps': 1e-6,
              'correct_bias': False