Example no. 1
0
# Fetch the STS benchmark if it is not cached locally, split it into
# train/dev/test InputExample lists, then score the model on dev and test.
sts_dataset_path = 'data/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                  sts_dataset_path)

train_samples = []
dev_samples = []
test_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
        # Gold similarity scores are 0..5; rescale to the 0..1 range.
        example = InputExample(texts=[row['sentence1'], row['sentence2']],
                               label=float(row['score']) / 5.0)
        split = row['split']
        if split == 'dev':
            dev_samples.append(example)
        elif split == 'test':
            test_samples.append(example)
        else:
            train_samples.append(example)

# Evaluate on the dev split first, then on the held-out test split.
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')
model.evaluate(evaluator)

evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples,
                                                             name='sts-test')
model.evaluate(evaluator)
Example no. 2
0
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Build the training DataLoader over the STS benchmark train split.
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data, shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Dev-split evaluator; fit() runs it every `evaluation_steps` batches.
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Linear LR warm-up over the first 10% of all training steps.
warmup_steps = math.ceil(
    len(train_data) * num_epochs / train_batch_size * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Fine-tune with cosine-similarity loss, evaluating on dev along the way.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
Example no. 3
0
def test(model_save_path, sts_dataset_path, train_batch_size):
    """Score a saved SentenceTransformer on the STS test split.

    Loads the model from ``model_save_path``, builds a similarity
    evaluator over the test examples, and writes the evaluation
    results into the model directory.
    """
    samples = read_dataset(sts_dataset_path, "test")
    trained_model = SentenceTransformer(model_save_path)
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        samples, batch_size=train_batch_size, name='sts-test')
    evaluator(trained_model, output_path=model_save_path)
Example no. 4
0
                    out_features=sent_embeddings_dimension)
dan2 = models.Dense(in_features=sent_embeddings_dimension,
                    out_features=sent_embeddings_dimension)

# Pipeline: token embeddings -> word weighting -> pooling -> two dense layers.
model = SentenceTransformer(
    modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2])

# Training DataLoader over the STS benchmark train split.
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Dev-split evaluator used by fit() to pick the best checkpoint.
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'))

# Linear LR warm-up over the first 10% of all training steps.
num_epochs = 10
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Fine-tune and persist the result to model_save_path.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

##############################################################################
    # Convert the CSTS-B train split to a DataLoader ready for training.
    logging.info("Read CSTS-B  train dataset")
    train_data = SentencesDataset(
        csts_reader.get_examples('cnsd-sts-train.txt'), model)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    # Dev-split evaluator.  NOTE(review): built directly from a DataLoader
    # here, while other snippets in this file use from_input_examples —
    # presumably an older sentence-transformers API; verify before reuse.
    logging.info("Read CSTS-B dev dataset")
    dev_data = SentencesDataset(
        examples=csts_reader.get_examples('cnsd-sts-dev.txt'), model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=train_batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training.  NOTE(review): contrary to the original "skip
    # evaluation" comment, the evaluator IS passed to fit() below and runs
    # every 100 steps.
    warmup_steps = math.ceil(
        len(train_data) * num_epochs / train_batch_size *
        0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model and save the output to model_save_path.
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=100,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Example no. 6
0
                              model,
                              show_progress_bar=True)
# Cache the preprocessed training set so later runs can skip the slow
# SentencesDataset construction above.
# Fix: the original `pickle.dump(train_data, open(...))` never closed the
# file handle; use a context manager so the cache file is flushed/closed.
with open("train_data.pkl", "wb") as cache_file:
    pickle.dump(train_data, cache_file)
# train_data = pickle.load(open("train_data.pkl", "rb"))

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

# Dev set: the [train_dev_test_split[0], train_dev_test_split[1]) slice of
# all_examples, subsampled (with replacement) to 2000 examples per pass.
dev_data = SentencesDataset(
    all_examples[train_dev_test_split[0]:train_dev_test_split[1]],
    model,
    show_progress_bar=True)
dev_sampler = RandomSampler(dev_data, replacement=True, num_samples=2000)
dev_loader = DataLoader(dev_data, batch_size=16, sampler=dev_sampler)

train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(dev_loader,
                                         show_progress_bar=True,
                                         device=device)

# warmup_steps = 217206 / 5 — presumably derived from the total batch count;
# TODO(review): confirm where the 217206 constant comes from.
model.fit(train_objectives=[(train_loader, train_loss)],
          evaluator=evaluator,
          epochs=10,
          evaluation_steps=2000,
          warmup_steps=int(217206 / 5),
          output_path="train_sampled_eval4",
          optimizer_params={
              'lr': 2e-5,
              'eps': 1e-6,
              'correct_bias': False
          })
train_loss = losses.MultipleNegativesRankingLoss(model)

# Use the STS benchmark dev split as the development set.
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE):
        if row['split'] != 'dev':
            continue
        # Gold similarity scores are 0..5; rescale to the 0..1 range.
        dev_samples.append(
            InputExample(texts=[row['sentence1'], row['sentence2']],
                         label=float(row['score']) / 5.0))

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples, batch_size=train_batch_size, name='sts-dev')

# Linear LR warm-up over the first 10% of all training batches.
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model.  NOTE(review): this call appears truncated in this
# excerpt — the closing parenthesis of model.fit(...) is missing.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    # Evaluate roughly 10 times per epoch.
    evaluation_steps=int(len(train_dataloader) * 0.1),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False  #Set to True, if your GPU supports FP16 operations
Example no. 8
0
import torch
from torch.utils.data import DataLoader, RandomSampler

from scipy.spatial.distance import cdist

# Free any cached GPU memory left over from a previous run.
torch.cuda.empty_cache()

my_model_path = 'msmarco/models/test_model5'

# Start from a pretrained STS model and fine-tune it further.
model_1 = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

# Fix: reuse my_model_path instead of repeating the directory literal, so
# the model directory only has to be changed in one place.
dev_dataloader = torch.load(os.path.join(my_model_path, 'dev_dataloader.pth'))
train_dataloader = torch.load(os.path.join(my_model_path,
                                           'train_dataloader.pth'))

# Run both a binary and a continuous similarity evaluator; the combined
# score reported by fit() is taken from the first (binary) one.
evaluator1 = BinaryEmbeddingSimilarityEvaluator(dev_dataloader)
evaluator2 = EmbeddingSimilarityEvaluator(dev_dataloader)
evaluator = SequentialEvaluator([evaluator1, evaluator2],
                                main_score_function=lambda scores: scores[0])

optimizer_class = transformers.AdamW
optimizer_params = {'lr': 2e-6, 'eps': 1e-6, 'correct_bias': False}
train_loss = losses.CosineSimilarityLoss(model=model_1)

num_epochs = 100
# 10% of the training data for linear LR warm-up.
warmup_steps = math.ceil(
    len(train_dataloader.dataset) * num_epochs /
    train_dataloader.batch_size * 0.1)

model_1.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          steps_per_epoch=1000,
          warmup_steps=warmup_steps,
          optimizer_class=optimizer_class,