def pretrained_model_score(self, model_name, expected_score):
        model = SentenceTransformer(model_name)
        sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

        if not os.path.exists(sts_dataset_path):
            util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz',
                          sts_dataset_path)

        train_samples = []
        dev_samples = []
        test_samples = []
        with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                score = float(
                    row['score']) / 5.0  # Normalize score to range 0 ... 1
                inp_example = InputExample(
                    texts=[row['sentence1'], row['sentence2']], label=score)

                if row['split'] == 'dev':
                    dev_samples.append(inp_example)
                elif row['split'] == 'test':
                    test_samples.append(inp_example)
                else:
                    train_samples.append(inp_example)

        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            test_samples, name='sts-test')

        score = model.evaluate(evaluator) * 100
        print(model_name,
              "{:.2f} vs. exp: {:.2f}".format(score, expected_score))
        assert score > expected_score or abs(score - expected_score) < 0.1
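
# Minimal usage sketch: in the original suite this presumably runs inside a
# unittest-style class; the model name and expected score are illustrative
# placeholders, not values from the source.
import unittest

class PretrainedSTSbTest(unittest.TestCase):
    pretrained_model_score = pretrained_model_score  # reuse the helper above

    def test_stsb_distilbert(self):
        self.pretrained_model_score('stsb-distilbert-base', 85.0)
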
def evaluate_stsb_test(self, model, expected_score):
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        self.test_samples, name='sts-test')
    score = model.evaluate(evaluator) * 100
    print("STS-Test Performance: {:.2f} vs. exp: {:.2f}".format(
        score, expected_score))
    assert score > expected_score or abs(score - expected_score) < 0.1
def create_posts_ranking(fl, data_dir, model, validate=None, is_test=False):
    train_posts_ranking = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        for obj in data:
            answers = obj['answers']
            filtered_answers = []
            votes = 1000000
            # Keep answers whose vote count strictly decreases, i.e. each
            # kept answer has fewer votes than the previously kept one.
            for answer in answers:
                my_votes = answer['a_votes']
                if my_votes < votes:
                    votes = my_votes
                    filtered_answers.append(answer)

            if len(filtered_answers) > 1:
                # Label decreases linearly from 1.0 (top answer) to 1/n (last).
                rank = len(filtered_answers)
                for answer in filtered_answers:
                    dist = rank / len(filtered_answers)
                    disbn.append(answer['a_rank'])
                    rank = rank - 1
                    train_posts_ranking.append(
                        InputExample(texts=[obj['q_text'], answer['a_text']],
                                     label=dist))

    random.shuffle(train_posts_ranking)

    print("data size " + str(len(train_posts_ranking)))

    if is_test:
        return train_posts_ranking

    if max_size:
        train_posts_ranking = train_posts_ranking[:max_size]

    evaluator = None
    if posts_rank_str == validate:
        train_posts_ranking, dev_posts_ranking = train_test_split(
            train_posts_ranking, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_posts_ranking, name='posts ranking')

    warmup_steps = math.ceil(
        len(train_posts_ranking) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_posts_ranking = SentencesDataset(train_posts_ranking,
                                                model=model)
    train_dataloader_posts_ranking = DataLoader(train_data_posts_ranking,
                                                shuffle=True,
                                                batch_size=batch_size)
    train_loss_posts_ranking = losses.CosineSimilarityLoss(model=model)

    print('R: Number of training examples: ', len(train_posts_ranking))

    global evaluation_steps
    # Note: dividing by 0.1 makes evaluation_steps 10x the example count,
    # so evaluation effectively only runs at each epoch end.
    evaluation_steps = math.ceil(len(train_posts_ranking) / 0.1)

    return train_dataloader_posts_ranking, train_loss_posts_ranking, evaluator, warmup_steps
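
# Minimal sketch of consuming the four returned objects, assuming the
# module-level model, data_dir, num_epochs and posts_rank_str used above.
# The training file name is a hypothetical placeholder.
dataloader, loss, evaluator, warmup = create_posts_ranking(
    'stackoverflow_data_ranking_v2_training.json', data_dir, model,
    validate=posts_rank_str)
model.fit(train_objectives=[(dataloader, loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup)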
Example 4
def fit_model(trial, train_fold, val_fold, fold_index):
    print("######################")
    print("start of fold_index:", fold_index)
    print("len(train_fold)", len(train_fold))
    print("len(val_fold)", len(val_fold))

    batch_size = trial.suggest_int("train_batch_size", 4, 50)
    num_epochs = trial.suggest_int("num_epochs", 1, 4)
    lr = trial.suggest_uniform("lr", 2e-6, 2e-4)
    eps = trial.suggest_uniform("eps", 1e-7, 1e-5)
    weight_decay = trial.suggest_uniform("weight_decay", 0.001, 0.1)
    warmup_steps_mul = trial.suggest_uniform("warmup_steps_mul", 0.1, 0.5)

    model = SentenceTransformer(model_name)

    # create train dataloader
    # train_sentece_dataset = SentencesDataset(train_fold, model=model) # this is deprecated
    train_dataloader = DataLoader(train_fold,
                                  shuffle=True,
                                  batch_size=batch_size)

    # define loss
    train_loss = losses.CosineSimilarityLoss(model=model)

    warmup_steps = math.ceil(
        len(train_fold) * num_epochs / batch_size * warmup_steps_mul)

    # Train the model
    model.fit(
        train_objectives=[(train_dataloader, train_loss)],
        evaluator=None,
        epochs=num_epochs,
        warmup_steps=warmup_steps,
        optimizer_params={
            "lr": lr,
            "eps": eps,
            "correct_bias": False
        },
        weight_decay=weight_decay,
    )

    # evaluate the model
    val_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        val_fold, name="val_set", main_similarity=SimilarityFunction.COSINE)
    result = val_evaluator(model)

    print("######################################################")
    print("test result:", result)
    print("######################################################")

    if math.isnan(result):
        result = 0.0

    return result
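
# The study driver is not part of this snippet; a sketch of one way to wire
# fit_model into Optuna, assuming folds is a prepared list of (train, val)
# pairs of InputExample lists.
import optuna

def objective(trial):
    # Average the validation score across folds.
    scores = [fit_model(trial, train_fold, val_fold, i)
              for i, (train_fold, val_fold) in enumerate(folds)]
    return sum(scores) / len(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print("best params:", study.best_params)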
def create_hierarchy_examples(fl,
                              data_dir,
                              model,
                              validate=None,
                              is_test=False):
    train_hierarchy_samples = []
    disbn = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        max_distance = 0
        for obj in data:
            if obj['distance'] > max_distance:
                max_distance = obj['distance']
        for obj in data:
            # flip the meaning of similarity, since the more distant the two classes, the closer to 0 it should be
            dist = (max_distance - obj['distance']) / (max_distance - 1)
            train_hierarchy_samples.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))
            disbn.append(obj['distance'])
    random.shuffle(train_hierarchy_samples)
    train_hierarchy_samples = train_hierarchy_samples[:100000]
    disbn = disbn[:100000]

    if max_size:
        train_hierarchy_samples = train_hierarchy_samples[:max_size]
        disbn = disbn[:max_size]

    if is_test:
        return train_hierarchy_samples

    evaluator = None

    if hierarchy_str == validate:
        train_hierarchy_samples, dev_hierarchy_samples = train_test_split(
            train_hierarchy_samples, stratify=disbn, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_hierarchy_samples, name='hierarchy')

    warmup_steps = math.ceil(
        len(train_hierarchy_samples) * num_epochs / batch_size *
        0.1)  # 10% of train data for warm-up

    train_data_hierarchy = SentencesDataset(train_hierarchy_samples,
                                            model=model)
    train_dataloader_hierarchy = DataLoader(train_data_hierarchy,
                                            shuffle=True,
                                            batch_size=batch_size)
    train_loss_hierarchy = losses.CosineSimilarityLoss(model=model)

    print('H: Number of training examples: ', len(train_hierarchy_samples))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_hierarchy_samples) / 0.1)
    return train_dataloader_hierarchy, train_loss_hierarchy, evaluator, warmup_steps
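
# Small worked example of the flip-and-normalize label formula above
# (the distances are illustrative, not from the dataset):
max_distance = 5
for d in (1, 3, 5):
    print(d, (max_distance - d) / (max_distance - 1))
# 1 -> 1.0 (closest classes), 3 -> 0.5, 5 -> 0.0 (most distant)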
def create_train_usage(fl, data_dir, model, validate=None, is_test=False):
    train_usage = []
    with open(os.path.join(data_dir, fl)) as f:
        data = json.load(f)
        min_d = 10000000
        max_d = 0
        for obj in data:
            dist = obj['distance']
            if dist < min_d:
                min_d = dist
            if dist > max_d:
                max_d = dist
        for obj in data:
            dist = (max_d - obj['distance']) / (max_d - min_d)
            train_usage.append(
                InputExample(texts=[obj['class1'], obj['class2']], label=dist))

    random.shuffle(train_usage)

    if is_test:
        return train_usage

    if max_size:
        train_usage = train_usage[:max_size]

    evaluator = None

    if usage_str == validate:
        train_usage, dev_usage = train_test_split(train_usage, test_size=0.1)
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            dev_usage, name='usage')
    warmup_steps = math.ceil(len(train_usage) * num_epochs / batch_size *
                             0.1)  # 10% of train data for warm-up

    train_data_usage = SentencesDataset(train_usage, model=model)
    train_dataloader_usage = DataLoader(train_data_usage,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_usage = losses.CosineSimilarityLoss(model=model)

    print('U: Number of training examples: ', len(train_usage))

    global evaluation_steps
    evaluation_steps = math.ceil(len(train_usage) / 0.1)

    return train_dataloader_usage, train_loss_usage, evaluator, warmup_steps
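
# sentence-transformers accepts several (dataloader, loss) pairs in a single
# fit call, so the three builders above can be combined into round-robin
# multi-task training. A sketch; all three file names are hypothetical.
dl_rank, loss_rank, evaluator, warmup = create_posts_ranking(
    'posts_ranking_train.json', data_dir, model, validate=posts_rank_str)
dl_hier, loss_hier, _, _ = create_hierarchy_examples(
    'hierarchy_train.json', data_dir, model)
dl_usage, loss_usage, _, _ = create_train_usage(
    'usage_train.json', data_dir, model)
model.fit(train_objectives=[(dl_rank, loss_rank),
                            (dl_hier, loss_hier),
                            (dl_usage, loss_usage)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=evaluation_steps,
          warmup_steps=warmup)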
Example 7
def test_model(model_dir, languages, crossings, params):
    # load all data incl. crossings
    test_data_stsb_cross = load_test_data(languages, crossings)
    assert len(test_data_stsb_cross) == 4
    assert len(test_data_stsb_cross[0]) == 1379

    lang_1_stsb = test_data_stsb_cross[0]
    lang_2_stsb = test_data_stsb_cross[1]
    assert len(lang_1_stsb) == len(lang_2_stsb) == 1379

    # test data of crossings only
    lang_cross_stsb = list(
        itertools.chain.from_iterable(test_data_stsb_cross[2:]))
    assert len(lang_cross_stsb) == 1379 * 2

    # test data of lang1, lang2 and crossings
    lang_all_stsb = list(itertools.chain.from_iterable(test_data_stsb_cross))
    assert len(lang_all_stsb) == 1379 * 4

    # convert to sentence_transformers datasets
    lang_1_stsb = to_input_example(lang_1_stsb)
    lang_2_stsb = to_input_example(lang_2_stsb)
    lang_cross_stsb = to_input_example(lang_cross_stsb)
    lang_all_stsb = to_input_example(lang_all_stsb)

    # load model from dir
    model = SentenceTransformer(model_dir)

    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        lang_1_stsb,
        name="test_lang_1_stsb",
        main_similarity=SimilarityFunction.COSINE)
    result_lang_1_stsb = test_evaluator(model)

    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        lang_2_stsb,
        name="test_lang_2_stsb",
        main_similarity=SimilarityFunction.COSINE)
    result_lang_2_stsb = test_evaluator(model)

    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        lang_cross_stsb,
        name="test_lang_cross_stsb",
        main_similarity=SimilarityFunction.COSINE,
    )
    result_lang_cross_stsb = test_evaluator(model)

    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        lang_all_stsb,
        name="test_lang_all_stsb",
        main_similarity=SimilarityFunction.COSINE,
    )
    result_lang_all_stsb = test_evaluator(model)

    params["lang_1_test_result_spearman"] = result_lang_1_stsb
    params["lang_2_test_result_spearman"] = result_lang_2_stsb
    params["lang_cross_test_result_spearman"] = result_lang_cross_stsb
    params["lang_all_test_result_spearman"] = result_lang_all_stsb
    params["languages"] = languages

    print("test_results:", params)

    with open(os.path.join(model_dir, "test_results.json"), "w") as outfile:
        json.dump(params, outfile)
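
# Usage sketch: the directory, language codes and crossing labels are
# placeholders and must match whatever load_test_data expects.
params = {"model_name": "xlm-r-stsb"}  # hypothetical run metadata
test_model("output/xlm-r-stsb",        # hypothetical model directory
           languages=["en", "de"],
           crossings=["en-de", "de-en"],
           params=params)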
            else:
                validation_examples.append(sample)

    train_dataloader = DataLoader(train_examples,
                                  shuffle=True,
                                  batch_size=batch_size)
    return train_dataloader, validation_examples


if __name__ == "__main__":
    args = parser.parse_args()
    model, train_loss = construct_model(args.base_model, args.encoder_style)
    train_dataloader, validation_examples = load_data(
        args.training_data_file, batch_size=args.batch_size)
    if args.encoder_style == BIENCODER:
        evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
            validation_examples)
    else:
        evaluator = CECorrelationEvaluator.from_input_examples(
            validation_examples)
    if args.encoder_style == BIENCODER:
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  epochs=args.num_epochs,
                  warmup_steps=args.warmup_steps,
                  evaluator=evaluator,
                  evaluation_steps=500,
                  optimizer_params={'lr': args.lr},
                  output_path=args.output_training_directory,
                  use_amp=True)
    elif args.encoder_style == CROSSENCODER:
        model.fit(train_dataloader=train_dataloader,
                  evaluator=evaluator,
Example 9
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(
    len(train_data) * num_epochs / train_batch_size *
    0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
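
# What typically follows in this family of scripts (not shown here): reload
# the saved model and score the held-out test split. A minimal sketch:
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-test.csv'), name='sts-test')
test_evaluator(model, output_path=model_save_path)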
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=len(label2int))


#Read STSbenchmark dataset and use it as development set
logging.info("Read STSbenchmark dev dataset")
dev_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'dev':
            score = float(row['score']) / 5.0 #Normalize score to range 0 ... 1
            dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score))

dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev')

# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))



# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
    os.path.join(script_folder_path, args.sts_corpus))
for idx, target in enumerate(target_eval_files):
    output_filename_eval = os.path.join(script_folder_path,
                                        args.sts_corpus + target + "-test.csv")
    if args.whitening:
        evaluators[target[:5]].append(
            WhiteningEmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                measure_data_num=target_eval_data_num[idx],
                embed_dim=args.embed_dim,
                name=target,
                main_similarity=SimilarityFunction.COSINE))
    else:
        evaluators[target[:5]].append(
            EmbeddingSimilarityEvaluator.from_input_examples(
                sts_reader.get_examples(output_filename_eval),
                name=target,
                main_similarity=SimilarityFunction.COSINE))

all_results = []
logger_text = ""
for task, sequential_evaluator in evaluators.items():
    result = model.evaluate(
        SequentialEvaluator(
            sequential_evaluator,
            main_score_function=lambda scores: np.mean(scores)))
    logger_text += "%.2f \t" % (result * 100)
    all_results.append(result * 100)
logger.info(" \t".join(target_eval_tasks) + " \tOverall.")
logger.info(logger_text + "%.2f" % np.mean(all_results))
    # print(sorted_df.info())
    # print(sorted_df.head())
    # print(sorted_df['Topic'].value_counts())

    train, test = train_test_split(sorted_df, stratify=sorted_df['Topic'])
    test, val = train_test_split(test, stratify=test['Topic'])

    print("Getting the bert-base-nli-mean-tokens model.")
    model = SentenceTransformer("bert-base-nli-mean-tokens")

    print("Read AIML QA dataset")
    train_dataloader = DataLoader(train, shuffle=True, batch_size=train_batch_size)
    print("Calculate loss")
    train_loss = losses.CosineSimilarityLoss(model=model)
    print("Create evaluator")
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val)

    # Train the model
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
    print("training the model...")
    model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)
    print("complete")


    # Development set: Measure correlation between cosine score and gold labels
    print("evaluating trained model...")
Example 13
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
silver_samples = list(InputExample(texts=[data[0], data[1]], label=score) for \
    data, score in zip(silver_data, silver_scores))

train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
bi_encoder.fit(train_objectives=[(train_dataloader, train_loss)],
               evaluator=evaluator,
               epochs=num_epochs,
               evaluation_steps=1000,
               warmup_steps=warmup_steps,
               output_path=bi_encoder_path)
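
# A sketch of using the trained bi-encoder afterwards to score an unseen pair
# (util.cos_sim is pytorch_cos_sim in older releases; sentences illustrative).
from sentence_transformers import util

trained_bi_encoder = SentenceTransformer(bi_encoder_path)
emb = trained_bi_encoder.encode(["A man is playing a guitar.",
                                 "Someone plays an instrument."],
                                convert_to_tensor=True)
print(util.cos_sim(emb[0], emb[1]).item())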

######################################################################
Example 14
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)


logging.info("Read STSbenchmark dev dataset")
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples('sts-dev.csv'),
                                                                 batch_size=train_batch_size, name='sts-dev',
                                                                 main_similarity=args.main_similarity)

# Configure the training
num_epochs = 1

warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))



# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=dev_evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
Example 15
script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = '../training/nli/output/training_nli_bert-base-uncased-2021-01-10_14-44-13'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_corpus = "../datasets/stsbenchmark/" 
target_eval_files = set(['sts','sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r']) 

evaluators = []         #evaluators has a list of different evaluator classes we call periodically
sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus))
for target in target_eval_files:
	output_filename_eval = os.path.join(script_folder_path,sts_corpus + target + "-test.csv")
	evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target))

evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores))
model.evaluate(evaluator)
Example 16
print("Number of dissimilar pairs in dev data: ",
      len(dev_pair_corpus) - num_sim_dev_pairs)
print("total validation data size: ", len(dev_pair_corpus))

# modeling
model_variant = args.variant
print("model variant is: ", model_variant)
path = args.output_path
print("output path: ", path)
# word_embedding_model = models.Transformer("roberta-base")
# pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
#                                pooling_mode_mean_tokens=True,
#                                pooling_mode_cls_token=False,
#                                pooling_mode_max_tokens=False)
# model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
model = SentenceTransformer(model_variant)  # fine-tune on top of SBERT
model.to(device)
train_dataloader = DataLoader(train_pair_corpus, shuffle=True, batch_size=32)
train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_pair_corpus,
                                                             name='dev')
warmup_steps = math.ceil(len(train_dataloader) * 4 *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=4,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=path)
Example 17
from sentence_transformers import SentenceTransformer, LoggingHandler
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import STSBenchmarkDataReader
import logging
import sys
import os
import torch

script_folder_path = os.path.dirname(os.path.realpath(__file__))

#Limit torch to 4 threads
torch.set_num_threads(4)

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens'

# Load a named sentence model (based on BERT). This will download the model from our server.
# Alternatively, you can also pass a filepath to SentenceTransformer()
model = SentenceTransformer(model_name)

sts_reader = STSBenchmarkDataReader(
    os.path.join(script_folder_path, '../datasets/stsbenchmark'))
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples("sts-test.csv"))

model.evaluate(evaluator)
Example 18
    test_posts_ranking = create_posts_ranking(
        'stackoverflow_data_ranking_v2_testing.json',
        data_dir,
        model,
        validate=None,
        is_test=True)
    test_search = create_search(
        'stackoverflow_matches_codesearchnet_5k_test_collection.tsv',
        'stackoverflow_matches_codesearchnet_5k_test_queries.tsv',
        'stackoverflow_matches_codesearchnet_5k_test_blanca-qidpidtriples.train.tsv',
        data_dir,
        model,
        validate=None,
        is_test=True)

    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_hierarchy_samples, name='test-hierarchy-samples')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_posts_ranking, name='test-post-ranking')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_usage, name='test-usage')
    test_evaluator(model, output_path=args.output_dir)

    test_evaluator = BinaryClassificationEvaluator.from_input_examples(
        test_linked_posts, name='test-linked-posts')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = BinaryClassificationEvaluator.from_input_examples(
        test_class_posts, name='test-class-posts')
    test_evaluator(model, output_path=args.output_dir)
    test_evaluator = BinaryClassificationEvaluator.from_input_examples(
Example 19

import math

from torch.utils.data import DataLoader

from sentence_transformers import losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

train_dataloader = DataLoader(train_samples[:100000], shuffle=True, batch_size=train_batch_size)
# train_dataloader = DataLoader(train_samples[:80000], shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=word_embedding_model)


# Development set: Measure correlation between cosine score and gold labels
logging.info("Read SNLI benchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='snli-test')


# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
logging.info("checkpoint_save_steps: {}".format(10*len(train_dataloader)))


# Train the model
word_embedding_model.fit(train_objectives=[(train_dataloader, train_loss)],
                         evaluator=evaluator,
                         epochs=num_epochs,
                         evaluation_steps=10000,
                         warmup_steps=warmup_steps,
                         output_path=OUTPUT_MODEL,
Example 20
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

#You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
# model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
set_seed(args)
# Read the dataset
train_batch_size = args.batch_size
model_save_path = args.model_path

model = SentenceTransformer(model_save_path)

folder = '../datasets/temp-sts/STS-data'
names = [
    'STS2012-gold', 'STS2013-gold', 'STS2014-gold', 'STS2015-gold',
    'STS2016-gold', 'SICK-data'
]

for name in names:

    sts_reader = STSDataReader(os.path.join(folder, name))
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        sts_reader.get_examples('all.tsv'),
        batch_size=train_batch_size,
        name=name + '-test')
    test_evaluator(model, output_path=model_save_path)
Example 21
def main(args):
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout
    # Read the dataset
    train_batch_size = 64
    num_epochs = 1000

    if args.pretrained:
        model = SentenceTransformer(args.pretrained)
        model_save_path = os.path.join(
            args.save_path,
            args.pretrained.split("/")[-1] + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    else:
        #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
        model_name = 'cl-tohoku/bert-base-japanese-char-whole-word-masking'
        model_save_path = os.path.join(
            args.save_path,
            model_name.replace("/", "-") + '-' +
            datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
        # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read custom train dataset")

    train_samples = []
    val_samples = []
    inp_list = []
    dataset_path = args.data_path
    with gzip.open(dataset_path, 'rt', encoding='utf8') as fIn:
        reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            score = float(
                row['score']) / 10  # Normalize score to range 0 ... 1
            inp_list.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=score))

    from sklearn.model_selection import train_test_split
    train_samples, val_samples = train_test_split(inp_list, test_size=0.2)

    train_dataset = SentencesDataset(train_samples, model)
    train_dataloader = DataLoader(train_dataset,
                                  shuffle=True,
                                  batch_size=train_batch_size)
    train_loss = losses.CosineSimilarityLoss(model=model)

    logging.info("Read custom dev dataset")
    # evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples, name='sts-dev')
    evaluator = EmbeddingSimilarityEvaluator.from_input_examples(val_samples)

    # Configure the training. We skip evaluation in this example
    warmup_steps = math.ceil(
        len(train_dataset) * num_epochs / train_batch_size *
        0.1)  #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Example 22
def test(model_save_path, sts_dataset_path, train_batch_size):
    test_samples = read_dataset(sts_dataset_path, "test")
    model = SentenceTransformer(model_save_path)
    test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
        test_samples, batch_size=train_batch_size, name='sts-test')
    test_evaluator(model, output_path=model_save_path)
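
# Usage sketch; all three arguments are hypothetical placeholders.
test(model_save_path="output/sts-model",
     sts_dataset_path="datasets/stsbenchmark.tsv.gz",
     train_batch_size=16)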