# NOTE(review): this fragment appears to begin inside a `with open(...) as fIn:`
# block that is outside this view — the indentation of the loop below depends on it.
reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    # Every pair (duplicate or not) is a labelled example for the contrastive loss.
    for row in reader:
        train_samples_ConstrativeLoss.append(
            InputExample(texts=[row['question1'], row['question2']],
                         label=int(row['is_duplicate'])))
        # MultipleNegativesRankingLoss only trains on positive (duplicate) pairs.
        if row['is_duplicate'] == '1':
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question1'], row['question2']],
                             label=1))
            # Also add the mirrored pair: duplication is symmetric.
            train_samples_MultipleNegativesRankingLoss.append(
                InputExample(texts=[row['question2'], row['question1']],
                             label=1)
            )  # if A is a duplicate of B, then B is a duplicate of A

# Create data loader and loss for MultipleNegativesRankingLoss
train_dataset_MultipleNegativesRankingLoss = SentencesDataset(
    train_samples_MultipleNegativesRankingLoss, model=model)
train_dataloader_MultipleNegativesRankingLoss = DataLoader(
    train_dataset_MultipleNegativesRankingLoss,
    shuffle=True,
    batch_size=train_batch_size)
train_loss_MultipleNegativesRankingLoss = losses.MultipleNegativesRankingLoss(
    model)

# Create data loader and loss for OnlineContrastiveLoss
# NOTE(review): "Constrative" is a typo for "Contrastive"; kept because the
# variable names are defined/used elsewhere in the file.
train_dataset_ConstrativeLoss = SentencesDataset(train_samples_ConstrativeLoss,
                                                 model=model)
train_dataloader_ConstrativeLoss = DataLoader(train_dataset_ConstrativeLoss,
                                              shuffle=True,
                                              batch_size=train_batch_size)
train_loss_ConstrativeLoss = losses.OnlineContrastiveLoss(
    model=model, distance_metric=distance_metric, margin=margin)
# ===== Beispiel #2 =====  (non-code snippet separator from the export; commented out so the file parses)
def get_dataset2(data_pars=None, model=None, **kw):
    """
    JSON data_pars to get dataset
    "data_pars":    { "data_path": "dataset/GOOG-year.csv", "data_type": "pandas",
    "size": [0, 0, 6], "output_size": [0, 6] },

    Returns:
        (train_dataloader, val_dataloader, pars) when data_pars["is_train"]
        is truthy, otherwise (val_reader, pars) for inference.

    Raises:
        ValueError: for an unsupported data type or an unknown split mode.
    """
    # data_path = path_norm(data_pars["data_path"])

    istrain = data_pars.get("is_train", 0)
    mode = "train" if istrain else "test"
    # e.g. data_pars["train_type"] / data_pars["test_type"] is 'nli' or 'sts'
    data_type = data_pars[f"{mode}_type"].lower()

    def get_reader(data_type, path):
        # Map the dataset type onto the matching sentence-transformers reader.
        if data_type == 'nli':
            Reader = readers.NLIDataReader
        elif data_type == 'sts':
            Reader = readers.STSDataReader
        else:
            # BUG FIX: the original assigned the *string* "MyCustomReader()" and
            # then called it, raising a confusing TypeError. Fail fast instead.
            raise ValueError(
                f"Unsupported data_type: {data_type!r} (expected 'nli' or 'sts')")
        return Reader(os.path.join(path))

    def get_filename(data_type, mode='test'):
        # Pick the canonical file name for the requested split.
        if mode == 'train':
            return 'train.gz' if data_pars["train_type"].lower(
            ) == 'nli' else 'sts-train.csv'
        if mode == 'test':
            return 'dev.gz' if data_pars["test_type"].lower(
            ) == 'nli' else 'sts-dev.csv'
        # BUG FIX: the original fell through with `fname` unbound here.
        raise ValueError(f"Unknown mode: {mode!r}")

    log("############ Dataloader setup  #############################")
    if istrain:
        # BUG FIX: pass data_type (not the whole data_pars dict) to get_filename.
        # (The original also built unused copies of data_pars here — removed.)
        train_fname = get_filename(data_type, mode='train')
        train_reader = get_reader(data_type, data_pars['train_path'])
        train_data = SentencesDataset(train_reader.get_examples(train_fname),
                                      model=model.model)
        train_dataloader = DataLoader(train_data,
                                      shuffle=True,
                                      batch_size=data_pars["batch_size"])

        val_fname = get_filename(data_type, mode='test')
        val_reader = get_reader(data_type, data_pars['test_path'])
        val_data = SentencesDataset(val_reader.get_examples(val_fname),
                                    model=model.model)
        val_dataloader = DataLoader(val_data,
                                    shuffle=True,
                                    batch_size=data_pars["batch_size"])

        pars = {"train_num_labels": train_reader.get_num_labels()}
        return train_dataloader, val_dataloader, pars

    #### Inference part: hand back the reader plus the train file name so the
    #### caller can locate the matching training data if needed.
    val_reader = get_reader(data_type, data_pars['test_path'])

    pars = {
        "train_fname":
        'train.gz'
        if data_pars["train_type"].lower() == 'nli' else 'sts-train.csv'
    }

    return val_reader, pars
# Use RoBERTa for mapping tokens to embeddings
word_embedding_model = models.RoBERTa('roberta-large')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Softmax classification head on the sentence embeddings (SBERT-style NLI training).
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read STSbenchmark dev dataset")
# NOTE(review): 'sts-train-dev.csv' is an unusual file name — confirm it exists.
dev_data = SentencesDataset(
    examples=sts_reader.get_examples('sts-train-dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1
# NOTE(review): from here on the encoder/pooling/model objects are rebuilt for
# a triplet-loss run — this looks like a second, concatenated snippet.
word_embedding_model = models.RoBERTa('roberta-base', do_lower_case=False)
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Load Old Model
# model = SentenceTransformer('checkpoints/sentence_transformers/roberta-base_v7_triplet_epoch1')

triplet_reader = TripletReader('data/v7')

logging.info("Read Train dataset")
train_data = SentencesDataset(
    examples=triplet_reader.get_examples('triplet_train_full.csv'),
    model=model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model, triplet_margin=1.0)

logging.info("Read Dev dataset")
dev_data = SentencesDataset(
    examples=triplet_reader.get_examples('triplet_dev.csv'), model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=eval_batch_size)
evaluator = TripletEvaluator(dev_dataloader)

# Train the model
# ===== Beispiel #5 =====  (non-code snippet separator from the export; commented out so the file parses)
    # NOTE(review): this loop begins inside a truncated `with open(...)` block
    # (reading the KorSTS test split); `lines`/`test_samples` come from that
    # missing context.
    for line in lines:
        s1, s2, score = line.split('\t')
        score = score.strip()
        score = float(score) / 5.0  # gold scores are 0..5 — normalise to 0..1
        test_samples.append(InputExample(texts=[s1, s2], label=score))

with open('./KorNLUDatasets/KorSTS/tune_train.tsv', 'rt',
          encoding='utf-8') as fIn:
    lines = fIn.readlines()
    for line in lines:
        s1, s2, score = line.split('\t')
        score = score.strip()
        score = float(score) / 5.0  # normalise the 0..5 gold score to 0..1
        train_samples.append(InputExample(texts=[s1, s2], label=score))

train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training. We skip evaluation in this example
warmup_steps = math.ceil(
    len(train_dataset) * num_epochs / train_batch_size *
    0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))
# ===== Beispiel #6 =====  (non-code snippet separator from the export; commented out so the file parses)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read AllNLI train dataset")
# Map NLI string labels onto integer class ids for the softmax head.
label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
train_nli_samples = []
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        # Keep only the training split of the combined AllNLI file.
        if row['split'] == 'train':
            label_id = label2int[row['label']]
            train_nli_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label_id))

train_data_nli = SentencesDataset(train_nli_samples, model=model)
train_dataloader_nli = DataLoader(train_data_nli,
                                  shuffle=True,
                                  batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int))

logging.info("Read STSbenchmark train dataset")
# Accumulators for the STSbenchmark splits (filled by the truncated loop below).
train_sts_samples = []
dev_sts_samples = []
test_sts_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
# Read the dataset (evaluation-only snippet: loads a trained model + softmax head)
model_save_path = "output2/run3/training_agb_roberta-base-nli-mean-tokens-2020-04-10_15-59-19_og_3/"
batch_size = 52
agb_reader = AGBDataReader('datasets/AGB_og')
train_num_labels = agb_reader.get_num_labels()

# Use RoBERTa for mapping tokens to embeddings
model = SentenceTransformer(model_save_path)

train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)
# Restore the trained classification head so label accuracy can be evaluated.
# NOTE(review): torch.load unpickles arbitrary objects — only safe for trusted files.
train_loss.classifier = torch.load(
    os.path.join(model_save_path, "2_Softmax/pytorch_model.bin"))

print("dev")
# `shorten=True` is a project-specific SentencesDataset option — presumably
# truncates long AGB clauses; confirm against the local SentencesDataset.
test_data = SentencesDataset(examples=agb_reader.get_examples('dev_raw.tsv'),
                             model=model,
                             shorten=True)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = LabelAccuracyEvaluator(test_dataloader, softmax_model=train_loss)
model.evaluate(evaluator)
print("test")
test_data = SentencesDataset(examples=agb_reader.get_examples('test_raw.tsv'),
                             model=model,
                             shorten=True)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
evaluator = LabelAccuracyEvaluator(test_dataloader, softmax_model=train_loss)
model.evaluate(evaluator)
# ===== Beispiel #8 =====  (non-code snippet separator from the export; commented out so the file parses)
#################################################################################################
#
# Step 3: Train bi-encoder model with both (gold + silver) STSbenchmark dataset - Augmented SBERT
#
#################################################################################################

logging.info(
    "Step 3: Train bi-encoder: {} with STSbenchmark (gold + silver dataset)".
    format(model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark gold and silver train dataset")
# Wrap each augmented pair with its score (silver_data/silver_scores are
# produced earlier in the pipeline, outside this fragment).
silver_samples = list(InputExample(texts=[data[0], data[1]], label=score) for \
    data, score in zip(silver_data, silver_scores))

train_dataset = SentencesDataset(gold_samples + silver_samples, bi_encoder)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=bi_encoder)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the bi-encoder model
### Configure sentence transformers for training and train on the provided dataset
# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logging.info("Read Triplet train dataset")
# Cap the training set at 100k triplets to bound training time.
train_dataset = SentencesDataset(examples=triplet_reader.get_examples(
    'train.csv', max_examples=100000),
                                 model=model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Wikipedia Triplet dev dataset")
evaluator = TripletEvaluator.from_input_examples(triplet_reader.get_examples(
    'validation.csv', 1000),
                                                 name='dev')

# Linear warm-up over the first 10% of the training steps.
warmup_steps = int(len(train_dataset) * num_epochs / train_batch_size *
                   0.1)  #10% of train data

# Train the model
# ===== Beispiel #10 =====  (non-code snippet separator from the export; commented out so the file parses)
    "/", "-") + '-' + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
# ICT (inverse cloze task) examples, capped at 6000 from dev.txt.
train_data = SentencesDataset(ict_reader.get_examples('dev.txt',
                                                      max_examples=6000),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

# load dev evaluator with sentence similarity task sts-dev
# Finetuning Sentence Transformers with inverse cloze task + nips papers (does not seem to improve sentence similarity by much):
# Without finetuning (vanilla bert) : Pearson: 0.5917	Spearman: 0.5932
# With ICT finetuning after 3k steps: Pearson: 0.6574	Spearman: 0.6809
# longer training does not improve the sentence similiarity
logging.info("Read STSbenchmark dev dataset")
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'),
                            model=model)
# NOTE(review): the next two assignments overwrite train_dataloader/train_loss
# and leave dev_data unused — this looks like two concatenated snippets; the
# DataLoader here was presumably meant to wrap dev_data. Confirm upstream.
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)

# The below all apply to the de example - how does one evaluate the model outside this single example???
###### Load dev sets ######

# Test on STS 2017.en-de dataset using Spearman rank correlation
logging.info("Read data/hindi_sbert_sts_train.csv dataset")
evaluators = []
sts_reader = readers.STSDataReader('./data/',
                                   s1_col_idx=0,
                                   s2_col_idx=1,
                                   score_col_idx=2)
dev_data = SentencesDataset(
    examples=sts_reader.get_examples('hindi_sbert_sts_train.csv'), model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(
    dev_dataloader, name='Hindi_Headlines_en_hi_sbert')
evaluators.append(evaluator_sts)
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(
              evaluators, main_score_function=lambda scores: scores[-1]),
          epochs=20,
          evaluation_steps=1000,
          warmup_steps=10000,
          scheduler='warmupconstant',
          output_path=output_path,
def main():
    """Fine-tune a SentenceTransformer bi-encoder on a CRR benchmark task.

    Command-line driven: reads train/valid TSV files from --data_folder/--task,
    trains with a cosine-similarity loss, evaluates embedding similarity on the
    validation split, and saves the model under --output_dir.
    """
    cli = argparse.ArgumentParser()

    # Input and output configs
    cli.add_argument("--task", default=None, type=str, required=True,
                     help="the task to run bert ranker for")
    cli.add_argument("--data_folder", default=None, type=str, required=True,
                     help="the folder containing data")
    cli.add_argument("--output_dir", default=None, type=str, required=True,
                     help="the folder to output predictions")

    # #Training procedure
    cli.add_argument("--num_epochs", default=5, type=int, required=False,
                     help="Number of epochs for training.")
    cli.add_argument("--train_batch_size", default=8, type=int, required=False,
                     help="Training batch size.")
    # #Model hyperparameters
    cli.add_argument("--transformer_model", default="bert-base-cased",
                     type=str, required=False,
                     help="Bert model to use (default = bert-base-cased).")

    args = cli.parse_args()

    # Token-level encoder followed by mean pooling -> fixed-size sentence vectors.
    encoder = models.Transformer(args.transformer_model)
    pooler = models.Pooling(encoder.get_word_embedding_dimension(),
                            pooling_mode_mean_tokens=True,
                            pooling_mode_cls_token=False,
                            pooling_mode_max_tokens=False)
    sbert = SentenceTransformer(modules=[encoder, pooler])

    logging.info("Creating train CRR dataset.")
    reader = CRRBenchmarkDataReader('{}/{}'.format(args.data_folder,
                                                   args.task))

    train_set = SentencesDataset(reader.get_examples("train.tsv"), sbert)
    train_loader = DataLoader(train_set, shuffle=True,
                              batch_size=args.train_batch_size)
    loss = losses.CosineSimilarityLoss(model=sbert)

    logging.info("Creating dev CRR dataset.")
    dev_set = SentencesDataset(reader.get_examples('valid.tsv'), sbert)
    dev_loader = DataLoader(dev_set, shuffle=False,
                            batch_size=args.train_batch_size)
    dev_evaluator = EmbeddingSimilarityEvaluator(dev_loader)

    # Linear warm-up over the first 10% of the training steps.
    warmup = math.ceil(len(train_set) * args.num_epochs
                       / args.train_batch_size * 0.1)
    logging.info("Warmup-steps: {}".format(warmup))

    logging.info("Fitting sentenceBERT")
    sbert.fit(train_objectives=[(train_loader, loss)],
              evaluator=dev_evaluator,
              epochs=args.num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup,
              output_path=args.output_dir
              + "{}_{}".format(args.transformer_model, args.task))
# ===== Beispiel #13 =====  (non-code snippet separator from the export; commented out so the file parses)
def grid_search_fine_tune_sbert(train_params,
                                train_sents,
                                train_labels,
                                label_names,
                                eval_classifier=None):
    """
    Find the optimal SBERT model by doing a hyperparameter search over random seeds, dev percentage, and different types of SBERT models

    Args:
        train_params: dict of search settings; keys read here are
            "output_path", "all_dev_perc", "model_names", "max_num_epochs",
            "baseline", "patience" and "seeds". The dict is also mutated:
            "eval_classifier" is set for bookkeeping/logging.
        train_sents: training sentences (passed to train_test_split).
        train_labels: labels aligned with train_sents; also used to stratify.
        label_names: ordered label names; their positions define the int ids.
        eval_classifier: optional classifier object; only its class name is
            recorded in train_params.

    Side effects: trains one model per (dev_perc, model_name, seed)
    combination and writes checkpoints/evaluations under
    train_params["output_path"].
    """
    output_path = train_params["output_path"]
    all_dev_perc = train_params["all_dev_perc"]
    model_names = train_params["model_names"]
    max_num_epochs = train_params["max_num_epochs"]
    baseline = train_params['baseline']
    patience = train_params['patience']
    seeds = train_params['seeds']

    # Record which classifier is used for evaluation (SBERT itself by default).
    if eval_classifier is None:
        train_params["eval_classifier"] = "SBERT"
    else:
        train_params["eval_classifier"] = eval_classifier.__class__.__name__

    print(
        f"Grid Search Fine tuning parameters:\n{json.dumps(train_params, indent=4)}"
    )

    # Positional label -> int mapping used by the softmax head.
    label2int = dict(zip(label_names, range(len(label_names))))

    for dev_perc in all_dev_perc:
        # Stratified split so train/dev keep the same label distribution;
        # fixed random_state keeps the split identical across model_names/seeds.
        X_train, X_dev, y_train, y_dev = train_test_split(
            train_sents,
            train_labels,
            test_size=dev_perc,
            stratify=train_labels,
            random_state=100)

        # Load data samples into batches
        train_batch_size = 16
        train_samples = build_data_samples(X_train, label2int, y_train)
        dev_samples = build_data_samples(X_dev, label2int, y_dev)

        for model_name in model_names:
            # Train set config
            model = EarlyStoppingSentenceTransformer(model_name)
            train_dataset = SentencesDataset(train_samples, model=model)
            train_dataloader = DataLoader(train_dataset,
                                          shuffle=True,
                                          batch_size=train_batch_size)

            # Dev set config
            dev_dataset = SentencesDataset(dev_samples, model=model)
            dev_dataloader = DataLoader(dev_dataset,
                                        shuffle=True,
                                        batch_size=train_batch_size)

            # Define the way the loss is computed
            classifier = SoftmaxClassifier(model=model,
                                           sentence_embedding_dimension=model.
                                           get_sentence_embedding_dimension(),
                                           num_labels=len(label2int))
            warmup_steps = math.ceil(
                len(train_dataset) * max_num_epochs / train_batch_size *
                0.1)  # 10% of train data for warm-up

            # NOTE(review): the same model instance is re-fit for every seed,
            # so each seed continues from the previous seed's weights rather
            # than from a fresh checkpoint — confirm this is intended.
            for seed in seeds:
                set_seeds(seed)
                model_deets = f"{train_params['eval_classifier']}_model={model_name}_test-perc={dev_perc}_seed={seed}"

                # Train the model
                start = time.time()
                dev_evaluator = CustomLabelAccuracyEvaluator(
                    dataloader=dev_dataloader,
                    softmax_model=classifier,
                    name='lae-dev',
                    label_names=label_names,
                    model_hyper_params={
                        'model_name': model_name,
                        'dev_perc': dev_perc,
                        'seed': seed
                    })

                model.fit(
                    train_objectives=[(train_dataloader, classifier)],
                    evaluator=dev_evaluator,
                    epochs=max_num_epochs,
                    evaluation_steps=1000,
                    warmup_steps=warmup_steps,
                    output_path=output_path,
                    model_deets=model_deets,
                    baseline=baseline,
                    patience=patience,
                )

                end = time.time()
                hours, rem = divmod(end - start, 3600)
                minutes, seconds = divmod(rem, 60)
                print(
                    "Time taken for fine-tuning:",
                    "{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes),
                                                    seconds))
# Build labelled (query, document) pairs from the relevance judgements and
# fine-tune the ranker on them.
examples = []

for topic in topics:
    gold = qrel[topic["number"]].items()  # (doc_id, relevance) pairs
    query = topic["title"].strip()

    for item in gold:
        # BUG FIX: the original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; catch only genuine errors from the docno lookup
        # (missing documents are simply skipped).
        try:
            doc = db.lookup_docno(item[0])
            examples.append(InputExample(texts=[query, doc], label=item[1]))
        except Exception:
            continue
print("finished")

from torch.utils.data import DataLoader
train_dataset = SentencesDataset(examples, ranker)
train_dl = DataLoader(train_dataset, shuffle=True, batch_size=16)
ranker.fit(train_dataloader=train_dl, epochs=5, output_path="ranker/base")

from tqdm.notebook import tqdm

run = {}
for topic in tqdm(topics):
    number = topic["number"]
    query = topic["title"]

    extracted_ids = [k for k in qrel[number].keys()]

    doc_ids = []
    for id in extracted_ids:
        try:
# ===== Beispiel #15 =====  (non-code snippet separator from the export; commented out so the file parses)
# CNN over the token embeddings (kernel sizes 1/3/5, 256 output channels).
cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.
                 get_word_embedding_dimension(),
                 out_channels=256,
                 kernel_sizes=[1, 3, 5])

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(cnn.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = LanguageTransformer(modules=[word_embedding_model, cnn, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read STSbenchmark train dataset")
train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    sts_reader.get_examples('sts-dev.csv'))

# Configure the training
num_epochs = 10
# Linear warm-up over the first 10% of the training steps.
warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
# ===== Beispiel #16 =====  (non-code snippet separator from the export; commented out so the file parses)
    # NOTE(review): this fragment is the interior of a function whose header is
    # outside this view; `word_embedding_model`, `pooling_model`, `agb_reader`,
    # `train_num_labels`, `batch_size` and `progress` come from that context.
    # Add two trainable feed-forward networks (DAN)
    sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension(
    )
    dan1 = models.Dense(in_features=sent_embeddings_dimension,
                        out_features=sent_embeddings_dimension)
    dan2 = models.Dense(in_features=sent_embeddings_dimension,
                        out_features=sent_embeddings_dimension)

    model = SentenceTransformer(
        modules=[word_embedding_model, pooling_model, dan1, dan2])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AGB train dataset")
    train_data = SentencesDataset(agb_reader.get_examples('train_raw.tsv'),
                                  model=model,
                                  shorten=False)
    train_dataloader = DataLoader(train_data,
                                  shuffle=True,
                                  batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=train_num_labels)

    logging.info("Read AGB dev dataset")
    dev_data = SentencesDataset(
        examples=agb_reader.get_examples('dev_raw.tsv'),
        model=model,
        shorten=False)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    progress.update(1)

progress.reset()
progress.close()
logging.info("Textual augmentation completed....")
logging.info("Number of silver pairs generated: {}".format(
    len(silver_samples)))

###################################################################
#
# Train SBERT model with both (gold + silver) STS benchmark dataset
#
###################################################################

logging.info("Read STSbenchmark (gold + silver) training dataset")
# Human-labelled (gold) and augmented (silver) pairs are trained on together.
train_dataset = SentencesDataset(gold_samples + silver_samples, model)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples,
                                                             name='sts-dev')

# Configure the training.
warmup_steps = math.ceil(len(train_dataset) * num_epochs / batch_size *
                         0.1)  #10% of train data for warm-up
logging.info("Warmup-steps: {}".format(warmup_steps))

# Train the SBERT model
# ===== Beispiel #18 =====  (non-code snippet separator from the export; commented out so the file parses)
def main(args):
    """Train and/or run inference for a sentence-similarity SBERT model.

    Fields read from ``args``: model_name, batch_size, model_output_dir,
    init_model, do_train, train_data, cached_data, dev_data, num_epochs,
    do_predict, pred_data.
    """
    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'

    model_name = args.model_name
    if model_name is None:
        model_name = 'bert-base-chinese'

    # Read the dataset
    batch_size = args.batch_size

    model_output_dir = args.model_output_dir
    #model_save_path = os.path.join(model_output_dir, "bert-base", datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    model_save_path = model_output_dir

    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    # word_embedding_model = models.Transformer(model_name)
    # Build a fresh encoder + mean pooling unless an existing model is given.
    if args.init_model is None:
        word_embedding_model = models.Transformer(model_name)

        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)

        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
    else:
        model = SentenceTransformer(args.init_model)

    if args.do_train == 1:
        # Convert the dataset to a DataLoader ready for training
        data_reader = SimTextDataReader()
        logging.info("train_data:%s" % (args.train_data))
        logging.info("cache_data:%s" % (args.cached_data))
        # '#'-separated list of training files.
        train_data_files = args.train_data.split('#')
        cached_data_file = args.cached_data
        logging.info("Read train dataset")
        # The tokenised dataset is cached via torch.save to skip re-reading.
        if not os.path.isfile(cached_data_file):
            train_examples = []
            for train_file in train_data_files:
                if os.path.isfile(train_file):
                    logging.info("load train file:%s" % (train_file))
                    now_examples = data_reader.get_examples(train_file)
                    train_examples.extend(now_examples)

            train_data = SentencesDataset(train_examples, model=model)
            torch.save(train_data, args.cached_data)
        else:
            train_data = torch.load(cached_data_file)
            logging.info("Load cached dataset %s" % (cached_data_file))
        logging.info("Build train dataset")
        train_dataloader = DataLoader(train_data,
                                      shuffle=True,
                                      batch_size=batch_size)
        # train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)
        train_loss = losses.CosineSimilarityLoss(model=model)

        logging.info("Read dev dataset")
        dev_data_files = args.dev_data.split('#')
        dev_examples = []
        for dev_file in dev_data_files:
            if os.path.isfile(dev_file):
                logging.info("load dev file:%s" % (dev_file))
                now_examples = data_reader.get_examples(dev_file)
                dev_examples.extend(now_examples)
        dev_data = SentencesDataset(examples=dev_examples, model=model)
        logging.info("Build dev dataset")
        dev_dataloader = DataLoader(dev_data,
                                    shuffle=False,
                                    batch_size=batch_size)
        evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

        # Configure the training
        num_epochs = args.num_epochs
        # NOTE(review): len(train_dataloader) is already the number of
        # *batches*, so dividing by batch_size again is inconsistent with the
        # "10% of train data" comment (other snippets use len(dataset)).
        # Confirm/fix upstream.
        warmup_steps = math.ceil(
            len(train_dataloader) * num_epochs / batch_size *
            0.1)  #10% of train data for warm-up
        logging.info("Warmup-steps: {}".format(warmup_steps))

        logging.info("Start training")
        # Train the model
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=evaluator,
                  epochs=num_epochs,
                  evaluation_steps=1000,
                  warmup_steps=warmup_steps,
                  output_path=model_save_path)

    if args.do_predict == 1:
        logging.info("Read predict dataset")
        pred_data_file = args.pred_data
        output_file = os.path.join(args.model_output_dir, "pred_res")
        text_pairs = load_pred_data(pred_data_file)
        # Write one "text1<TAB>text2<TAB>cosine" line per pair.
        with open(output_file, "w", encoding="utf-8") as fp:
            for tpair in text_pairs:
                embedding_pair = model.encode(tpair)
                cos_sim = cosine_similarity(embedding_pair[0],
                                            embedding_pair[1])
                fp.write("%s\t%s\t%f\n" % (tpair[0], tpair[1], cos_sim))
# Evaluate a SentenceTransformer model (path given as argv[1]) on the
# STS benchmark test split using cosine-similarity correlation.
from sentence_transformers import SentenceTransformer, LossFunction, TrainConfig, SentencesDataset, LoggingHandler, EmbeddingSimilarityEvaluator, EmbeddingSimilarity
from sentence_transformers.dataset_readers import STSDataReader
from torch.utils.data import DataLoader  # fix: DataLoader is used below but was never imported
import numpy as np
import logging
import sys

#### Just some code to print debug information to stdout
np.set_printoptions(threshold=100)

logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Load Sentence model (based on BERT) from the path/name in argv[1]
model = SentenceTransformer(sys.argv[1])

sts_reader = STSDataReader('datasets/stsbenchmark')

# Build the test dataset; no shuffling so results are reproducible.
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"),
                             model=model)
# NOTE(review): the stock sentence-transformers API passes the bound method
# itself (model.smart_batching_collate) as collate_fn; the zero-arg call here
# only works if this fork made it a factory — confirm against the installed
# library version.
test_dataloader = DataLoader(test_data,
                             shuffle=False,
                             batch_size=8,
                             collate_fn=model.smart_batching_collate())
evaluator = EmbeddingSimilarityEvaluator(test_dataloader,
                                         EmbeddingSimilarity.COSINE)

model.evaluate(evaluator)
Beispiel #20
0
###### Read Dataset ######
# Knowledge distillation setup: the student model is trained to reproduce
# the teacher model's sentence embeddings (MSE between embedding vectors).
train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
for train_file in train_files:
    train_data.load_data(train_file)

train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size)
train_loss = losses.MSELoss(model=model)


###### Load dev sets ######

# Test on STS 2017.en-de dataset using Spearman rank correlation
logging.info("Read STS2017.en-de dataset")
evaluators = []
# File columns: sentence1, sentence2, gold similarity score.
sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2)
dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model)
# Dev data unshuffled so evaluation is deterministic.
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de')
evaluators.append(evaluator_sts)


# Use XLNI.en-de dataset with MSE evaluation
logging.info("Read XNLI.en-de dataset")
xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model)
xnli_reader.load_data('../datasets/xnli-en-de.txt.gz')

xnli_dataloader = DataLoader(xnli_reader, shuffle=False, batch_size=train_batch_size)
xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de')
evaluators.append(xnli_mse)
Beispiel #21
0
# Fine-tune a SentenceTransformer on MRPC (paraphrase detection) with
# cosine-similarity regression, evaluating on the dev split.
from sentence_transformers import SentencesDataset, SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import InputExample
from torch.utils.data import DataLoader
from transformers.data.processors.glue import MrpcProcessor

processor = MrpcProcessor()

model = SentenceTransformer('bert-base-nli-mean-tokens')
train_examples = processor.get_train_examples("/data/misc/cc/1_extraction/1.1_squad_classification/glue_data/MRPC")
# Convert GLUE InputExamples to sentence-transformers InputExamples
# (guid, [text_a, text_b], float label).
train_examples = [InputExample(ie.guid, [ie.text_a, ie.text_b], float(ie.label)) for ie in train_examples]
train_data = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=2)
train_loss = losses.CosineSimilarityLoss(model=model)

dev_examples = processor.get_dev_examples("/data/misc/cc/1_extraction/1.1_squad_classification/glue_data/MRPC")
dev_examples = [InputExample(ie.guid, [ie.text_a, ie.text_b], float(ie.label)) for ie in dev_examples]
dev_data = SentencesDataset(dev_examples, model)
# fix: the evaluator was previously built over train_data, so the "dev" score
# silently measured training data; evaluate dev_data, unshuffled so the
# evaluation order is deterministic.
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=2)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=5,
          evaluation_steps=1000,
          warmup_steps=500,
          output_path="out")

print("done")
### Configure sentence transformers for training and train on the provided dataset
# Use BERT for mapping tokens to embeddings
# NOTE(review): current sentence-transformers spells this class
# models.Transformer (singular, no model_type argument); confirm the installed
# version actually exposes models.Transformers.
word_embedding_model = models.Transformers(
    model_name_or_path='bert-base-uncased', model_type='bert')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

logging.info("Read Triplet train dataset")
# Each example is an (anchor, positive, negative) triplet.
train_data = SentencesDataset(
    examples=triplet_reader.get_examples('train.csv'), model=model)
train_dataloader = DataLoader(train_data,
                              shuffle=True,
                              batch_size=train_batch_size)
train_loss = losses.TripletLoss(model=model)

logging.info("Read Wikipedia Triplet dev dataset")
# The dev set is capped at 1000 examples to keep evaluation fast.
dev_data = SentencesDataset(examples=triplet_reader.get_examples(
    'validation.csv', 1000),
                            model=model)
dev_dataloader = DataLoader(dev_data,
                            shuffle=False,
                            batch_size=train_batch_size)
evaluator = TripletEvaluator(dev_dataloader)
warmup_steps = int(len(train_data) * num_epochs / train_batch_size *
def BertEM(path_train, path_valid, path_test, path_error, epochs_num, warmup_steps_num, evaluation_steps_num):
    """Fine-tune the module-level sentence-transformer `model` on a text-pair
    dataset and sweep cosine-similarity thresholds on the test set.

    Args:
        path_train, path_valid, path_test: CSV files with columns
            text_a, text_b, label (label is 0/1).
        path_error: output CSV path for rows misclassified at threshold 0.70.
        epochs_num, warmup_steps_num, evaluation_steps_num: forwarded to
            model.fit().
    """
    # Progress-bar helper (module-level `progressbar`).
    bar = progressbar
    # Force the text columns to str on load.
    data_type = {"text_a": str, "text_b": str}
    train_data = pd.read_csv(path_train, encoding='utf-8', dtype=data_type)
    valid_data = pd.read_csv(path_valid, encoding='utf-8', dtype=data_type)
    test_data = pd.read_csv(path_test, encoding='utf-8', dtype=data_type)

    # --- Training set -> InputExample list ---
    # (the original per-row time.sleep(0.0001) throttle was removed: it only
    # slowed ingestion and had no effect on results)
    train_examples = []
    for i in bar.progressbar(range(len(train_data))):
        text_a = str(train_data.iloc[i]['text_a'])
        text_b = str(train_data.iloc[i]['text_b'])
        label_data = float(train_data.iloc[i]['label'])
        train_examples.append(InputExample(texts=[text_a, text_b], label=label_data))
    # fix: the original printed the InputExample class object itself; report a
    # useful count instead
    print("train examples:", len(train_examples))

    # --- Validation set -> parallel sentence/label lists for the evaluator ---
    sentence_a = []
    sentence_b = []
    label_valid = []
    for i in bar.progressbar(range(len(valid_data))):
        sentence_a.append(valid_data.iloc[i]['text_a'])
        sentence_b.append(valid_data.iloc[i]['text_b'])
        label_valid.append(float(valid_data.iloc[i]['label']))
    # Define the evaluator (binary classification over cosine similarity).
    # evaluator = evaluation.EmbeddingSimilarityEvaluator(sentence_a, sentence_b, label_valid)
    evaluator = evaluation.BinaryClassificationEvaluator(sentence_a, sentence_b, label_valid)
    # Define dataset, dataloader and loss.
    train_dataset = SentencesDataset(train_examples, model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=64)
    train_loss = losses.CosineSimilarityLoss(model)

    # fix: time.clock() was removed in Python 3.8 — use perf_counter() for
    # wall-clock training time
    start_time = time.perf_counter()
    # Train the model.
    model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=epochs_num, warmup_steps=warmup_steps_num,evaluator=evaluator, evaluation_steps=evaluation_steps_num, use_amp=True)
    end_time = time.perf_counter()

    # ===================== Evaluation =====================
    # Re-read the test set and coerce all text columns to str.
    test_data = pd.read_csv(path_test, encoding='utf-8')
    test_data['text_a'] = test_data['text_a'].map(lambda x: str(x))
    test_data['text_b'] = test_data['text_b'].map(lambda x: str(x))

    # One prediction list per candidate threshold (0.20, 0.22, ..., step 0.02).
    list_num = 40
    prefix = 'pred_list_'
    test_map = {prefix + str(i): [] for i in range(list_num)}
    label_list = []
    score = 0.20
    # Collect misclassified rows and build the DataFrame once at the end.
    error_rows = []
    # Walk the test set.
    for i in bar.progressbar(range(len(test_data))):
        text_a_embedding = model.encode(test_data.iloc[i]['text_a'], convert_to_tensor=True)
        text_b_embedding = model.encode(test_data.iloc[i]['text_b'], convert_to_tensor=True)
        extend_score = 0
        cos_scores = util.pytorch_cos_sim(text_a_embedding, text_b_embedding)[0]
        cos_scores = cos_scores.cpu()
        cos_scores = cos_scores + extend_score
        # Gold label list.
        label = test_data.iloc[i]['label']
        label_list.append(int(label))
        # Record rows misclassified at the fixed 0.70 threshold.
        if cos_scores >= 0.70:
            pred_test = 1
        else:
            pred_test = 0
        if pred_test != label:
            error_rows.append({'id': i,
                               'text_a': test_data.iloc[i]['text_a'],
                               'text_b': test_data.iloc[i]['text_b'],
                               'cos_scores': cos_scores})
        # Append this row's prediction to every threshold's list.
        compute_pred(score, cos_scores, prefix, test_map)

    # fix: DataFrame.append was removed in pandas 2.0 (and was quadratic);
    # construct the error frame in one shot from the collected rows.
    error_csv = pd.DataFrame(error_rows, columns=('id', 'text_a', 'text_b', 'cos_scores'))
    error_csv.to_csv(path_error, index=0)
    max_f1 = 0
    target_threshold = 0.01
    target_precision = 0.01
    target_recall = 0.01
    threshold = 0.20
    # Sweep thresholds, printing each score and keeping the best F1.
    for i in range(len(test_map.keys())):
        precision, recall, f1 = compute_score(label_list, test_map[prefix + str(i)])
        if f1 >= max_f1:
            max_f1 = f1
            target_threshold = threshold
            target_precision = precision
            target_recall = recall
        print('The score > {} result is precision: {}, | recall:{}, | f1: {}'.format(round(threshold,2), precision, recall, f1))
        threshold += 0.02
    # NOTE(review): path_a is not defined in this function — presumably a
    # module-level global naming the dataset; confirm before running.
    print('================dataset_name==================',path_a)
    print('================threshold:{}, target_precision:{}, target_recall:{}, max_f1:{}'.format(target_threshold, target_precision, target_recall, max_f1))
    print('================train_time:{}'.format(str(end_time-start_time)))
Beispiel #24
0
# Use BERT for mapping tokens to embeddings
word_embedding_model = models.Transformer(model_name)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


# Convert the dataset to a DataLoader ready for training
# Multi-task setup: an NLI objective (softmax over train_num_labels classes)
# alongside an STS regression objective (cosine similarity vs. gold score).
logging.info("Read AllNLI train dataset")
train_data_nli = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)

logging.info("Read STSbenchmark train dataset")
train_data_sts = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model)
train_dataloader_sts = DataLoader(train_data_sts, shuffle=True, batch_size=batch_size)
train_loss_sts = losses.CosineSimilarityLoss(model=model)


logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples('sts-dev.csv'), name='sts-dev')

# Configure the training
num_epochs = 4
Beispiel #25
0
def main_worker(gpu, ngpus_per_node, args):
    """Per-process worker for (optionally distributed) MoCo training over
    sentence-pair data.

    Initializes the process group if distributed, builds and device-places the
    SB_MoCo model, optionally resumes from a checkpoint, builds a DataLoader
    over a SentencesDataset, then runs the epoch loop and saves checkpoints.

    Args:
        gpu: GPU index assigned to this worker (None for the non-pinned path).
        ngpus_per_node: GPU count on this node; used to derive the global rank
            and to split batch_size/num_workers per process.
        args: parsed CLI namespace (distributed flags, MoCo and optimizer
            hyperparameters, checkpoint options).
    """
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.arch))
    model = moco.builder.SB_MoCo(args.moco_dim, args.moco_k, args.moco_m, args.moco_t, args.mlp)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)

        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        pass
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        # raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # fix: the model name was wrapped in doubled quotes (''...''), which is a
    # SyntaxError; use a single pair of quotes.
    SB = SentenceTransformer('bert-base-nli-mean-tokens')

    dataReader = DataReader.DataReader('csv')

    train_dataset = SentencesDataset(dataReader.get_examples('positive_pair.csv'), model=SB)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    # NOTE(review): the stock sentence-transformers API passes the bound method
    # itself (SB.smart_batching_collate) as collate_fn; the zero-arg call here
    # only works if this fork made it a factory — confirm.
    fn = SB.smart_batching_collate()

    train_loader = torch.utils.data.DataLoader(
        train_dataset,               # The training samples.
        sampler=train_sampler,       # DistributedSampler when distributed, else None.
        batch_size=args.batch_size,  # Trains with this (per-process) batch size.
        num_workers=args.workers,
        drop_last=True,
        collate_fn=fn
    )

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Reshuffle shards differently each epoch.
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        # Only the (per-node) master rank writes checkpoints.
        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                }, is_best=False, filename='./checkpoints/moco_{:03d}.pth.tar'.format(epoch))
# Use BERT for mapping tokens to embeddings
# NOTE(review): models.BERT is the legacy sentence-transformers class; newer
# releases use models.Transformer — confirm the installed version.
word_embedding_model = models.BERT('bert-base-uncased')

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(
    word_embedding_model.get_word_embedding_dimension(),
    pooling_mode_mean_tokens=True,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Convert the dataset to a DataLoader ready for training
logging.info("Read ParaBank train dataset")
train_data = SentencesDataset(nli_reader.get_examples('train.tsv'),
                              model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# Softmax classification head over train_num_labels pair classes.
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=train_num_labels)

logging.info("Read ParaBank dev dataset")
# Dev data unshuffled so evaluation is deterministic.
dev_data = SentencesDataset(examples=sts_reader.get_examples('dev.tsv'),
                            model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

# Configure the training
num_epochs = 1
Beispiel #27
0
 def _dataloader_from_examples(examples,
                               model,
                               batch_size=8,
                               shuffle=False):
     """Wrap `examples` in a SentencesDataset for `model` and return a
     DataLoader over it (progress bar enabled while encoding)."""
     dataset = SentencesDataset(examples, model, show_progress_bar=True)
     loader = DataLoader(dataset, shuffle=shuffle, batch_size=batch_size)
     return loader
Beispiel #28
0
#
# Step 3: Train bi-encoder (SBERT) model with QQP dataset - Augmented SBERT
#
###########################################################################

logging.info(
    "Step 3: Train bi-encoder: {} over labeled QQP (target dataset)".format(
        model_name))

# Convert the dataset to a DataLoader ready for training
logging.info("Loading BERT labeled QQP dataset")
# Pair each silver-labeled sentence pair with its binary silver score.
qqp_train_data = list(
    InputExample(texts=[data[0], data[1]], label=score)
    for (data, score) in zip(silver_data, binary_silver_scores))

train_dataset = SentencesDataset(qqp_train_data, bi_encoder)
train_dataloader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=batch_size)
train_loss = losses.MultipleNegativesRankingLoss(bi_encoder)

###### Classification ######
# Given (question1, question2), is this a duplicate or not?
# The evaluator will compute the embeddings for both questions and then compute
# a cosine similarity. If the similarity is above a threshold, we have a duplicate.
logging.info("Read QQP dev dataset")

dev_sentences1 = []
dev_sentences2 = []
dev_labels = []
    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    transformers_cache_dir = os.path.join(os.path.dirname(__file__), '..', '..', 'cache', 'transformers')
    word_embedding_model = models.Transformer(model_name, cache_dir=transformers_cache_dir)

    # Apply mean pooling to get one fixed sized sentence vector
    # Exactly one pooling mode is enabled, selected by args.pooling.
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=args.pooling == 'mean',
                                   pooling_mode_cls_token=args.pooling == 'cls',
                                   pooling_mode_max_tokens=args.pooling == 'max')

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # Convert the dataset to a DataLoader ready for training
    logging.info("Read CMNLI train dataset")
    train_data = SentencesDataset(cmnli_reader.get_examples('train'), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=train_num_labels)

    logging.info("Read CSTSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=csts_reader.get_examples('cnsd-sts-dev.txt'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # NOTE(review): dividing by batch_size looks suspicious — warm-up is usually
    # 10% of total optimizer steps (len(train_dataloader) * num_epochs * 0.1).
    warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1)  # 10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
postprocessor = PolishSubstituer(
    '/home/xstefan3/arqmath/ARQMath_CLEF2020/Collection/formula_prefix.V0.2.tsv'
)
# Rewrite question formulas into prefix (Polish) notation before building examples.
postproc_questions = list(
    postprocessor.process_questions(dr.post_parser.map_questions))

all_examples = list(examples_from_questions_tup(postproc_questions))
# all_examples = list(examples_from_questions_tup(dr.post_parser.map_questions))
examples_len = len(all_examples)

# 80/10/10 train/dev/test boundaries over the example list.
train_dev_test_split = (int(0.8 * examples_len), int(0.9 * examples_len))

# single-time preprocessing support
train_data = SentencesDataset(all_examples[:train_dev_test_split[0]],
                              model,
                              show_progress_bar=True)
# fix: the pickle file handle was opened inline and never closed; a context
# manager guarantees the handle is closed and the data flushed even on error.
with open("train_data.pkl", "wb") as train_data_file:
    pickle.dump(train_data, train_data_file)
# train_data = pickle.load(open("train_data.pkl", "rb"))

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)

dev_data = SentencesDataset(
    all_examples[train_dev_test_split[0]:train_dev_test_split[1]],
    model,
    show_progress_bar=True)
# Evaluate on 2000 randomly resampled dev examples per pass, not the full set.
dev_sampler = RandomSampler(dev_data, replacement=True, num_samples=2000)
dev_loader = DataLoader(dev_data, batch_size=16, sampler=dev_sampler)

train_loss = losses.CosineSimilarityLoss(model=model)
evaluator = EmbeddingSimilarityEvaluator(dev_loader,