Example #1
    def test_LabelAccuracyEvaluator(self):
        """Tests that the LabelAccuracyEvaluator can be loaded correctly"""
        model = SentenceTransformer('paraphrase-distilroberta-base-v1')

        nli_dataset_path = 'datasets/AllNLI.tsv.gz'
        if not os.path.exists(nli_dataset_path):
            util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz',
                          nli_dataset_path)

        label2int = {"contradiction": 0, "entailment": 1, "neutral": 2}
        dev_samples = []
        with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
            reader = csv.DictReader(fIn,
                                    delimiter='\t',
                                    quoting=csv.QUOTE_NONE)
            for row in reader:
                if row['split'] == 'train':
                    label_id = label2int[row['label']]
                    dev_samples.append(
                        InputExample(
                            texts=[row['sentence1'], row['sentence2']],
                            label=label_id))
                    if len(dev_samples) >= 100:
                        break

        train_loss = losses.SoftmaxLoss(
            model=model,
            sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
            num_labels=len(label2int))

        dev_dataloader = DataLoader(dev_samples, shuffle=False, batch_size=16)
        evaluator = evaluation.LabelAccuracyEvaluator(dev_dataloader,
                                                      softmax_model=train_loss)
        acc = evaluator(model)
        assert acc > 0.2
Example #2
    def test_multiclass(self):
        transformer = models.Transformer('prajjwal1/bert-tiny')
        model = SentenceTransformer(modules=[
            transformer,
            models.Pooling(transformer.get_word_embedding_dimension())
        ])
        softmax_loss = losses.SoftmaxLoss(
            model, transformer.get_word_embedding_dimension(), num_labels=3)

        samples = [
            InputExample(texts=[
                "Hello Word, a first test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=0),
            InputExample(texts=[
                "Hello Word, a second test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=1),
            InputExample(texts=[
                "Hello Word, a third test sentence",
                "Hello Word, a other test sentence"
            ],
                         label=2)
        ]
        dataloader = DataLoader(samples, batch_size=1)
        evaluator = MulticlassEvaluator(dataloader, softmax_model=softmax_loss)
        result = evaluator(model)

        i = 0
Example #3
def train(hp):
    """Train the advanced blocking model
    Store the trained model in hp.model_fn.

    Args:
        hp (Namespace): the hyperparameters

    Returns:
        None
    """
    # define model
    model_names = {'distilbert': 'distilbert-base-uncased',
                   'bert': 'bert-base-uncased',
                   'albert': 'albert-base-v2' }

    word_embedding_model = models.Transformer(model_names[hp.lm])
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    # load the training and validation data
    reader = Reader()
    trainset = SentencesDataset(examples=reader.get_examples(hp.train_fn),
                                model=model)
    train_dataloader = DataLoader(trainset,
                                  shuffle=True,
                                  batch_size=hp.batch_size)
    train_loss = losses.SoftmaxLoss(
        model=model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=2)

    dev_data = SentencesDataset(examples=reader.get_examples(hp.valid_fn),
                                model=model)
    dev_dataloader = DataLoader(dev_data,
                                shuffle=False,
                                batch_size=hp.batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    warmup_steps = math.ceil(len(train_dataloader) * hp.n_epochs * 0.1)  # 10% of train steps for warm-up

    if os.path.exists(hp.model_fn):
        import shutil
        shutil.rmtree(hp.model_fn)

    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=hp.n_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=hp.model_fn,
              fp16=hp.fp16,
              fp16_opt_level='O2')
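
A minimal sketch of how train(hp) might be invoked, assuming argparse flags matching the hp.* attributes the function reads (lm, train_fn, valid_fn, model_fn, batch_size, n_epochs, fp16); the default values below are illustrative, not from the original project:

import argparse

# Hypothetical CLI wrapper around train(); field names mirror the hp.* attributes above.
parser = argparse.ArgumentParser()
parser.add_argument('--lm', default='distilbert', choices=['distilbert', 'bert', 'albert'])
parser.add_argument('--train_fn', default='data/train.txt')    # illustrative path
parser.add_argument('--valid_fn', default='data/valid.txt')    # illustrative path
parser.add_argument('--model_fn', default='output/blocking_model')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--n_epochs', type=int, default=1)
parser.add_argument('--fp16', action='store_true')
hp = parser.parse_args()

train(hp)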
Example #4
    def load_model(self, text_model_path, classifier_path):
        """
        Method used for pretrained model loading
        """
        self.model = SentenceTransformer(text_model_path)
        self.classification_model = torch.load(classifier_path)
        self.train_loss_nli = losses.SoftmaxLoss(
            model=self.model,
            sentence_embedding_dimension=self.model.get_sentence_embedding_dimension(),
            num_labels=len(self.label2int))
        self.train_loss_nli.classifier = self.classification_model
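
For context, a minimal sketch of the save step this loader presumably mirrors; it assumes the classification head comes from the SoftmaxLoss used during training, and the method itself is hypothetical, not part of the original project:

    def save_model(self, text_model_path, classifier_path):
        """Persist the sentence encoder and the softmax classification head."""
        # SentenceTransformer.save() writes the full encoder pipeline to a folder.
        self.model.save(text_model_path)
        # SoftmaxLoss keeps its classification head in .classifier (an nn.Linear).
        torch.save(self.train_loss_nli.classifier, classifier_path)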
Example #5
def train_sbert(model_name, model_save_path):
    batch_size = 16
    nli_reader, sts_reader = load_dataset()
    train_num_labels = nli_reader.get_num_labels()
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.BERT(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)

    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

    # Configure the training
    num_epochs = 1

    warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))
    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=1000,
              warmup_steps=warmup_steps,
              output_path=model_save_path)

    model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size)
    evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    model.evaluate(evaluator)
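
Note that models.BERT is an older sentence-transformers API; recent releases expose the same functionality through the generic Transformer module, as the other examples on this page do. A minimal sketch of the equivalent setup, assuming a current library version:

# In newer sentence-transformers releases, models.Transformer replaces models.BERT.
word_embedding_model = models.Transformer(model_name)
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])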
Example #6
    def initialize_model(self):
        # Read the dataset
        # Use BERT for mapping tokens to embeddings
        word_embedding_model = models.Transformer(self.base_model,
                                                  max_seq_length=128)
        # Apply mean pooling to get one fixed sized sentence vector
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension(),
            pooling_mode_mean_tokens=True,
            pooling_mode_cls_token=False,
            pooling_mode_max_tokens=False)
        self.model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        self.train_loss_nli = losses.SoftmaxLoss(
            model=self.model,
            sentence_embedding_dimension=self.model.get_sentence_embedding_dimension(),
            num_labels=len(self.label2int))
Example #7
def main():

  args = parser.parse_args()
  model = build_model()

  train_loss = losses.SoftmaxLoss(
      model=model,
      sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
      num_labels=NUM_LABELS)

  score_map = collections.defaultdict(lambda : collections.defaultdict())
  with open(args.inputdir + "/scores.pickle", 'rb') as f:
    scores = pickle.load(f)

  for key, score_list in scores.items():
    dataset, pair_index = key
    if 'test' in dataset:
      continue
    for rev_i, score in enumerate(score_list):
      score_map[key][rev_i] = score

  dev_samples = sum([
    build_samples(args.inputdir, "traindev_dev", i, score_map)
    for i in range(6)
    ], [])
  dev_evaluator = BasicEvaluator.from_input_examples(
      dev_samples, model, batch_size=TRAIN_BATCH_SIZE, name='sts-dev')

  for epoch_i in range(num_epochs):
    num_examples = len(glob.glob(args.inputdir +"/traindev_train/*")) - 2
    num_examples = 20
    for example_i in range(num_examples):
      train_loader = build_dataloader(args.inputdir, "traindev_train",
          example_i, score_map, TRAIN_BATCH_SIZE)
      warmup_steps = math.ceil(len(train_loader) *
                               0.1)  #10% of train data for warm-up
      model.fit(train_objectives=[(train_loader, train_loss)],
                evaluator=dev_evaluator,
                epochs=1,
                evaluation_steps=1000,
                warmup_steps=warmup_steps,
                output_path=model_save_path)
      for input_ids, labels in train_loader:
        print(train_loss(input_ids, None))
Example #8
    def test_train_nli(self):
        word_embedding_model = models.Transformer('distilbert-base-uncased')
        pooling_model = models.Pooling(
            word_embedding_model.get_word_embedding_dimension())
        model = SentenceTransformer(
            modules=[word_embedding_model, pooling_model])
        train_dataset = SentencesDataset(self.nli_train_samples, model=model)
        train_dataloader = DataLoader(train_dataset,
                                      shuffle=True,
                                      batch_size=16)
        train_loss = losses.SoftmaxLoss(
            model=model,
            sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
            num_labels=3)
        model.fit(train_objectives=[(train_dataloader, train_loss)],
                  evaluator=None,
                  epochs=1,
                  warmup_steps=int(len(train_dataloader) * 0.1),
                  use_amp=True)

        self.evaluate_stsb_test(model, 50.0)
Example #9
def fine_tune(cfg):
    """
    Function to finetune a model with Infodemic-specific data.

    :param cfg: configuration dictionary
    :return: none
    """
    model = SentenceTransformer(cfg['model'])
    # data reading dependent on data format, see https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark_continue_training.py
    # for an example at lines 48-62
    train_samples = None
    train_ds = SentencesDataset(train_samples, model)
    train_dl = DataLoader(train_ds)
    train_loss = losses.SoftmaxLoss(
        model,
        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
        num_labels=3)

    evaluator = None  # list of evaluators at https://github.com/UKPLab/sentence-transformers/tree/master/sentence_transformers/evaluation
    model.fit(train_objectives=[(train_dl, train_loss)],
              evaluator=evaluator,
              epochs=30,
              evaluation_steps=1000,
              output_path=cfg['model_output'])
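
The snippet above leaves train_samples as a placeholder; a minimal sketch of what it could look like, assuming a tab-separated file with two sentences and an integer label per row (file path and column names are illustrative):

import csv
from sentence_transformers import InputExample

# Hypothetical data reading for fine_tune(); adapt to the actual Infodemic data format.
train_samples = []
with open('data/infodemic_train.tsv', encoding='utf8') as fIn:  # illustrative path
    for row in csv.DictReader(fIn, delimiter='\t'):
        train_samples.append(
            InputExample(texts=[row['sentence1'], row['sentence2']],
                         label=int(row['label'])))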
Example #10
        if os.path.isfile(labels_file):
            os.remove(os.path.join(curr_dir, "prediction_labels.csv"))
        if os.path.isfile(pred_file):
            os.remove(os.path.join(curr_dir, "prediction_results.csv"))

        # Model path
        model_save_path = curr_dir
        batch_size = 24
        agb_reader = TestAGBReader('datasets/og-test')
        train_num_labels = agb_reader.get_num_labels()

        model = SentenceTransformer(model_save_path, device="cpu")

        train_loss = losses.SoftmaxLoss(model=model,
                                        sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                        num_labels=train_num_labels)
        train_loss.classifier = torch.load(os.path.join(model_save_path, "2_Softmax/pytorch_model.bin"))

        print("test")
        test_dir = "/data/daumiller/sentence-transformers/examples/datasets/og-test"
        for fn in sorted(os.listdir(test_dir)):
            examples = agb_reader.get_examples(fn)
            if not examples:
                continue
            # Hack to avoid problems with docs almost as long as batch size
            if len(examples) == batch_size + 1:
                batch_size_used = batch_size - 3
            else:
                batch_size_used = batch_size
            test_data = SentencesDataset(examples=examples, model=model, shorten=True)
Example #11
def main():
    parser = argparse.ArgumentParser(description='Start training with SBERT')
    parser.add_argument('--model_path',
                        type=str,
                        help='Path to trained model folder ./models/[MODEL_NAME]')
    parser.add_argument('--dataset',
                        type=str,
                        default='few_rel',
                        help='Name of dataset')
    parser.add_argument('--mask_method',
                        type=str,
                        default='bracket',
                        help='Type of masking')
    parser.add_argument('--num_epochs',
                        type=int,
                        default=15,
                        help='Number of epochs')
    parser.add_argument('--num_samples',
                        type=int,
                        default=-1,
                        help='Number of samples for test run; default -1 means all data')
    parser.add_argument('--max_seq_length',
                        type=int,
                        default=256,
                        help='Max token length for BERT')
    args = parser.parse_args()

    model_path = args.model_path
    dataset = args.dataset
    mask_method = args.mask_method
    num_samples = args.num_samples
    max_seq_length = args.max_seq_length
    num_epochs = args.num_epochs
    evaluation_steps = 1000 # Frequency of evaluation results
    warmup_steps = 1000 # warm up steps
    sentence_out_embedding_dimension = 256

    if model_path.endswith('/'):
        model_path = model_path[:-1]
    model_name = model_path.split('/')[-1]

    path_train_data = f'./data/train_samples/{dataset}_train_{mask_method}_train.csv'
    path_eval_data = f'./data/train_samples/{dataset}_val_{mask_method}_test.csv'
    if num_samples > 0:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}_test/'
    else:
        model_save_path = f'./trained_models/{model_name}_sbert_bi_{dataset}/'
    ### Define the model
    word_embedding_model = models.Transformer(model_path, max_seq_length=max_seq_length)

    ### Add special tokens - this helps us add tokens like Doc or query or Entity1 / Entity2 
    # but in our case we already added that to the model prior
    #tokens = ["[DOC]", "[QRY]"]
    #word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
    #word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    dense_model = models.Dense(in_features=pooling_model.get_sentence_embedding_dimension(),
                               out_features=sentence_out_embedding_dimension,
                               activation_function=nn.Tanh())
    # Model pipeline
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dense_model])

    # Prep DataLoader
    train_examples = load_train_sbert(path_train_data, num_samples)
    train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

    # Prep Evaluator
    sentences1, sentences2, scores = load_eval_sbert(path_eval_data, num_samples)
    #evaluator = evaluation.EmbeddingSimilarityEvaluator(sentences1, sentences2, scores)
    evaluator = evaluation.BinaryClassificationEvaluator(sentences1, sentences2, scores)
    #train_loss = losses.CosineSimilarityLoss(model)
    train_loss = losses.SoftmaxLoss(model, sentence_embedding_dimension=sentence_out_embedding_dimension, num_labels=2)

    # Tune the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path)
Example #12
def trainer(
    model: SBERTPredictor,
    tokenizer,
    df_train,
    df_val,
    epochs: int = 1,
    learning_rate: float = 1e-5,
    batch_size: int = 16,
    embedding_epochs: int = None,
    enable_class_weights: bool = True,
):
    """Train the SBERT model using a training data loader and a validation dataloader.

    :param model: SBERTPredicor model
    :type model: SBERT_Predictor
    :param tokenizer: tokenizer used in SBERT model
    :param df_train: train dataframe
    :type train_dataloader: pd.DataFrame()
    :param df_val: validation dataframe
    :type df_val: pd.DataFrame()
    :param epochs: numer of epochs
    :type epochs: int
    :param learning_rate: learning rate
    :type learning_rate: float
    :param batch_size: batch size to be used for training
    :type batch_size: int
    """
    if embedding_epochs is None:
        embedding_epochs = epochs
    nli_reader = NLIDataReader(df_train.append(df_val))
    train_num_labels = nli_reader.get_num_labels()

    train_data = SentencesDataset(nli_reader.get_examples(),
                                  model=model.embedding_model)
    train_data.label_type = torch.long
    # some bug in sentence_transformer library causes it to be identified as
    # float by default
    train_dataloader_embed = DataLoader(train_data,
                                        shuffle=True,
                                        batch_size=batch_size)
    train_loss_embed = losses.SoftmaxLoss(
        model=model.embedding_model,
        sentence_embedding_dimension=model.embedding_model.get_sentence_embedding_dimension(),
        num_labels=train_num_labels)

    val_nli_reader = NLIDataReader(df_val)
    dev_data = SentencesDataset(val_nli_reader.get_examples(),
                                model=model.embedding_model)
    dev_data.label_type = torch.long
    evaluator = EmbeddingSimilarityEvaluator(
        sentences1=df_val["sentence1"].values,
        sentences2=df_val["sentence2"].values,
        scores=df_val["label"].values / 2.,
        batch_size=batch_size)
    warmup_steps = math.ceil(len(train_dataloader_embed) * epochs * 0.1)
    # 10% of train steps for warm-up

    # now to train the final layer
    train_dataset = ClassifierDataset(df_train, tokenizer=tokenizer)
    val_dataset = ClassifierDataset(df_val, tokenizer=tokenizer)
    if enable_class_weights is False:
        class_weights = None
    else:
        class_weights = train_dataset.class_weights()

    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=batch_size,
                                  collate_fn=collate_fn,
                                  shuffle=True)
    val_dataloader = DataLoader(dataset=val_dataset,
                                batch_size=1,
                                collate_fn=collate_fn)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    criterion = nn.CrossEntropyLoss(
        weight=class_weights.to(device) if class_weights is not None else None)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    model.to(device)

    print("------TRAINING STARTS----------")  # noqa: T001
    # train embedding layer
    unfreeze_layer(model.embedding_model)
    model.embedding_model.fit(
        train_objectives=[(train_dataloader_embed, train_loss_embed)],
        evaluator=evaluator,
        epochs=1,
        evaluation_steps=1000,
        warmup_steps=warmup_steps,
    )  # train the Transformer layer
    freeze_layer(model.embedding_model)
    x, y = format_create(df=df_train, model=model)
    x_test, y_test = format_create(df=df_val, model=model)
    if model.logistic_model is True:
        model.logisticregression.fit(x, y)
        print(
            classification_report(
                y_test,
                model.logisticregression.predict(x_test)))  # noqa: T001
    else:
        accuracy_stats = {
            "train": [],
            "val": [],
        }
        loss_stats = {
            "train": [],
            "val": [],
        }

        for e in range(epochs):
            train_epoch_loss = 0
            train_epoch_acc = 0
            model.train()
            for sentence1, sentence2, label in tqdm(train_dataloader):
                label = label.to(device)
                optimizer.zero_grad()
                y_train_pred = model(sentence1, sentence2)

                train_loss = criterion(y_train_pred, label)
                train_acc = multi_acc(y_train_pred, label)

                train_loss.backward()
                optimizer.step()

                train_epoch_loss += train_loss.item()
                train_epoch_acc += train_acc.item()

            # VALIDATION
            with torch.no_grad():

                val_epoch_loss = 0
                val_epoch_acc = 0

                model.eval()
                for sentence1, sentence2, label in val_dataloader:
                    label = label.to(device)
                    y_val_pred = model(sentence1, sentence2)

                    val_loss = criterion(y_val_pred, label)
                    val_acc = multi_acc(y_val_pred, label)

                    val_epoch_loss += val_loss.item()
                    val_epoch_acc += val_acc.item()

            loss_stats['train'].append(train_epoch_loss /
                                       len(train_dataloader))
            loss_stats['val'].append(val_epoch_loss / len(val_dataloader))
            accuracy_stats['train'].append(train_epoch_acc /
                                           len(train_dataloader))
            accuracy_stats['val'].append(val_epoch_acc / len(val_dataloader))
            print(
                f"Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_dataloader):.5f} \
                | Val Loss: {val_epoch_loss / len(val_dataloader):.5f} \
                | Train Acc: {train_epoch_acc/len(train_dataloader):.3f} \
                | Val Acc: {val_epoch_acc/len(val_dataloader):.3f}"
            )  # noqa: T001

    print("---------TRAINING ENDED------------")  # noqa: T001
Example #13
train_nli_samples = []
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'train':
            label_id = label2int[row['label']]
            train_nli_samples.append(
                InputExample(texts=[row['sentence1'], row['sentence2']],
                             label=label_id))

train_data_nli = SentencesDataset(train_nli_samples, model=model)
train_dataloader_nli = DataLoader(train_data_nli,
                                  shuffle=True,
                                  batch_size=batch_size)
train_loss_nli = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
    num_labels=len(label2int))

logging.info("Read STSbenchmark train dataset")
train_sts_samples = []
dev_sts_samples = []
test_sts_samples = []
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        score = float(row['score']) / 5.0  # Normalize score to range 0 ... 1
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']],
                                   label=score)

        if row['split'] == 'dev':
            dev_sts_samples.append(inp_example)
Example #14
def train_nli():

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])
    #### /print debug information to stdout

    #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
    #model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    model_name = 'pretrained_model/bert-base-uncased'

    # Read the dataset
    train_batch_size = 6
    nli_reader = NLIDataReader('./examples/datasets/AllNLI')
    sts_reader = STSBenchmarkDataReader('./examples/datasets/stsbenchmark')
    train_num_labels = nli_reader.get_num_labels()
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read AllNLI train dataset")
    train_dataset = SentencesDataset(nli_reader.get_examples('train.gz'), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels)



    logging.info("Read STSbenchmark dev dataset")
    dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(
        dev_dataloader,
        softmax_model=Softmax_label(model=model,
                                    sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                    num_labels=train_num_labels))


    # Configure the training
    num_epochs = 1

    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))



    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=100,
              warmup_steps=warmup_steps,
              output_path=model_save_path)



    ##############################################################################
    #
    # Load the stored model and evaluate its performance on STS benchmark dataset
    #
    ##############################################################################

    #model = SentenceTransformer(model_save_path)
    test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model)
    test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size)
    #evaluator = EmbeddingSimilarityEvaluator(test_dataloader)

    model.evaluate(evaluator)
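
Softmax_label here appears to be a project-specific wrapper around the classification head; with the stock library, the same accuracy evaluation can simply reuse the training loss, as Example #1 on this page does. A minimal sketch:

# Stock sentence-transformers pattern: pass the SoftmaxLoss itself as the softmax_model.
evaluator = LabelAccuracyEvaluator(dev_dataloader, softmax_model=train_loss)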
Example #15
def train_self():


    train_batch_size = 8
    num_epochs = 50
    device = 'cuda:0'
    train_num_labels = 6
    evaluation_steps = 1000
    local = True

    #### Just some code to print debug information to stdout
    logging.basicConfig(format='%(asctime)s - %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=logging.INFO,
                        handlers=[LoggingHandler()])

    # model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased'
    #model_name = 'bert-base-chinese'
    model_name = './pretrained_model/bert-base-chinese'
    #train_batch_size = config.train_batch_size

    self_reader = Self_csv_DataReader('./self_dataset', local=local)
    #train_num_labels = config.train_num_labels
    model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")


    # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name, cache_dir='./pretrained_model')

    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)

    model = SentenceTransformer_NoPooling(modules=[word_embedding_model])#, pooling_model])


    # Convert the dataset to a DataLoader ready for training
    logging.info("Read self train dataset")
    train_dataset = SentencesDataset(examples=self_reader.get_examples("train.csv"), model=model)
    train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
    train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_word_embedding_dimension(), num_labels=train_num_labels)



    logging.info("Read self dev dataset")
    dev_data = SentencesDataset(examples=self_reader.get_examples('dev.csv'), model=model)
    dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size)
    evaluator = LabelAccuracyEvaluator(
        dev_dataloader,
        softmax_model=Softmax_label(model=model,
                                    sentence_embedding_dimension=model.get_word_embedding_dimension(),
                                    num_labels=train_num_labels))



    warmup_steps = math.ceil(len(train_dataset) * num_epochs / train_batch_size * 0.1) #10% of train data for warm-up
    logging.info("Warmup-steps: {}".format(warmup_steps))



    # Train the model
    model.fit(train_objectives=[(train_dataloader, train_loss)],
              evaluator=evaluator,
              epochs=num_epochs,
              evaluation_steps=evaluation_steps,
              warmup_steps=warmup_steps,
              output_path=model_save_path)