def create_and_check_xlnet_sequence_classif(
            self,
            config,
            input_ids_1,
            input_ids_2,
            input_ids_q,
            perm_mask,
            input_mask,
            target_mapping,
            segment_ids,
            lm_labels,
            sequence_labels,
            is_impossible_labels,
        ):
            """Check the output shapes of ``XLNetForSequenceClassification``.

            Builds the model from ``config`` in eval mode, runs it on
            ``input_ids_1`` without and with ``sequence_labels``, and asserts
            the sizes of the returned loss, logits and memory tensors.
            Only ``config``, ``input_ids_1`` and ``sequence_labels`` are used;
            the remaining parameters are accepted but not read here.
            """
            classifier = XLNetForSequenceClassification(config)
            classifier.eval()

            # Without labels the model must unpack into exactly two outputs.
            _logits_only, _mems_only = classifier(input_ids_1)
            # With labels the loss is prepended to the outputs.
            loss, logits, mems = classifier(input_ids_1, labels=sequence_labels)

            check = self.parent.assertListEqual

            # The loss has an empty size list.
            check(list(loss.size()), [])

            # One row of logits per batch element.
            check(
                list(logits.size()),
                [self.batch_size, self.type_sequence_label_size],
            )

            # One memory tensor per hidden layer, all with the same size.
            actual_mem_sizes = [list(mem.size()) for mem in mems]
            expected_mem_sizes = [
                [self.seq_length, self.batch_size, self.hidden_size]
            ] * self.num_hidden_layers
            check(actual_mem_sizes, expected_mem_sizes)
Beispiel #2
0
    def __init__(
        self,
        language=Language.ENGLISHCASED,
        num_labels=5,
        cache_dir=".",
        num_gpus=None,
        num_epochs=1,
        batch_size=8,
        lr=5e-5,
        adam_eps=1e-8,
        warmup_steps=0,
        weight_decay=0.0,
        max_grad_norm=1.0,
    ):
        """Store the hyper-parameters and build the XLNet classifier.

        Args:
            language (Language, optional): Model identifier; its ``value`` is
                passed to ``XLNetConfig.from_pretrained``.
                Defaults to ``Language.ENGLISHCASED``.
            num_labels (int, optional): Number of distinct labels in the
                training data. Must be at least 2. Defaults to 5.
            cache_dir (str, optional): Cache directory handed to
                ``XLNetConfig.from_pretrained``. Defaults to ".".
            num_gpus (int, optional): Number of GPUs to use. Only stored
                here; how ``None`` is interpreted is decided elsewhere.
                Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 8.
            lr (float, optional): Learning rate. Defaults to 5e-5.
            adam_eps (float, optional): Epsilon term for the optimizer.
                Defaults to 1e-8.
            warmup_steps (int, optional): Number of warm-up steps.
                Defaults to 0.
            weight_decay (float, optional): Weight decay. Defaults to 0.
            max_grad_norm (float, optional): Maximum gradient norm.
                Defaults to 1.0.

        Raises:
            ValueError: If ``num_labels`` is smaller than 2.
        """
        if num_labels < 2:
            raise ValueError("Number of labels should be at least 2.")

        # Model identity.
        self.language, self.num_labels, self.cache_dir = (
            language,
            num_labels,
            cache_dir,
        )

        # Hardware and schedule.
        self.num_gpus, self.num_epochs, self.batch_size = (
            num_gpus,
            num_epochs,
            batch_size,
        )

        # Optimizer settings.
        self.lr, self.adam_eps = lr, adam_eps
        self.warmup_steps, self.weight_decay = warmup_steps, weight_decay
        self.max_grad_norm = max_grad_norm

        # NOTE(review): the model is instantiated from the configuration only,
        # not through ``from_pretrained`` -- confirm the pretrained weights
        # are loaded somewhere else before training.
        xlnet_config = XLNetConfig.from_pretrained(
            self.language.value,
            num_labels=num_labels,
            cache_dir=cache_dir,
        )
        self.config = xlnet_config
        self.model = XLNetForSequenceClassification(xlnet_config)
Beispiel #3
0
def train(args, device):
    """Fine-tune a pretrained XLNet sequence classifier on MNLI.

    Args:
        args: Run configuration. This function sets ``args.dataset_name`` and
            reads ``args.model_name``, ``args.base_path``, ``args.train_file``
            and ``args.val_file``; ``args`` is also forwarded to the logger
            factory, the dataset reader and the trainer.
        device: Device the model is moved to and that is passed to the trainer.
    """
    args.dataset_name = "MNLI"  # TODO: parametrize

    pretrained_name = args.model_name
    log = get_train_logger(args)

    # Seed python, numpy and torch with the same fixed value.
    seed = 42
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    log.info(f'Using device {device}')

    tokenizer = XLNetTokenizer.from_pretrained(pretrained_name,
                                               do_lower_case=True)

    # Three output labels; hidden states and attentions are returned as well.
    config = XLNetConfig.from_pretrained(pretrained_name,
                                         output_hidden_states=True,
                                         output_attentions=True,
                                         num_labels=3,
                                         finetuning_task=args.dataset_name)
    model = XLNetForSequenceClassification.from_pretrained(pretrained_name,
                                                           config=config)
    model.to(device)

    # Build the train / validation loaders from the files under base_path.
    reader = MNLIDatasetReader(args, tokenizer, log)
    train_path = os.path.join(args.base_path, args.train_file)
    val_path = os.path.join(args.base_path, args.val_file)
    train_loader = reader.load_train_dataloader(train_path)
    val_loader = reader.load_val_dataloader(val_path)

    TrainModel(train_loader, val_loader, log).train(model, device, args)
 def create_model(self):
     """Load the pretrained sequence classifier named by the model configuration.

     ``"xlnet-base-cased"`` selects ``XLNetForSequenceClassification``; any
     other name is loaded with ``BertForSequenceClassification``. The model
     is moved to ``device``, which is not defined in this method and must
     come from an enclosing scope.

     Returns:
         The loaded model.
     """
     settings = self.model_configuration
     pretrained_name = settings.bert_model

     # Only the selected class name is looked up, as in an if/else.
     model_cls = (XLNetForSequenceClassification
                  if pretrained_name in ("xlnet-base-cased",)
                  else BertForSequenceClassification)

     model = model_cls.from_pretrained(pretrained_name,
                                       num_labels=settings.num_labels)
     model.to(device)
     return model
def run(args):
    """Score every line of ``data/dados.tsv`` with a fine-tuned XLNet NLI model.

    The weights and configuration are read from
    ``saved_models/xlnet-base-cased/``. Each input line is parsed into a
    (premise, hypothesis, conflict) triple, converted to model features and
    scored; one row per line is written to ``results/final_results.tsv``.

    Args:
        args: Run configuration. This function reads ``args.device`` and
            forwards ``args`` to ``ConductDatasetReader``.
    """
    nli_model_path = 'saved_models/xlnet-base-cased/'
    model_file = os.path.join(nli_model_path, 'pytorch_model.bin')
    config_file = os.path.join(nli_model_path, 'config.json')
    log = get_logger('conduct_test')
    model_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(model_name, do_lower_case=True)
    xlnet_config = XLNetConfig.from_pretrained(config_file)
    model = XLNetForSequenceClassification.from_pretrained(model_file,
                                                           config=xlnet_config)
    # The input tensors below are created on args.device, so the model has to
    # live on the same device or the forward pass fails on a device mismatch.
    model.to(args.device)

    dataset_reader = ConductDatasetReader(args, tokenizer, log)
    file_lines = dataset_reader.get_file_lines('data/dados.tsv')

    results = []
    softmax_fn = torch.nn.Softmax(dim=1)

    model.eval()
    with torch.no_grad():
        for line in tqdm(file_lines):
            premise, hypothesys, conflict = dataset_reader.parse_line(line)
            pair_word_ids, input_mask, pair_segment_ids = dataset_reader.convert_text_to_features(
                premise, hypothesys)

            # Each example is wrapped in a list to form a batch of size one.
            tensor_word_ids = torch.tensor([pair_word_ids],
                                           dtype=torch.long,
                                           device=args.device)
            tensor_input_mask = torch.tensor([input_mask],
                                             dtype=torch.long,
                                             device=args.device)
            tensor_segment_ids = torch.tensor([pair_segment_ids],
                                              dtype=torch.long,
                                              device=args.device)
            model_input = {
                'input_ids': tensor_word_ids,  # word ids
                'attention_mask': tensor_input_mask,  # input mask
                'token_type_ids': tensor_segment_ids
            }
            outputs = model(**model_input)
            logits = outputs[0]
            nli_scores, nli_class = get_scores_and_class(logits, softmax_fn)
            nli_scores = nli_scores.detach().cpu().numpy()
            # NOTE(review): the index -> class mapping (0 contradiction,
            # 1 entailment, 2 neutral) is assumed from the column names;
            # confirm it against the label order used for training.
            results.append({
                "conduct": premise,
                "complaint": hypothesys,
                "nli_class": nli_class,
                "nli_contradiction_score": nli_scores[0],
                "nli_entailment_score": nli_scores[1],
                "nli_neutral_score": nli_scores[2],
                "conflict": conflict
            })

    df = pd.DataFrame(results)
    # Make sure the output directory exists so the scoring work is not lost
    # to a missing folder at the very last step.
    os.makedirs('results', exist_ok=True)
    df.to_csv('results/final_results.tsv', sep='\t', index=False)
def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path,
                                        bert_config_file,
                                        pytorch_dump_folder_path,
                                        finetuning_task=None):
    """Convert a TensorFlow XLNet checkpoint into PyTorch weight and config files.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to load.
        bert_config_file: JSON file describing the XLNet configuration.
        pytorch_dump_folder_path: Folder receiving the ``WEIGHTS_NAME`` and
            ``CONFIG_NAME`` files.
        finetuning_task: Optional task name (compared lower-cased). A key of
            ``GLUE_TASKS_NUM_LABELS`` selects the sequence-classification
            model, a name containing ``'squad'`` selects the
            question-answering model, anything else the LM-head model.
    """
    config = XLNetConfig.from_json_file(bert_config_file)

    # Normalise the task name; "no task" becomes the empty string.
    task = "" if finetuning_task is None else finetuning_task.lower()

    if task in GLUE_TASKS_NUM_LABELS:
        banner = "Building PyTorch XLNetForSequenceClassification model from configuration: {}"
        print(banner.format(str(config)))
        config.finetuning_task = task
        config.num_labels = GLUE_TASKS_NUM_LABELS[task]
        model = XLNetForSequenceClassification(config)
    elif 'squad' in task:
        config.finetuning_task = task
        model = XLNetForQuestionAnswering(config)
    else:
        model = XLNetLMHeadModel(config)

    # Copy the TensorFlow weights into the freshly built PyTorch model.
    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)

    weights_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    config_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)

    print("Save PyTorch model to {}".format(os.path.abspath(weights_path)))
    torch.save(model.state_dict(), weights_path)

    print("Save configuration file to {}".format(
        os.path.abspath(config_path)))
    with open(config_path, "w", encoding="utf-8") as config_out:
        config_out.write(config.to_json_string())
def predict_model(args, save=True, num_labels=126):
    """Run a saved BERT/XLNet classifier over a pickled test dataset.

    Args:
        args: Run configuration. Reads ``dataset_name`` (first element),
            ``model_type``, ``model_name``, ``epoch_num``, ``batch_size``,
            ``gpu_device_ids`` and ``n_gpu``.
        save: When true, the result tensor is pickled under
            ``path_model_output``.
        num_labels: Number of output labels of the saved classifier.
            Defaults to 126, the value previously hard-coded.

    Returns:
        A CPU tensor holding the first model output of every batch,
        concatenated along dimension 0 in dataset order.

    Raises:
        ValueError: If ``args.model_type`` is neither ``"bert"`` nor
            ``"xlnet"``.
    """
    dataset_name = args.dataset_name[0]
    model_type = args.model_type
    test_dataset = path_tensor_dataset / f"{model_type}_{dataset_name}.pkl"
    test_dataset = pickle_load(test_dataset)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.batch_size,
                                 pin_memory=True,
                                 num_workers=4,
                                 shuffle=False)

    model_dir = path_model / f"{args.model_type}_{args.model_name}/checkpoint_epoch{args.epoch_num}"
    if model_type == "bert":
        model = BertForSequenceClassification.from_pretrained(
            model_dir, num_labels=num_labels)
    elif model_type == "xlnet":
        model = XLNetForSequenceClassification.from_pretrained(
            model_dir, num_labels=num_labels)
    else:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 'bert' or 'xlnet'."
        )
    model.zero_grad()
    model.eval()
    model = model.cuda(args.gpu_device_ids[0])
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=args.gpu_device_ids)

    res = []
    for batch in tqdm(test_dataloader, desc="Iteration"):
        batch = tuple(x.cuda(args.gpu_device_ids[0]) for x in batch)
        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2]
        }
        with torch.no_grad():
            outputs = model(**inputs)[0]
        # Move each batch off the GPU right away so the accumulated outputs
        # do not occupy device memory for the whole run.
        res.append(outputs.cpu())
    res = torch.cat(res, 0)
    if save:
        filename = f"{model_type}_{dataset_name}_epoch{args.epoch_num}_res.pkl"
        pickle_save(res, path_model_output / filename)
    return res
Beispiel #8
0
    def __init__(self, args, task_name, weight_file=None, config_file=None):
        """Set up the logger, tokenizer, dataset reader and XLNet classifier.

        Args:
            args: Run configuration; ``args.device`` and ``args.model_name``
                are read here, and ``args`` is forwarded to the logger and
                dataset-reader factories.
            task_name: Task identifier used for the logger, the dataset
                reader and the configuration's ``finetuning_task``.
            weight_file: Optional source for the model weights; falls back
                to ``args.model_name`` when None.
            config_file: Optional source for the configuration; falls back
                to ``args.model_name`` when None.
        """
        self.args = args
        self.device = args.device
        self.log = self.get_train_logger(args, task_name)
        self.softmax = Softmax(dim=1)

        tokenizer = XLNetTokenizer.from_pretrained(args.model_name,
                                                   do_lower_case=True)
        self.tokenizer = tokenizer
        self.dataset_reader = init_dataset_reader(task_name, args, tokenizer,
                                                  self.log)

        # Explicit files win; otherwise both come from the model name.
        config_source = config_file if config_file is not None else args.model_name
        weights_source = weight_file if weight_file is not None else args.model_name

        xlnet_config = XLNetConfig.from_pretrained(config_source,
                                                   output_hidden_states=True,
                                                   output_attentions=True,
                                                   num_labels=3,
                                                   finetuning_task=task_name)
        classifier = XLNetForSequenceClassification.from_pretrained(
            weights_source, config=xlnet_config)
        self.model = classifier.to(args.device)
def validate_on_test_set(args, device):
    """Run a saved XLNet classifier on the matched and mismatched test files.

    Args:
        args: Run configuration. Reads ``config_file``, ``model_file``,
            ``n_gpu``, ``test_matched_file`` and ``test_mismatched_file``;
            ``args`` is also forwarded to ``KaggleMNLIDatasetReader``.
        device: Device the model is moved to and that is passed to
            ``write_kaggle_results``.
    """
    log = get_logger("test-results")

    # Seed python, numpy and torch with the same fixed value.
    seed = 42
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    log.info(f'Using device {device}')

    pretrained_name = 'xlnet-base-cased'
    tokenizer = XLNetTokenizer.from_pretrained(pretrained_name,
                                               do_lower_case=True)
    config = XLNetConfig.from_pretrained(args.config_file)
    data_reader = KaggleMNLIDatasetReader(args, tokenizer, log)
    model = XLNetForSequenceClassification.from_pretrained(args.model_file,
                                                           config=config)

    model.to(device)
    use_data_parallel = args.n_gpu > 1
    if use_data_parallel:
        model = torch.nn.DataParallel(model)
    log.info(f'Running on {args.n_gpu} GPUS')

    executor = KaggleTest(tokenizer, log, data_reader)
    write_kaggle_results("matched", args.test_matched_file, executor, device,
                         model)
    write_kaggle_results("mismatched", args.test_mismatched_file, executor,
                         device, model)
batch_size = 12

# Create the DataLoader for our training set (examples drawn in random order).
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create the DataLoader for our validation set (examples kept in order).
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Load XLNetForSequenceClassification: the pretrained XLNet model with a
# sequence-classification head on top, configured for 14 labels.
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=14)


# Tell pytorch to run this model on the GPU.
model.cuda()

# Get all of the model's parameters as a list of (name, parameter) tuples.
param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
# NOTE(review): 'gamma' / 'beta' look like legacy parameter names -- confirm
# they match the parameter names of this model.
no_decay = ['bias', 'gamma', 'beta']
# The per-group option must be spelled 'weight_decay': PyTorch / Transformers
# optimizers silently ignore unknown keys such as 'weight_decay_rate', which
# would leave every group on the optimizer's default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
def main(log_in_file, lm_path, lm_type, data_path, usegpu, n_fold, total_step,
         eval_every, early_stop, lr, weight_decay, lr_decay_in_layers,
         wd_decay_in_layers, max_length, max_title_rate, content_head_rate,
         batch_size, lr_scheduler_type, input_pattern, clean_method,
         warmup_rate, classifier_dropout, classifier_active, seed):
    """Train a 3-label BERT/XLNet classifier with K-fold CV and write predictions.

    Steps performed:
      1. Configure the ``'default'`` logger and log every call argument.
      2. Build the tokenizer and preprocess the train / test CSV files found
         under ``data_path``.
      3. Load the pretrained model from ``lm_path`` and create an AdamW
         optimizer with one parameter group per named layer, whose learning
         rate and weight decay are scaled per layer.
      4. For each of ``n_fold`` folds: reset model and optimizer to their
         initial state, train, reload the best weights returned by
         ``training`` and add the test predictions to a running sum.
      5. Average the summed predictions and write ``submit_<prefix>.csv``
         and ``probability_<prefix>.csv``.

    Args:
        log_in_file: When truthy, also log to ``<prefix>.log``.
        lm_path: Directory of the pretrained model and its vocabulary file.
        lm_type: ``'bert'`` or ``'xlnet'``.
        data_path: Directory containing ``Train_DataSet.csv``,
            ``Train_DataSet_Label.csv`` and ``Test_DataSet.csv``.
        usegpu: When truthy, the model is moved to CUDA.
        n_fold: Number of K-fold splits.
        total_step: Forwarded to the linear scheduler (``t_total``) and to
            ``training``.
        eval_every, early_stop: Forwarded to ``training``.
        lr, weight_decay: Base learning rate and weight decay.
        lr_decay_in_layers, wd_decay_in_layers: Per-layer multiplicative
            factors applied to ``lr`` / ``weight_decay``.
        max_length, max_title_rate, content_head_rate, input_pattern,
        clean_method: Forwarded to ``preprocess``.
        batch_size: Training batch size; validation and test use 4x this.
        lr_scheduler_type: ``'linear'`` or ``'constant'``.
        warmup_rate: Passed to the schedulers as ``warmup_steps``.
        classifier_dropout: Dropout value passed to ``from_pretrained``.
        classifier_active: ``'relu'`` replaces the summary / pooler activation.
        seed: Seed for ``seed_everything`` and the ``KFold`` shuffle.

    Raises:
        ValueError: If ``lr_scheduler_type`` is not ``'linear'`` or
            ``'constant'``.
    """
    # Must stay the first statement: at this point locals() holds only the
    # call arguments, which are logged below.
    arg_name_value_pairs = deepcopy(locals())
    # Timestamp prefix shared by the log file and the output CSV files.
    prefix = time.strftime('%Y%m%d_%H%M')
    logger = logging.getLogger('default')
    formatter = logging.Formatter("%(asctime)s %(message)s")
    if log_in_file:
        handler1 = logging.FileHandler(prefix + '.log')
        handler1.setFormatter(formatter)
        handler1.setLevel(logging.DEBUG)
        logger.addHandler(handler1)
    # NOTE(review): handlers are added on every call; calling main() twice in
    # one process would duplicate log lines.
    handler2 = logging.StreamHandler()
    handler2.setFormatter(formatter)
    handler2.setLevel(logging.DEBUG)
    logger.addHandler(handler2)
    logger.setLevel(logging.DEBUG)
    for arg_name, arg_value in arg_name_value_pairs.items():
        logger.info(f'{arg_name}: {arg_value}')
    # The tokenizer is published as a module-level global.
    global tokenizer
    if lm_type == 'bert':
        tokenizer = BertTokenizer(os.path.join(lm_path, 'vocab.txt'))
    else:
        tokenizer = XLNetTokenizer(os.path.join(lm_path, 'spiece.model'))
        # The special-token globals are overridden only in this branch.
        global PAD, PAD_t, CLS_t, SEP_t
        PAD_t = '<pad>'
        CLS_t = '<cls>'
        SEP_t = '<sep>'
        PAD = tokenizer.convert_tokens_to_ids([PAD_t])[0]
    # NOTE(review): for lm_type == 'bert' PAD is not assigned here, so it must
    # already exist as a module-level global -- confirm.
    logger.info(f'padding token is {PAD}')
    processed_train = preprocess(
        os.path.join(data_path, 'Train_DataSet.csv'),
        os.path.join(data_path,
                     'Train_DataSet_Label.csv'), tokenizer, max_length,
        input_pattern, clean_method, max_title_rate, content_head_rate, logger)
    # The test set has no label file, hence False as second argument.
    processed_test = preprocess(os.path.join(data_path, 'Test_DataSet.csv'),
                                False, tokenizer, max_length, input_pattern,
                                clean_method, max_title_rate,
                                content_head_rate, logger)
    logger.info('seed everything and create model')
    seed_everything(seed)
    # Parameters whose name contains one of these substrings get weight decay 0.
    no_decay = ['.bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # NOTE(review): the tokenizer branch above tests for 'bert' while this one
    # tests for 'xlnet'; any other lm_type would pair an XLNet tokenizer with
    # a BERT model.
    if lm_type == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, summary_last_dropout=classifier_dropout)
        if classifier_active == 'relu':
            model.sequence_summary.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        # Name fragments listed from the embeddings up to the output head;
        # used below to assign a parameter group per layer.
        model_layer_names = [
            'transformer.mask_emb', 'transformer.word_embedding.weight'
        ]
        model_layer_names += [
            f'transformer.layer.{i}.' for i in range(model.config.n_layer)
        ]
        model_layer_names += ['sequence_summary.summary', 'logits_proj']
    else:
        model = BertForSequenceClassification.from_pretrained(
            lm_path, num_labels=3, hidden_dropout_prob=classifier_dropout)
        if classifier_active == 'relu':
            model.bert.pooler.activation = nn.ReLU()
        if usegpu:
            model = model.cuda()
        model_layer_names = ['bert.embeddings']
        model_layer_names += [
            'bert.encoder.layer.{}.'.format(i)
            for i in range(model.config.num_hidden_layers)
        ]
        model_layer_names += ['bert.pooler', 'classifier']
    # Two parameter groups per entry of model_layer_names, enumerated in
    # reverse so that i == 0 is the last-listed name: the first list holds the
    # decayed parameters (lr and weight decay scaled by factor**i), the second
    # the no_decay parameters (same lr scaling, weight decay 0). Parameters
    # whose name matches no entry are not handed to the optimizer.
    optimizer = optimizer = AdamW([{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and not any(nd in n for nd in no_decay)
        ],
        'lr':
        lr * (lr_decay_in_layers**i),
        'weight_decay':
        weight_decay * (wd_decay_in_layers**i)
    } for i, layer_name in enumerate(model_layer_names[::-1])] + [{
        'params': [
            p for n, p in model.named_parameters()
            if layer_name in n and any(nd in n for nd in no_decay)
        ],
        'lr':
        lr * (lr_decay_in_layers**i),
        'weight_decay':
        .0
    } for i, layer_name in enumerate(model_layer_names[::-1])])
    # A scheduler is built here, before the optimizer state is snapshotted,
    # and rebuilt for every fold below; this also rejects an unknown
    # lr_scheduler_type before any training starts.
    # NOTE(review): warmup_rate is passed as warmup_steps -- confirm units.
    if lr_scheduler_type == 'linear':
        lr_scheduler = WarmupLinearSchedule(optimizer,
                                            warmup_steps=warmup_rate,
                                            t_total=total_step)
    elif lr_scheduler_type == 'constant':
        lr_scheduler = WarmupConstantSchedule(optimizer,
                                              warmup_steps=warmup_rate)
    else:
        raise ValueError

    # Snapshots of the initial state, restored at the start of every fold.
    model_state_0 = deepcopy(model.state_dict())
    optimizer_state_0 = deepcopy(optimizer.state_dict())

    test_iter = get_data_iter(processed_test,
                              batch_size * 4,
                              collect_test_func,
                              shuffle=False)
    # Running sum of the per-fold test predictions, one row per test item and
    # one column per label.
    pred = np.zeros((len(processed_test), 3))
    val_scores = []

    for fold_idx, (train_idx, val_idx) in enumerate(
            KFold(n_splits=n_fold, shuffle=True,
                  random_state=seed).split(processed_train)):
        # Start every fold from the same initial model and optimizer state.
        model.load_state_dict(model_state_0)
        optimizer.load_state_dict(optimizer_state_0)
        if lr_scheduler_type == 'linear':
            lr_scheduler = WarmupLinearSchedule(optimizer,
                                                warmup_steps=warmup_rate,
                                                t_total=total_step)
        elif lr_scheduler_type == 'constant':
            lr_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmup_rate)
        else:
            raise ValueError
        train_iter = get_data_iter([processed_train[i] for i in train_idx],
                                   batch_size, collect_func)
        val_iter = get_data_iter([processed_train[i] for i in val_idx],
                                 batch_size * 4,
                                 collect_func,
                                 shuffle=False)

        best_model, best_score = training(model=model,
                                          optimizer=optimizer,
                                          lr_scheduler=lr_scheduler,
                                          train_iter=train_iter,
                                          val_iter=val_iter,
                                          total_step=total_step,
                                          tokenizer=tokenizer,
                                          usegpu=usegpu,
                                          eval_every=eval_every,
                                          logger=logger,
                                          early_stop=early_stop,
                                          fold_idx=fold_idx)
        # Predict on the test set with the best weights of this fold.
        model.load_state_dict(best_model)
        val_scores.append(best_score)
        pred += predict(model, test_iter, usegpu)
    logger.info(f'average: {np.mean(val_scores):.6f}')
    # Average the summed predictions over the folds.
    pred = pred / n_fold
    prob_df = pd.DataFrame()
    submit = pd.DataFrame()
    submit['id'] = [i['id'] for i in processed_test]
    # The submitted label is the column with the highest averaged prediction.
    submit['label'] = pred.argmax(-1)
    prob_df['id'] = [i['id'] for i in processed_test]
    prob_df['0'] = pred[:, 0]
    prob_df['1'] = pred[:, 1]
    prob_df['2'] = pred[:, 2]
    submit.to_csv(f'submit_{prefix}.csv', index=False)
    prob_df.to_csv(f'probability_{prefix}.csv', index=False)
Beispiel #12
0
# Training loader: drop_last discards the final incomplete batch so that every
# training batch has the same size.
train_dataloader = DataLoader(
    train_data,
    sampler=train_sampler,
    batch_size=batch_num,
    drop_last=True,
)

# Validation loader: examples are read sequentially and no batch is dropped.
valid_data = TensorDataset(val_inputs, val_masks, val_segs, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(
    valid_data,
    sampler=valid_sampler,
    batch_size=batch_num,
)

# Training hyper-parameters: number of epochs and gradient-clipping norm.
epochs = 10
# epochs = 3
max_grad_norm = 1.0

# Total number of optimization steps over all epochs.
# NOTE(review): this rounds the batches per epoch up, while the training
# loader above uses drop_last=True -- confirm which count is intended.
num_train_optimization_steps = int(math.ceil(len(tr_inputs) / batch_num) / 1) * epochs

# Load the XLNet model.
# The folder below is expected to contain the config (json) and the weight
# (bin) files; from_pretrained() reads both of them.
# model_file_address = 'xlnet-base-cased'
model_file_address = '/home/saul/deeplearning/xlnet'
# Downloading the model before use is recommended:
#   weights: "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin"
#   config:  "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json"
model = XLNetForSequenceClassification.from_pretrained(
    model_file_address,
    num_labels=len(tag2idx),
)

# Move the model to the selected device (the GPU when one is used).
model.to(device)
# Multi-GPU support.
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Set fine-tuning method
def Train(inputIds, attention_masks, labels, batch_size=24, epochs=10):
    """Fine-tune ``xlnet-base-cased`` as a two-label sequence classifier.

    The examples are split 80/20 into training and validation sets with a
    fixed random state. After every epoch the mean training loss and the
    mean per-batch validation accuracy are printed.

    Args:
        inputIds: Token-id sequences, one per example (must be convertible
            with ``torch.tensor``).
        attention_masks: Attention masks aligned with ``inputIds``.
        labels: One class label per example.
        batch_size: Batch size of the training and validation loaders.
        epochs: Number of passes over the training set.

    Returns:
        tuple: ``(model, trainLoss, valAcc)``. ``trainLoss[i]`` is the summed
        batch loss of epoch ``i`` and ``valAcc[i]`` the summed per-batch
        ``flat_accuracy`` of epoch ``i`` (sums, not means).
    """
    # Split inputs, masks and labels in one call so all three share exactly
    # the same shuffle (same random_state and test_size as before).
    (train_inputs, validation_inputs,
     train_masks, validation_masks,
     train_labels, validation_labels) = train_test_split(
         inputIds, attention_masks, labels, random_state=2020, test_size=0.2)

    # Turn data into torch tensors
    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    # Create iterators of the datasets
    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    # Use the GPU when available and fall back to the CPU otherwise; the
    # model and every batch are placed on this same device.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased',
                                                           num_labels=2)
    model.to(device)

    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains one of these substrings get no decay.
    no_decay = ['bias', 'gamma', 'beta']
    # The per-group option must be spelled 'weight_decay'; an unknown key such
    # as 'weight_decay_rate' is silently ignored by the optimizer.
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)

    trainLoss = []
    valAcc = []
    for _ in trange(epochs, desc='Epoch'):
        # Train
        model.train()

        trainLoss.append(0)
        nb_tr_steps = 0

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            optimizer.zero_grad()
            # Forward pass; with labels given, the loss is the first output.
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            loss = outputs[0]
            # Calculate gradients
            loss.backward()
            # Update weights using gradients
            optimizer.step()

            trainLoss[-1] += loss.item()
            nb_tr_steps += 1

        # max(..., 1) keeps the report from dividing by zero on an empty loader.
        print('\nTrain loss: {}'.format(trainLoss[-1] / max(nb_tr_steps, 1)))

        # Validation
        model.eval()

        nb_eval_steps = 0
        valAcc.append(0)
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            # Don't calculate gradients since we are evaluating the model;
            # without labels, the logits are the first output.
            with torch.no_grad():
                output = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
                logits = output[0]
            # Move logits and labels to the CPU as numpy arrays
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            valAcc[-1] += flat_accuracy(logits, label_ids)
            nb_eval_steps += 1

        print('\nValidation Accuracy: {}\n'.format(
            valAcc[-1] / max(nb_eval_steps, 1)))

    return model, trainLoss, valAcc
Beispiel #14
0
def main(_):
    """Fine-tune XLNet in TensorFlow while mirroring each step in PyTorch.

    Everything is driven by the module-level ``FLAGS`` object; the single
    positional argument is ignored.

    Visible flow:
      1. Optionally wait for a ptvsd debugger, seed TF and NumPy, and create
         the predict/output/model directories.
      2. Pick the task processor and load the SentencePiece model.
      3. ``do_train``: write the training TFRecord file, build a multi-tower
         TF graph, save the freshly initialised TF checkpoint, load that
         checkpoint into a PyTorch ``XLNetForSequenceClassification`` and
         then train both models on the same batches with the same learning
         rate, logging both losses and periodically saving both models.
      4. ``do_eval``: see the NOTE(review) comments in that branch; as
         written it depends on names that are not assigned in this function
         or that only exist after the ``do_train`` branch has run.

    Raises:
        ValueError: If none of ``do_train`` / ``do_eval`` / ``do_predict`` is
            set, or if ``FLAGS.task_name`` is not a key of ``processors``.
    """
    if FLAGS.server_ip and FLAGS.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(FLAGS.server_ip, FLAGS.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed both TensorFlow and NumPy from the same flag.
    tf.set_random_seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    tf.logging.set_verbosity(tf.logging.INFO)

    #### Validate flags
    # Never log less often than checkpoints are written.
    if FLAGS.save_steps is not None:
        FLAGS.log_step_count_steps = min(FLAGS.log_step_count_steps,
                                         FLAGS.save_steps)

    if FLAGS.do_predict:
        predict_dir = FLAGS.predict_dir
        if not tf.gfile.Exists(predict_dir):
            tf.gfile.MakeDirs(predict_dir)

    # Task name (lower-cased below) -> processor class.
    processors = {
        "mnli_matched": MnliMatchedProcessor,
        "mnli_mismatched": MnliMismatchedProcessor,
        'sts-b': StsbProcessor,
        'imdb': ImdbProcessor,
        "yelp5": Yelp5Processor
    }

    # NOTE(review): the message mentions `do_submit`, which this check does
    # not test, and is missing a closing backtick after `do_eval`.
    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval, `do_predict` or "
            "`do_submit` must be True.")

    if not tf.gfile.Exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    if not tf.gfile.Exists(FLAGS.model_dir):
        tf.gfile.MakeDirs(FLAGS.model_dir)

# Disabled earlier variant of the PyTorch model loading (kept for reference).
#   ########################### LOAD PT model
#   ########################### LOAD PT model
#   import torch
#   from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification

#   save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME)
#   tf.logging.info("Model loaded from path: {}".format(save_path))

#   device = torch.device("cuda", 4)
#   config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b')
#   config_path = os.path.join(FLAGS.model_dir, CONFIG_NAME)
#   config.to_json_file(config_path)
#   pt_model = XLNetForSequenceClassification.from_pretrained(FLAGS.model_dir, from_tf=True, num_labels=1)
#   pt_model.to(device)
#   pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7])

#   from torch.optim import Adam
#   optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999),
#                     eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay,
#                     amsgrad=False)
#   ########################### LOAD PT model
#   ########################### LOAD PT model

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    # Regression tasks carry no label list.
    label_list = processor.get_labels() if not FLAGS.is_regression else None

    sp = spm.SentencePieceProcessor()
    sp.Load(FLAGS.spiece_model_file)

    def tokenize_fn(text):
        """Preprocess ``text`` and encode it to SentencePiece ids."""
        text = preprocess_text(text, lower=FLAGS.uncased)
        return encode_ids(sp, text)

    # run_config = model_utils.configure_tpu(FLAGS)


#   model_fn = get_model_fn(len(label_list) if label_list is not None else None)

    # Used as a prefix for the cached TFRecord file names.
    spm_basename = os.path.basename(FLAGS.spiece_model_file)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    # estimator = tf.estimator.Estimator(
    #     model_fn=model_fn,
    #     config=run_config)

    if FLAGS.do_train:
        train_file_base = "{}.len-{}.train.tf_record".format(
            spm_basename, FLAGS.max_seq_length)
        train_file = os.path.join(FLAGS.output_dir, train_file_base)
        tf.logging.info("Use tfrecord file {}".format(train_file))

        train_examples = processor.get_train_examples(FLAGS.data_dir)
        tf.logging.info("Num of train samples: {}".format(len(train_examples)))

        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, train_file,
                                                FLAGS.num_passes)

        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)

        # estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps)

        ##### Create input tensors / placeholders
        # NOTE(review): bsz_per_core is computed but not used below.
        bsz_per_core = FLAGS.train_batch_size // FLAGS.num_core_per_host

        params = {
            "batch_size": FLAGS.train_batch_size  # the whole batch
        }
        train_set = train_input_fn(params)

        # One full batch per step; split along axis 0 into one shard per core.
        example = train_set.make_one_shot_iterator().get_next()
        if FLAGS.num_core_per_host > 1:
            examples = [{} for _ in range(FLAGS.num_core_per_host)]
            for key in example.keys():
                vals = tf.split(example[key], FLAGS.num_core_per_host, 0)
                for device_id in range(FLAGS.num_core_per_host):
                    examples[device_id][key] = vals[device_id]
        else:
            examples = [example]

        ##### Create computational graph
        tower_losses, tower_grads_and_vars, tower_inputs, tower_hidden_states, tower_logits = [], [], [], [], []

        for i in range(FLAGS.num_core_per_host):
            # Variables are created by the first tower and reused by the rest.
            reuse = True if i > 0 else None
            with tf.device(assign_to_gpu(i, "/gpu:0")), \
                tf.variable_scope(tf.get_variable_scope(), reuse=reuse):

                loss_i, grads_and_vars_i, inputs_i, hidden_states_i, logits_i = single_core_graph(
                    is_training=True,
                    features=examples[i],
                    label_list=label_list)

                tower_losses.append(loss_i)
                tower_grads_and_vars.append(grads_and_vars_i)
                tower_inputs.append(inputs_i)
                tower_hidden_states.append(hidden_states_i)
                tower_logits.append(logits_i)

        ## average losses and gradients across towers
        # Inputs, hidden states and logits are concatenated back along axis 0
        # so they line up with the full batch again.
        if len(tower_losses) > 1:
            loss = tf.add_n(tower_losses) / len(tower_losses)
            grads_and_vars = average_grads_and_vars(tower_grads_and_vars)
            inputs = dict((n, tf.concat([t[n] for t in tower_inputs], 0))
                          for n in tower_inputs[0])
            hidden_states = list(
                tf.concat(t, 0) for t in zip(*tower_hidden_states))
            logits = tf.concat(tower_logits, 0)
        else:
            loss = tower_losses[0]
            grads_and_vars = tower_grads_and_vars[0]
            inputs = tower_inputs[0]
            hidden_states = tower_hidden_states[0]
            logits = tower_logits[0]

        # Summaries
        merged = tf.summary.merge_all()

        ## get train op
        train_op, learning_rate, gnorm = model_utils.get_train_op(
            FLAGS, None, grads_and_vars=grads_and_vars)
        global_step = tf.train.get_global_step()

        ##### Training loop
        saver = tf.train.Saver(max_to_keep=FLAGS.max_save)

        gpu_options = tf.GPUOptions(allow_growth=True)

        #### load pretrained models
        model_utils.init_from_checkpoint(FLAGS, global_vars=True)

        writer = tf.summary.FileWriter(logdir=FLAGS.model_dir,
                                       graph=tf.get_default_graph())
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())

            #########
            ##### PYTORCH
            import torch
            from torch.optim import Adam
            from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam

            # Save the just-initialised TF variables so the PyTorch model
            # below can be loaded from the very same checkpoint.
            save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME + '-00')
            saver.save(sess, save_path)
            tf.logging.info("Model saved in path: {}".format(save_path))

            # NOTE(review): the CUDA device index (4), the DataParallel ids
            # (4-7), the 'xlnet-large-cased' name and the STS-B/num_labels=1
            # config are hard-coded regardless of FLAGS.task_name.
            device = torch.device("cuda", 4)
            config = XLNetConfig.from_pretrained('xlnet-large-cased',
                                                 finetuning_task=u'sts-b',
                                                 num_labels=1)
            tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

            # pt_model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=1)
            pt_model = XLNetForSequenceClassification.from_pretrained(
                save_path, from_tf=True, config=config)
            pt_model.to(device)
            pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7])

            # The lr given here is only a placeholder: it is overwritten on
            # every step with the learning rate fetched from the TF graph.
            optimizer = Adam(pt_model.parameters(),
                             lr=0.001,
                             betas=(0.9, 0.999),
                             eps=FLAGS.adam_epsilon,
                             weight_decay=FLAGS.weight_decay,
                             amsgrad=False)
            # optimizer = BertAdam(pt_model.parameters(), lr=FLAGS.learning_rate, t_total=FLAGS.train_steps, warmup=FLAGS.warmup_steps / FLAGS.train_steps,
            #                      eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay)
            ##### PYTORCH
            #########

            # Order matters: the unpacking of `fetched` below relies on it.
            fetches = [
                loss, global_step, gnorm, learning_rate, train_op, merged,
                inputs, hidden_states, logits
            ]

            total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0
            total_logits = None
            total_labels = None
            while True:
                # NOTE(review): feed_dict is never passed to sess.run.
                feed_dict = {}
                # for i in range(FLAGS.num_core_per_host):
                #   for key in tower_mems_np[i].keys():
                #     for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]):
                #       feed_dict[m] = m_np

                # One TF training step (train_op is among the fetches).
                fetched = sess.run(fetches)

                loss_np, curr_step, gnorm_np, learning_rate_np, _, summary_np, inputs_np, hidden_states_np, logits_np = fetched
                total_loss += loss_np

                # Accumulate all TF logits/labels seen so far.
                # NOTE(review): these arrays grow every step and are not read
                # again in the visible part of the training branch.
                if total_logits is None:
                    total_logits = logits_np
                    total_labels = inputs_np['label_ids']
                else:
                    total_logits = np.append(total_logits, logits_np, axis=0)
                    total_labels = np.append(total_labels,
                                             inputs_np['label_ids'],
                                             axis=0)

                #########
                ##### PYTORCH
                # Feed the PyTorch model exactly the batch TF just consumed.
                f_inp = torch.tensor(inputs_np["input_ids"],
                                     dtype=torch.long,
                                     device=device)
                f_seg_id = torch.tensor(inputs_np["segment_ids"],
                                        dtype=torch.long,
                                        device=device)
                f_inp_mask = torch.tensor(inputs_np["input_mask"],
                                          dtype=torch.float,
                                          device=device)
                f_label = torch.tensor(inputs_np["label_ids"],
                                       dtype=torch.float,
                                       device=device)

                # with torch.no_grad():
                #   _, hidden_states_pt, _ = pt_model.transformer(f_inp, f_seg_id, f_inp_mask)
                # logits_pt, _ = pt_model(f_inp, token_type_ids=f_seg_id, input_mask=f_inp_mask)

                pt_model.train()
                outputs = pt_model(f_inp,
                                   token_type_ids=f_seg_id,
                                   input_mask=f_inp_mask,
                                   labels=f_label)
                loss_pt = outputs[0]
                # The model is wrapped in DataParallel above; reduce the
                # returned loss to a scalar before backward().
                loss_pt = loss_pt.mean()
                total_loss_pt += loss_pt.item()

                # # hidden_states_pt = list(t.detach().cpu().numpy() for t in hidden_states_pt)
                # # special_pt = special_pt.detach().cpu().numpy()

                # # Optimizer pt
                pt_model.zero_grad()
                loss_pt.backward()
                gnorm_pt = torch.nn.utils.clip_grad_norm_(
                    pt_model.parameters(), FLAGS.clip)
                # Mirror the TF learning-rate schedule in the PyTorch optimizer.
                for param_group in optimizer.param_groups:
                    param_group['lr'] = learning_rate_np
                optimizer.step()
                ##### PYTORCH
                #########

                if curr_step > 0 and curr_step % FLAGS.log_step_count_steps == 0:
                    # Mean loss over the steps since the previous log line.
                    curr_loss = total_loss / (curr_step - prev_step)
                    curr_loss_pt = total_loss_pt / (curr_step - prev_step)
                    tf.logging.info(
                        "[{}] | gnorm {:.2f} lr {:8.6f} "
                        "| loss {:.2f} | pplx {:>7.2f}, bpc {:>7.4f}".format(
                            curr_step, gnorm_np, learning_rate_np, curr_loss,
                            math.exp(curr_loss), curr_loss / math.log(2)))

                    #########
                    ##### PYTORCH
                    tf.logging.info(
                        "  PT [{}] | gnorm PT {:.2f} lr PT {:8.6f} "
                        "| loss PT {:.2f} | pplx PT {:>7.2f}, bpc PT {:>7.4f}".
                        format(curr_step, gnorm_pt, learning_rate_np,
                               curr_loss_pt, math.exp(curr_loss_pt),
                               curr_loss_pt / math.log(2)))
                    ##### PYTORCH
                    #########

                    total_loss, total_loss_pt, prev_step = 0., 0., curr_step
                    writer.add_summary(summary_np, global_step=curr_step)

                # NOTE(review): the flag validation above allows
                # FLAGS.save_steps to be None, in which case this modulo
                # raises TypeError - confirm the flag always has a value.
                if curr_step > 0 and curr_step % FLAGS.save_steps == 0:
                    save_path = os.path.join(FLAGS.model_dir,
                                             "model.ckpt-{}".format(curr_step))
                    saver.save(sess, save_path)
                    tf.logging.info(
                        "Model saved in path: {}".format(save_path))

                    #########
                    ##### PYTORCH
                    # Save a trained model, configuration and tokenizer
                    model_to_save = pt_model.module if hasattr(
                        pt_model,
                        'module') else pt_model  # Only save the model it-self
                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_dir = os.path.join(
                        FLAGS.output_dir, "pytorch-ckpt-{}".format(curr_step))
                    if not tf.gfile.Exists(output_dir):
                        tf.gfile.MakeDirs(output_dir)
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    tf.logging.info(
                        "PyTorch Model saved in path: {}".format(output_dir))
                    ##### PYTORCH
                    #########

                # Only exit of the training loop.
                if curr_step >= FLAGS.train_steps:
                    break

    if FLAGS.do_eval:
        # TPU requires a fixed batch size for all batches, therefore the number
        # of examples must be a multiple of the batch size, or else examples
        # will get dropped. So we pad with fake examples which are ignored
        # later on. These do NOT count towards the metric (all tf.metrics
        # support a per-instance weight, and these get a weight of 0.0).
        #
        # Modified in XL: We also adopt the same mechanism for GPUs.
        # NOTE(review): eval_examples is never assigned in this function;
        # unless it exists as a module-level name this raises NameError.
        while len(eval_examples) % FLAGS.eval_batch_size != 0:
            eval_examples.append(PaddingInputExample())

        eval_file_base = "{}.len-{}.{}.eval.tf_record".format(
            spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
        eval_file = os.path.join(FLAGS.output_dir, eval_file_base)

        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenize_fn, eval_file)

        assert len(eval_examples) % FLAGS.eval_batch_size == 0
        eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        # NOTE(review): eval_input_fn and eval_steps are built but not used
        # in the visible remainder of this branch.
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=True)

        # NOTE(review): gpu_options and every tensor in `fetches` below are
        # only assigned inside the do_train branch, so this branch cannot run
        # with do_eval alone.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, gpu_options=gpu_options)) as sess:
            sess.run(tf.global_variables_initializer())

            ########################### LOAD PT model
            #   import torch
            #   from pytorch_transformers import CONFIG_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, XLNetTokenizer, XLNetConfig, XLNetForSequenceClassification, BertAdam

            #   save_path = os.path.join(FLAGS.model_dir, TF_WEIGHTS_NAME)
            #   saver.save(sess, save_path)
            #   tf.logging.info("Model saved in path: {}".format(save_path))

            #   device = torch.device("cuda", 4)
            #   config = XLNetConfig.from_pretrained('xlnet-large-cased', finetuning_task=u'sts-b', num_labels=1)
            #   tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
            #   config_path = os.path.join(FLAGS.model_dir, CONFIG_NAME)
            #   config.to_json_file(config_path)
            #   # pt_model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased', num_labels=1)
            #   pt_model = XLNetForSequenceClassification.from_pretrained(FLAGS.model_dir, from_tf=True)
            #   pt_model.to(device)
            #   pt_model = torch.nn.DataParallel(pt_model, device_ids=[4, 5, 6, 7])
            #   from torch.optim import Adam
            #   optimizer = Adam(pt_model.parameters(), lr=0.001, betas=(0.9, 0.999),
            #                    eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay,
            #                    amsgrad=False)
            #   optimizer = BertAdam(pt_model.parameters(), lr=FLAGS.learning_rate, t_total=FLAGS.train_steps, warmup=FLAGS.warmup_steps / FLAGS.train_steps,
            #                        eps=FLAGS.adam_epsilon, weight_decay=FLAGS.weight_decay)

            ##### PYTORCH
            #########

            # NOTE(review): train_op is part of these fetches, so every
            # iteration of the loop below also applies a training update on
            # the training input pipeline rather than evaluating eval_file.
            fetches = [
                loss, global_step, gnorm, learning_rate, train_op, merged,
                inputs, hidden_states, logits
            ]

            total_loss, total_loss_pt, prev_step, gnorm_pt = 0., 0., -1, 0.0
            total_logits = None
            total_labels = None
            # NOTE(review): no break/return is visible in this loop; it only
            # ends when sess.run raises.
            while True:
                feed_dict = {}
                # for i in range(FLAGS.num_core_per_host):
                #   for key in tower_mems_np[i].keys():
                #     for m, m_np in zip(tower_mems[i][key], tower_mems_np[i][key]):
                #       feed_dict[m] = m_np

                fetched = sess.run(fetches)

                loss_np, curr_step, gnorm_np, learning_rate_np, _, summary_np, inputs_np, hidden_states_np, logits_np = fetched
                total_loss += loss_np

                if total_logits is None:
                    total_logits = logits_np
                    total_labels = inputs_np['label_ids']
                else:
                    total_logits = np.append(total_logits, logits_np, axis=0)
                    total_labels = np.append(total_labels,
                                             inputs_np['label_ids'],
                                             axis=0)
def train_model(args):
    """Fine-tune a BERT or XLNet sequence classifier, checkpointing each epoch.

    The pickled tensor datasets named in ``args.dataset_name`` are
    concatenated and used to train a 126-label classifier with AdamW and a
    linear warmup schedule. After every epoch the model and ``args`` are
    written under ``path_model``.

    Args:
        args: Run configuration. Attributes read here: ``seed``,
            ``model_type`` ("bert" or "xlnet"), ``dataset_name`` (iterable of
            names), ``batch_size``, ``gpu_device_ids``, ``n_gpu``,
            ``learning_rate``, ``epoch_num``, ``gradient_accumulation_steps``,
            ``warmup_proportion`` and ``model_name``.

    Raises:
        ValueError: If ``args.model_type`` is neither "bert" nor "xlnet".
    """
    set_seed(args.seed)

    train_dataset = [
        path_tensor_dataset / f"{args.model_type}_{x}.pkl"
        for x in args.dataset_name
    ]
    train_dataset = [pickle_load(x) for x in train_dataset]
    train_dataset = ConcatDataset(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  pin_memory=True,
                                  num_workers=4,
                                  shuffle=True)

    if args.model_type == "bert":
        model = BertForSequenceClassification.from_pretrained(path_bert_model,
                                                              num_labels=126)
    elif args.model_type == "xlnet":
        model = XLNetForSequenceClassification.from_pretrained(
            path_xlnet_model, num_labels=126)
    else:
        raise ValueError(
            f"Unsupported model_type {args.model_type!r}; "
            "expected 'bert' or 'xlnet'.")
    model.zero_grad()
    model = model.cuda(args.gpu_device_ids[0])
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model, device_ids=args.gpu_device_ids)

    # Biases and LayerNorm weights are excluded from weight decay.
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [{
        "params": [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.01
    }, {
        "params": [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

    # Number of optimizer updates over the whole run (one per
    # gradient_accumulation_steps batches).
    total_steps = len(
        train_dataloader) * args.epoch_num // args.gradient_accumulation_steps
    warmup_steps = int(total_steps * args.warmup_proportion)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=total_steps)

    # Counts batches processed so far, across epochs.
    global_step = 0
    train_iterator = trange(int(args.epoch_num), desc="Epoch")
    for i in train_iterator:
        epoch = i + 1
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for batch in epoch_iterator:
            model.train()
            batch = tuple(x.cuda(args.gpu_device_ids[0]) for x in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            if args.n_gpu > 1:
                # DataParallel returns one loss per replica.
                loss = loss.mean()
            loss.backward()
            # Count the batch first, then step once every
            # gradient_accumulation_steps batches. Testing the counter before
            # incrementing it would step right after the very first batch and
            # yield one more update than `total_steps` accounts for.
            global_step += 1
            if global_step % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()

        output_dir = f"{args.model_type}_{args.model_name}/checkpoint_epoch{epoch}"
        output_dir = path_model / output_dir
        output_dir.mkdir(parents=True, exist_ok=True)
        # Unwrap DataParallel so the checkpoint loads without the wrapper.
        model_to_save = model.module if hasattr(model, "module") else model
        model_to_save.save_pretrained(output_dir)
        pickle_save(args, os.path.join(output_dir, "training_args.pkl"))
Beispiel #16
0
class XLNetSequenceClassifier:
    """XLNet-based sequence classifier"""
    def __init__(
        self,
        language=Language.ENGLISHCASED,
        num_labels=5,
        cache_dir=".",
        num_gpus=None,
        num_epochs=1,
        batch_size=8,
        lr=5e-5,
        adam_eps=1e-8,
        warmup_steps=0,
        weight_decay=0.0,
        max_grad_norm=1.0,
    ):
        """Initializes the classifier and loads the underlying pretrained model.

        Args:
            language (Language, optional): The pretrained model's language;
                its ``value`` is used as the pretrained model name.
                Defaults to ``Language.ENGLISHCASED``.
            num_labels (int, optional): The number of unique labels in the
                training data. Defaults to 5.
            cache_dir (str, optional): Location of XLNet's cache directory.
                Defaults to ".".
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            num_epochs (int, optional): Number of training epochs.
                Defaults to 1.
            batch_size (int, optional): Training batch size. Defaults to 8.
            lr (float): Learning rate of the Adam optimizer. Defaults to 5e-5.
            adam_eps (float, optional): term added to the denominator to improve
                                        numerical stability. Defaults to 1e-8.
            warmup_steps (int, optional): Number of warmup steps handed to the
                learning-rate scheduler. Defaults to 0.
            weight_decay (float, optional): Weight decay. Defaults to 0.
            max_grad_norm (float, optional): Maximum norm for the gradients.
                Defaults to 1.0.

        Raises:
            ValueError: If ``num_labels`` is smaller than 2.
        """

        if num_labels < 2:
            raise ValueError("Number of labels should be at least 2.")

        self.language = language
        self.num_labels = num_labels
        self.cache_dir = cache_dir

        self.num_gpus = num_gpus
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.adam_eps = adam_eps
        self.warmup_steps = warmup_steps
        self.weight_decay = weight_decay
        self.max_grad_norm = max_grad_norm

        # create classifier
        self.config = XLNetConfig.from_pretrained(self.language.value,
                                                  num_labels=num_labels,
                                                  cache_dir=cache_dir)
        # Load the pretrained weights. Calling the constructor with only the
        # config would build the architecture with randomly initialised
        # parameters, so fine-tuning would start from scratch.
        self.model = XLNetForSequenceClassification.from_pretrained(
            self.language.value,
            config=self.config,
            cache_dir=cache_dir)

    def fit(
        self,
        token_ids,
        input_mask,
        labels,
        val_token_ids,
        val_input_mask,
        val_labels,
        token_type_ids=None,
        val_token_type_ids=None,
        verbose=True,
        logging_steps=0,
        save_steps=0,
        val_steps=0,
    ):
        """Fine-tunes the XLNet classifier using the given training data.

        Args:
            token_ids (list): List of training token id lists.
            input_mask (list): List of input mask lists.
            labels (list): List of training labels.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks.
            verbose (bool, optional): If True, shows the training progress and
                loss values. Defaults to True.
        """

        device = get_device("cpu" if self.num_gpus == 0
                            or not torch.cuda.is_available() else "gpu")
        self.model = move_to_device(self.model, device, self.num_gpus)

        token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
        input_mask_tensor = torch.tensor(input_mask, dtype=torch.long)
        labels_tensor = torch.tensor(labels, dtype=torch.long)

        val_token_ids_tensor = torch.tensor(val_token_ids, dtype=torch.long)
        val_input_mask_tensor = torch.tensor(val_input_mask, dtype=torch.long)
        val_labels_tensor = torch.tensor(val_labels, dtype=torch.long)

        if token_type_ids:
            token_type_ids_tensor = torch.tensor(token_type_ids,
                                                 dtype=torch.long)
            val_token_type_ids_tensor = torch.tensor(val_token_type_ids,
                                                     dtype=torch.long)

            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor,
                                          token_type_ids_tensor, labels_tensor)

            val_dataset = TensorDataset(
                val_token_ids_tensor,
                val_input_mask_tensor,
                val_token_type_ids_tensor,
                val_labels_tensor,
            )

        else:

            train_dataset = TensorDataset(token_ids_tensor, input_mask_tensor,
                                          labels_tensor)

            val_dataset = TensorDataset(val_token_ids_tensor,
                                        val_input_mask_tensor,
                                        val_labels_tensor)

        # define optimizer and model parameters
        param_optimizer = list(self.model.named_parameters())
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.weight_decay,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        val_sampler = RandomSampler(val_dataset)

        val_dataloader = DataLoader(val_dataset,
                                    sampler=val_sampler,
                                    batch_size=self.batch_size)

        num_examples = len(token_ids)
        num_batches = int(np.ceil(num_examples / self.batch_size))
        num_train_optimization_steps = num_batches * self.num_epochs

        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.lr,
                          eps=self.adam_eps)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=self.warmup_steps,
                                         t_total=num_train_optimization_steps)

        global_step = 0
        self.model.train()
        optimizer.zero_grad()
        for epoch in range(self.num_epochs):

            train_sampler = RandomSampler(train_dataset)

            train_dataloader = DataLoader(train_dataset,
                                          sampler=train_sampler,
                                          batch_size=self.batch_size)

            tr_loss = 0.0
            logging_loss = 0.0
            val_loss = 0.0

            for i, batch in enumerate(tqdm(train_dataloader,
                                           desc="Iteration")):
                if token_type_ids:
                    x_batch, mask_batch, token_type_ids_batch, y_batch = tuple(
                        t.to(device) for t in batch)
                else:
                    token_type_ids_batch = None
                    x_batch, mask_batch, y_batch = tuple(
                        t.to(device) for t in batch)

                outputs = self.model(
                    input_ids=x_batch,
                    token_type_ids=token_type_ids_batch,
                    attention_mask=mask_batch,
                    labels=y_batch,
                )

                loss = outputs[
                    0]  # model outputs are always tuple in pytorch-transformers

                loss.sum().backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.max_grad_norm)

                tr_loss += loss.sum().item()
                optimizer.step()
                # Update learning rate schedule
                scheduler.step()
                optimizer.zero_grad()
                global_step += 1
                # logging of learning rate and loss
                if logging_steps > 0 and global_step % logging_steps == 0:
                    mlflow.log_metric("learning rate",
                                      scheduler.get_lr()[0],
                                      step=global_step)
                    mlflow.log_metric(
                        "training loss",
                        (tr_loss - logging_loss) /
                        (logging_steps * self.batch_size),
                        step=global_step,
                    )
                    logging_loss = tr_loss
                # model checkpointing
                if save_steps > 0 and global_step % save_steps == 0:
                    checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
                    if not os.path.isdir(checkpoint_dir):
                        os.makedirs(checkpoint_dir)
                    checkpoint_path = checkpoint_dir + "/" + str(
                        global_step) + ".pth"
                    torch.save(self.model.state_dict(), checkpoint_path)
                    mlflow.log_artifact(checkpoint_path)
                # model validation
                if val_steps > 0 and global_step % val_steps == 0:
                    # run model on validation set
                    self.model.eval()
                    val_loss = 0.0
                    for j, val_batch in enumerate(val_dataloader):
                        if token_type_ids:
                            val_x_batch, val_mask_batch, val_token_type_ids_batch, \
                            val_y_batch = tuple(
                                t.to(device) for t in val_batch
                            )
                        else:
                            token_type_ids_batch = None
                            val_x_batch, val_mask_batch, val_y_batch = tuple(
                                t.to(device) for t in val_batch)
                        val_outputs = self.model(
                            input_ids=val_x_batch,
                            token_type_ids=val_token_type_ids_batch,
                            attention_mask=val_mask_batch,
                            labels=val_y_batch,
                        )
                        vloss = val_outputs[0]
                        val_loss += vloss.sum().item()
                    mlflow.log_metric("validation loss",
                                      val_loss / len(val_dataset),
                                      step=global_step)
                    self.model.train()

                if verbose:
                    if i % ((num_batches // 10) + 1) == 0:
                        if val_loss > 0:
                            print(
                                "epoch:{}/{}; batch:{}->{}/{}; average training loss:{:.6f};\
                                 average val loss:{:.6f}".format(
                                    epoch + 1,
                                    self.num_epochs,
                                    i + 1,
                                    min(i + 1 + num_batches // 10,
                                        num_batches),
                                    num_batches,
                                    tr_loss / (i + 1),
                                    val_loss / (j + 1),
                                ), )
                        else:
                            print(
                                "epoch:{}/{}; batch:{}->{}/{}; average train loss:{:.6f}"
                                .format(
                                    epoch + 1,
                                    self.num_epochs,
                                    i + 1,
                                    min(i + 1 + num_batches // 10,
                                        num_batches),
                                    num_batches,
                                    tr_loss / (i + 1),
                                ))
        checkpoint_dir = os.path.join(os.getcwd(), "checkpoints")
        if not os.path.isdir(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        checkpoint_path = checkpoint_dir + "/" + "final" + ".pth"
        torch.save(self.model.state_dict(), checkpoint_path)
        mlflow.log_artifact(checkpoint_path)
        # empty cache
        del [x_batch, y_batch, mask_batch, token_type_ids_batch]
        if val_steps > 0:
            del [
                val_x_batch, val_y_batch, val_mask_batch,
                val_token_type_ids_batch
            ]
        torch.cuda.empty_cache()

    def predict(
        self,
        token_ids,
        input_mask,
        token_type_ids=None,
        num_gpus=None,
        batch_size=8,
        probabilities=False,
    ):
        """Scores the given dataset and returns the predicted classes.

        Args:
            token_ids (list): List of token id lists to score.
            input_mask (list): List of input mask lists.
            token_type_ids (list, optional): List of lists. Each sublist
                contains segment ids indicating if the token belongs to
                the first sentence(0) or second sentence(1). Only needed
                for two-sentence tasks. When None (or empty), no segment
                ids are passed to the model. Defaults to None.
            num_gpus (int, optional): The number of gpus to use.
                                      If None is specified, all available GPUs
                                      will be used. Defaults to None.
            batch_size (int, optional): Scoring batch size. Defaults to 8.
            probabilities (bool, optional):
                If True, the predicted probability distribution
                is also returned. Defaults to False.
        Returns:
            1darray, namedtuple(1darray, ndarray): Predicted classes or
                (classes, probabilities) if probabilities is True.
        """

        device = get_device(
            "cpu" if num_gpus == 0 or not torch.cuda.is_available() else "gpu")
        self.model = move_to_device(self.model, device, num_gpus)

        self.model.eval()
        preds = []

        # Segment ids are optional; treat an empty container like None so the
        # behaviour matches the truthiness check used by the training loop.
        has_token_types = (token_type_ids is not None
                           and len(token_type_ids) > 0)
        num_examples = len(token_ids)

        with tqdm(total=num_examples) as pbar:
            for start in range(0, num_examples, batch_size):
                end = start + batch_size
                x_batch = torch.tensor(token_ids[start:end],
                                       dtype=torch.long,
                                       device=device)
                mask_batch = torch.tensor(input_mask[start:end],
                                          dtype=torch.long,
                                          device=device)

                if has_token_types:
                    token_type_ids_batch = torch.tensor(
                        token_type_ids[start:end],
                        dtype=torch.long,
                        device=device)
                else:
                    token_type_ids_batch = None

                with torch.no_grad():
                    pred_batch = self.model(
                        input_ids=x_batch,
                        token_type_ids=token_type_ids_batch,
                        attention_mask=mask_batch,
                        labels=None,
                    )
                    # With labels=None the first output holds the logits.
                    preds.append(pred_batch[0].cpu().numpy())

                # Advance by the real batch length so the bar never
                # overshoots its total on a short final batch.
                pbar.update(min(end, num_examples) - start)

            preds = np.concatenate(preds)

            if probabilities:
                return namedtuple("Predictions", "classes probabilities")(
                    preds.argmax(axis=1),
                    nn.Softmax(dim=1)(torch.Tensor(preds)).numpy())
            else:
                return preds.argmax(axis=1)