Example no. 1
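    # Evaluation callback: _locals['self'] is the model being trained. Every
    # eval_save_period steps it evaluates the model on eval_env; when is_save
    # is set it keeps the best-returning checkpoint and appends the evaluation
    # state history to the CSV at rets_path.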
    def callback(_locals, _globals):
        nonlocal n_callbacks, best_ret
        model = _locals['self']
        n_callbacks += 1
        #total_steps = model.num_timesteps + (timesteps)*num_trains
        total_steps = n_callbacks * model.n_steps
        print("total steps: ", total_steps)

        # Saving best model
        if total_steps % eval_save_period == 0:
            start_eval_time = time.time()
            if is_save:
                ret, std, total_rets, state_history = evaluate(model, eval_env, render=False)
                #model.save(os.path.join(experiment_name, 'model_{}_{}.pkl'.format(total_steps, ret)))
                if ret > best_ret:
                    print("Saving new best model")
                    model.save(os.path.join(experiment_name, 'best_model_{}_{}.pkl'.format(total_steps, ret)))
                    best_ret = ret
                #wandb.log({"eval_ret": ret}, step=total_steps)
                state_history = list(state_history)
                line = [total_steps] + state_history
                with open(rets_path, "a", newline="") as f:
                    writer = csv.writer(f)
                    writer.writerow(line)
            else:
                ret, std, total_rets, _ = evaluate(model, eval_env, render=False)
        return True
Example no. 2
def main(_):
    max_repeat = 3
    for _ in range(max_repeat):
        train()
        print('start evaluation...')
        evaluate()
Example no. 3
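# Loads a trained Seq2seq checkpoint and collects, for every validation example
# (and optionally a random training subset of the same size), the state scores
# ('beta'), attention weights ('alpha') and optional gradient-influence scores
# ('grad'), alongside perplexity/accuracy/BLEU summaries in meta_stats.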
def dump_interpret(model_path, full_model, invasive_uniform, eval_bleu, dataset, include_train_subset, grad_bsize, calculate_grad):
    print('interpreting %s' % model_path)
    meta_stats = {}

    training_data, validation_data, vocab = load_dataset_by_name(dataset)

    pad_id = vocab.PieceToId("<pad>")
    bos_id = vocab.PieceToId("<s>")
    eos_id = vocab.PieceToId("</s>")

    val_data_manager = StateManager(validation_data, vocab, bos_id, eos_id, pad_id, device, model_config)
    train_data_manager = StateManager(training_data, vocab, bos_id, eos_id, pad_id, device, model_config)
    VOCAB_SIZE = vocab.GetPieceSize()

    model = Seq2seq(device=device, hidden_dim=HIDDEN_DIM, vocab_size=VOCAB_SIZE, num_layers=NUM_LAYERS, dropout=0,
                    attn_lambda=0.0, pad_id=pad_id, full_model=full_model, invasive_uniform=invasive_uniform).to(device)
    model.load_state_dict(torch.load(model_path))

    if not full_model:
        state_scores_val = get_state_scores(model, val_data_manager)
    else:
        state_scores_val = get_state_scores2(model, val_data_manager)
    if calculate_grad:
        grad_influence_val = get_grad_influence2(model, val_data_manager, grad_bsize)

    perplexity_val, acc_val, attn_val = evaluate_next_token(model, val_data_manager)
    meta_stats['val_acc'] = acc_val
    meta_stats['val_perplexity'] = perplexity_val

    if eval_bleu:
        bleu_val = evaluate(model, val_data_manager, method='beam')
        meta_stats['val_bleu'] = bleu_val

    if include_train_subset:
        random.seed(1)
        train_idxs = random.sample(range(len(train_data_manager.dataset)), k=len(val_data_manager.dataset))
        inverse_train_idx_map = {train_idxs[i]: i for i in range(len(train_idxs))}
        eval_train = StateManager([train_data_manager.dataset[idx] for idx in train_idxs], vocab, bos_id, eos_id, pad_id, device, model_config)
        if not full_model:
            state_scores_train = get_state_scores(model, eval_train)
        else:
            state_scores_train = get_state_scores2(model, eval_train)
        if calculate_grad:
            grad_influence_train = get_grad_influence2(model, eval_train, grad_bsize)

        perplexity_train, acc_train, attn_train = evaluate_next_token(model, eval_train)
        meta_stats['train_acc'] = acc_train
        meta_stats['train_perplexity'] = perplexity_train

        if eval_bleu:
            bleu_train = evaluate(model, eval_train, method='beam')
            meta_stats['train_bleu'] = bleu_train

    items = []
    for i in range(len(val_data_manager.dataset)):
        curr_dict = {}
        curr_dict['split'] = 'val'
        curr_dict['src'] = sentence2ids_nopad(val_data_manager, val_data_manager.dataset[i].src, additional_eos=False)
        curr_dict['trg'] = sentence2ids_nopad(val_data_manager, val_data_manager.dataset[i].trg, additional_eos=False)
        curr_dict['beta'] = state_scores_val[i]
        curr_dict['alpha'] = attn_val[i]
        if calculate_grad:
            curr_dict['grad'] = grad_influence_val[i]
        else:
            curr_dict['grad'] = []

        items.append(curr_dict)

    if include_train_subset:
        train_idxs_set = set(train_idxs)
        for i in range(len(train_data_manager.dataset)):
            curr_dict = {}
            curr_dict['split'] = 'train'
            curr_dict['src'] = sentence2ids_nopad(train_data_manager, train_data_manager.dataset[i].src, additional_eos=False)
            curr_dict['trg'] = sentence2ids_nopad(train_data_manager, train_data_manager.dataset[i].trg, additional_eos=False)
            if i in train_idxs_set:
                curr_dict['beta'] = state_scores_train[inverse_train_idx_map[i]]
                curr_dict['alpha'] = attn_train[inverse_train_idx_map[i]]
                if calculate_grad:
                    curr_dict['grad'] = grad_influence_train[inverse_train_idx_map[i]]
                else:
                    curr_dict['grad'] = []
            else:
                curr_dict['beta'] = None
                curr_dict['alpha'] = None
                curr_dict['grad'] = None

            items.append(curr_dict)

    return items, meta_stats
Example no. 4
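# Training loop for the attention-based captioning model: iterates over
# mini-batches, logs per-epoch losses to a dated file under logs/VIDCAP_ATT/,
# pickles the parameters whenever the mean epoch loss improves, and keeps the
# checkpoint with the best validation METEOR/Bleu_4.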
def do_train_exp():
    print('Training mp_lstm Network')
    print('Experimental settings:')
    print('NUMBER EPOCH: %d' % CFG['NUM_EPOCH'])
    print('BATCH SIZE: %d' % CFG['BATCH_SIZE'])
    print(40 * '-')

    print(40 * '-')
    print('build model...')
    model, exp_func = exp_model()
    print('model ok')
    print(40 * '*')
    num_samples = len(CFG['TRAIN'])
    num_batch = num_samples // CFG['BATCH_SIZE']
    best_loss = np.inf
    acc = []
    best_m = 0.
    best_b = 0.
    #    print 'training...'
    for iepoch in np.arange(CFG['NUM_EPOCH']):

        epoch_loss = 0.
        epoch_acc = 0.
        epoch_att_loss = 0.
        epoch_sem_loss = 0.
        for ibatch in np.arange(num_batch):
            batch_idx = CFG['TRAIN'][ibatch * CFG['BATCH_SIZE']:(ibatch + 1) *
                                     CFG['BATCH_SIZE']]
            batch_data, batch_words, gt_words, mask, k_words = data.get_batch_data(
                batch_idx, feat_type='ResNet')
            predict_words = np.reshape(gt_words, (-1, ))
            #    print batch_data.shape
            #    print batch_words.shape
            #    print mask.shape
            #    print predict_words.shape
            # print k_words.shape
            #    word2sent([batch_words[0]])
            # label2word([k_words[0]])
            print('forward and backward...')
            #            batch_loss,batch_acc,att,att_loss,sem_loss= exp_func['train func'](batch_data,batch_words,mask,predict_words,k_words)
            try:
                batch_loss, batch_acc, att, att_loss, sem_loss = exp_func[
                    'train func'](batch_data, batch_words, mask, predict_words,
                                  k_words)
            except Exception as e_data:
                print('Found Exception')
                print(e_data)
                continue
            print('%d epoch %d batch: loss %f att loss: %f sem loss: %f acc %f' %
                  (iepoch + 1, ibatch + 1, batch_loss, att_loss, sem_loss,
                   batch_acc))
            print('attention:')
            #            print att.shape
            print(40 * '*')
            print('attention values from all frames at first step')
            print(att[0, 0])
            print(40 * '*')
            print('attention values from all time steps at first frame')
            print(att[0, :, 0])
            epoch_loss += batch_loss
            epoch_acc += batch_acc
            epoch_att_loss += att_loss
            epoch_sem_loss += sem_loss
        epoch_loss /= num_batch
        epoch_acc /= num_batch
        epoch_att_loss /= num_batch
        epoch_sem_loss /= num_batch
        train_acc = epoch_acc
        logfile = open(
            'logs/VIDCAP_ATT/log_train_' +
            time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
              '\n epoch %d script train loss:%f att loss %f sem loss %f train acc:%f' %
              (iepoch + 1, epoch_loss, epoch_att_loss, epoch_sem_loss, train_acc),
              file=logfile)
        logfile.close()
        print('mean batch loss: %f' % epoch_loss)
        print('mean batch att loss: %f' % epoch_att_loss)
        print('mean batch sem loss: %f' % epoch_sem_loss)
        print('mean batch acc: %f' % epoch_acc)
        print(40 * '-')
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            print('found a better training result.')
            print('saving model')
            net_params = lasagne.layers.get_all_param_values(
                model[model['net name']]['word_prob'])
            modelfile = open('../models/sta_fg_params.pkl', 'wb')
            pickle.dump(net_params, modelfile)
            modelfile.close()
            if iepoch >= 0:
                print("let's validate the model")
                eval_res = eval_model.evaluate(exp_func['sent prob'],
                                               exp_func['key word'], 'ResNet',
                                               'valid')
                logfile = open(
                    'logs/VIDCAP_ATT/log_train_' +
                    time.strftime('%Y-%m-%d', time.localtime(time.time())),
                    'a+')
                print(time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time())) +
                      ' evaluation results\n', file=logfile)
                if eval_res['METEOR'] > best_m and eval_res['Bleu_4'] > best_b:
                    best_m = eval_res['METEOR']
                    best_b = eval_res['Bleu_4']
                    modelfile = open('../models/best_valided_stafg_params.pkl',
                                     'wb')
                    pickle.dump(net_params, modelfile)
                    modelfile.close()
                    print('found a better model!!!', file=logfile)
                print('validation results:', file=logfile)
                for metric, score in eval_res.items():
                    print('%s: %.3f' % (metric, score), end=' ', file=logfile)
                print('\n', file=logfile)
                logfile.close()
Example no. 5
                    best_m = eval_res['METEOR']
                    best_b = eval_res['Bleu_4']
                    modelfile = open('../models/best_valided_stafg_params.pkl',
                                     'wb')
                    pickle.dump(net_params, modelfile)
                    modelfile.close()
                    print('found a better model!!!', file=logfile)
                print('validation results:', file=logfile)
                for metric, score in eval_res.items():
                    print('%s: %.3f' % (metric, score), end=' ', file=logfile)
                print('\n', file=logfile)
                logfile.close()

    # testing model
    print('training done! Testing model...')
    eval_res = eval_model.evaluate(exp_func['sent prob'], exp_func['key word'],
                                   'ResNet', 'test')
    logfile = open(
        'logs/VIDCAP_ATT/log_test_' +
        time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
    print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
          ' testing results\n', file=logfile)
    for metric, score in eval_res.items():
        print('%s: %.3f' % (metric, score), end=' ', file=logfile)
    print('\n', file=logfile)
    logfile.close()
    print('DONE')


def word2sent(wordids):
    print('Captions:')
Example no. 6
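    # Builds a configuration-specific model directory, trains a full Seq2seq
    # model (optionally with invasive uniform attention), then reloads the
    # saved weights and reports the beam-search BLEU score on validation data.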
    if args.uniform:
        config_name = 'h_dim=%d,dropout=%f,b_size=%d,seed=%d,uniform' % (
            HIDDEN_DIM, DROPOUT, batch_size, seed)
    else:
        config_name = 'h_dim=%d,dropout=%f,b_size=%d,seed=%d,normal' % (
            HIDDEN_DIM, DROPOUT, batch_size, seed)

    model_path = ("models/%s/%s/" % (dataset, config_name))
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    model = Seq2seq(device=device,
                    hidden_dim=HIDDEN_DIM,
                    vocab_size=VOCAB_SIZE,
                    num_layers=NUM_LAYERS,
                    dropout=DROPOUT,
                    attn_lambda=0.0,
                    pad_id=pad_id,
                    full_model=True,
                    invasive_uniform=args.uniform).to(device)
    train(model,
          num_epochs,
          batch_size,
          os.path.join(model_path, 'model'),
          custom_saves=custom_saves)

    model.load_state_dict(torch.load(os.path.join(model_path, 'model')))
    print("BLEU score with beam search ",
          evaluate(model, val_data_manager, method='beam'))
Example no. 7
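# Training loop for the mean-pooling LSTM baseline: trains batch by batch,
# logs per-epoch loss/accuracy under logs/VIDCAP_MP/, pickles the parameters
# whenever the mean epoch loss improves, and runs validation from the third
# epoch onward.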
def do_train_exp():
    print('Training mp_lstm Network')
    print('Experimental settings:')
    print('NUMBER EPOCH: %d' % CFG['NUM_EPOCH'])
    print('BATCH SIZE: %d' % CFG['BATCH_SIZE'])
    print(40 * '-')

    print(40 * '-')
    print('build model...')
    model, exp_func = exp_model()
    print('model ok')
    print(40 * '*')
    num_samples = len(CFG['TRAIN'])
    num_batch = num_samples // CFG['BATCH_SIZE']
    best_loss = np.inf
    acc = []
    print('training...')
    #   eval_model.evaluate(exp_func['sent prob'])
    for iepoch in np.arange(CFG['NUM_EPOCH']):

        epoch_loss = 0.
        epoch_acc = 0.
        for ibatch in np.arange(num_batch):
            batch_idx = CFG['TRAIN'][ibatch * CFG['BATCH_SIZE']:(ibatch + 1) *
                                     CFG['BATCH_SIZE']]
            batch_data, batch_words, mask = data.get_batch_data(batch_idx)
            predict_words = np.reshape(batch_words, (-1, ))
            predict_words = predict_words[np.where(mask.flatten() == 1)]
            print(batch_data.shape)
            print(batch_words.shape)
            print(mask.shape)
            print(predict_words.shape)
            #            word2sent(batch_words)
            print('forward and backward...')
            batch_loss, batch_acc = exp_func['train func'](batch_data,
                                                           batch_words, mask,
                                                           predict_words)

            print('%d epoch %d batch: loss %f acc %f' %
                  (iepoch + 1, ibatch + 1, batch_loss, batch_acc))
            epoch_loss += batch_loss
            epoch_acc += batch_acc
        epoch_loss /= num_batch
        epoch_acc /= num_batch
        train_acc = epoch_acc
        acc.append(epoch_acc)
        logfile = open(
            'logs/VIDCAP_MP/log_msvd_mplstm_' +
            time.strftime('%Y-%m-%d', time.localtime(time.time())), 'a+')
        print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) +
              '\n script train loss:%f train acc:%f' % (epoch_loss, train_acc),
              file=logfile)
        logfile.close()
        print('mean batch loss: %f' % epoch_loss)
        print('mean batch acc: %f' % epoch_acc)
        print(40 * '-')
        if epoch_loss < best_loss:
            best_loss = epoch_loss
            print('found a better training result.')
            print('saving model')
            net_params = lasagne.layers.get_all_param_values(
                model[model['net name']]['word_prob'])
            modelfile = open('../models/VIDCAP_MP/msvd_mplstm_params.pkl',
                             'wb')
            pickle.dump(net_params, modelfile)
            modelfile.close()
            if iepoch >= 2:
                print("let's validate the model")
                eval_model.evaluate(exp_func['sent prob'])
    for i in np.arange(len(acc)):
        print('%d epoch acc %f' % (i + 1, acc[i]))
Example no. 8
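# Entry point for span-based NER fine-tuning: parses the arguments, sets up the
# device, seed and label maps, loads the BERT config/tokenizer/model, optionally
# trains, then evaluates the saved checkpoint(s) and writes eval_results.txt.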
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--task_name', default='NER', type=str)
    parser.add_argument('--data_dir', default='./datasets/cluener', type=str)
    parser.add_argument('--model_type', default='bert', type=str)
    parser.add_argument('--display', default='./display', type=str)
    parser.add_argument('--pretrain_model_path',
                        default='./pretrained_model/bert-base-uncased/',
                        type=str,
                        required=False)
    parser.add_argument('--output_dir', default='./output/', type=str)
    parser.add_argument('--markup',
                        default='bios',
                        type=str,
                        choices=['bios', 'bio'])
    parser.add_argument('--loss_type',
                        default='ghmc',
                        choices=['lsr', 'focal', 'ce', 'ghmc'])
    parser.add_argument('--max_seq_length', default=128, type=int)
    parser.add_argument("--do_lower_case", default=True)
    parser.add_argument('--do_train', default=True)
    parser.add_argument('--do_eval', default=True)
    parser.add_argument('--do_predict', default=False)
    parser.add_argument('--per_gpu_train_batch_size', default=128, type=int)
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1)
    parser.add_argument('--learning_rate', default=5e-5, type=float)
    parser.add_argument("--weight_decay", default=0.0, type=float)
    parser.add_argument('--adam_epsilon', default=1e-8, type=float)
    parser.add_argument('--max_grad_norm', default=1.0, type=float)
    parser.add_argument('--num_train_epochs', default=8.0, type=float)
    parser.add_argument('--warmup_steps', default=0, type=int)
    parser.add_argument('--logging_steps', type=int, default=50)
    parser.add_argument('--save_steps', type=int, default=50)
    parser.add_argument('--no_cuda', default=False)
    parser.add_argument('--overwrite_output_dir', default=True)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--fp16', default=True)
    parser.add_argument('--fp16_opt_level', type=str, default="O1")
    parser.add_argument('--local_rank', type=int, default=-1)
    parser.add_argument("--eval_count", type=int, default=0)

    args = parser.parse_args()

    if not os.path.exists(args.display):
        os.mkdir(args.display)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    args.output_dir = args.output_dir + f'{args.model_type}'

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir
    ) and args.do_train and not args.overwrite_output_dir:
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device('cuda', args.local_rank)
        args.n_gpu = 1
    args.device = device

    seed_everything(args.seed)
    process = DataProcess()
    label_list = process.get_labels()

    args.id2label = {i: label for i, label in enumerate(label_list)}
    args.label2id = {label: i for i, label in enumerate(label_list)}
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    config_class, model_class, tokenizer_class = BertConfig, BertSpanForNer, CNerTokenizer
    config = config_class.from_pretrained(args.pretrain_model_path,
                                          num_labels=num_labels,
                                          loss_type=args.loss_type,
                                          soft_label=True)
    tokenizer = tokenizer_class.from_pretrained(
        args.pretrain_model_path, do_lower_case=args.do_lower_case)
    model = model_class.from_pretrained(args.pretrain_model_path,
                                        config=config)

    model.to(args.device)

    writer = SummaryWriter(log_dir=args.display + '/' +
                           time.strftime('%m_%d_%H.%M', time.localtime()) +
                           '_' + str(args.loss_type))

    if args.do_train:
        train_dataset = load_examples(args, tokenizer, data_type='train')
        global_step, train_loss = train(args, train_dataset, model, tokenizer,
                                        writer)

        model_to_save = (model.module if hasattr(model, "module") else model)
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_vocabulary(args.output_dir)

        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = tokenizer_class.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]

        for checkpoint in checkpoints:
            global_step = checkpoint.split(
                "-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split(
                '/')[-1] if checkpoint.find('checkpoint') != -1 else ""
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, writer)
            if global_step:
                result = {
                    "{}_{}".format(global_step, k): v
                    for k, v in result.items()
                }
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f:
            for key in sorted(results.keys()):
                f.write("{} = {}\n".format(key, str(results[key])))
Example no. 9
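# Fine-tuning loop: AdamW with a linear warmup schedule, optional apex fp16 and
# gradient accumulation; evaluates every logging_steps optimizer updates,
# checkpoints every save_steps updates, and returns (global_step, mean loss).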
def train(args, train_dataset, model, tokenizer, writer):

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate_fn)

    train_total = len(
        train_dataloader
    ) // args.gradient_accumulation_steps * args.num_train_epochs
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=train_total)

    if os.path.isfile(os.path.join(
            args.pretrain_model_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.pretrain_model_path, "scheduler.pt")):
        optimizer.load_state_dict(
            torch.load(os.path.join(args.pretrain_model_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.pretrain_model_path, "scheduler.pt")))
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    print("***** Running training *****")

    global_step = 0
    steps_trained_in_current_epoch = 0

    if os.path.exists(args.pretrain_model_path
                      ) and "checkpoint" in args.pretrain_model_path:
        global_step = int(
            args.pretrain_model_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

    train_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    for _ in range(int(args.num_train_epochs)):
        pbar = ProgressBar(n_total=len(train_dataloader), desc='Training')
        for step, batch in enumerate(train_dataloader):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4]
            }

            inputs["token_type_ids"] = (batch[2] if args.model_type
                                        in ["bert"] else None)
            outputs = model(**inputs)
            loss = outputs[0]

            writer.add_scalar("Train_loss", loss.item(), step)

            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            pbar(step, {'loss': loss.item()})
            train_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    if args.local_rank == -1:
                        evaluate(args, model, tokenizer, writer)
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    tokenizer.save_vocabulary(output_dir)
                    print("Saving model checkpoint to %s", output_dir)
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))

        print(" ")
        if 'cuda' in str(args.device):
            torch.cuda.empty_cache()
    return global_step, train_loss / global_step