Example #1
0
def main():
    """Fine-tune a GPT-2 double-heads model on the RocStories cloze task.

    Command-line driven. Loads a pre-trained GPT-2 with three added special
    tokens, encodes the train/eval datasets, optionally trains with OpenAIAdam
    (LM loss + multiple-choice loss), saves the fine-tuned model/tokenizer to
    ``--output_dir``, and optionally evaluates multiple-choice accuracy.

    Requires at least one of ``--do_train`` / ``--do_eval``; ``--output_dir``
    is mandatory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='gpt2',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG (python, numpy, torch CPU and all GPUs) for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('{} is on use...'.format(device))
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name,
                                              special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    # model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model = GPT2DoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    #     GPT2DoubleHeadsModel.set_num_special_tokens(model, len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    # NOTE(review): `roc_stories` is downloaded here but never read below;
    # the datasets are loaded via load_rocstories_dataset instead — confirm
    # whether this download is still needed.
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        # Recurse into lists/tuples of strings and ints.
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    # Story and continuation each get up to half the position budget (minus
    # room for the 3 special tokens inserted by pre_process_datasets).
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    # Biases and LayerNorm parameters are excluded from weight decay.
    # NOTE(review): group 0 hard-codes weight_decay 0.01 even though
    # args.weight_decay is also passed to OpenAIAdam below — confirm intended.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                # losses[0] = language-modeling loss, losses[1] = multiple-choice loss.
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss (0.7 / 0.3 smoothing),
                # used only for the progress-bar display.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                # First call returns losses; second call (no labels) returns logits.
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss is averaged per step, accuracy per example.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune a GPT-2 LM head model on paired source/target text files.

    Command-line driven. Loads pre-trained GPT-2, builds batches via the
    project-level ``load_dataset`` helper, trains with gradient accumulation
    (stepping only after ``--effective_batch_size`` examples), evaluates every
    1000 batches via ``eval_model`` and checkpoints to ``--output_dir``
    whenever the eval loss improves, and samples from the model every 200
    batches via ``generate_from_model``.

    NOTE(review): this redefines ``main()`` and shadows the earlier definition
    in this file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default='tuned_gpt2', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')

    # Source/target text files for the eval and train splits.
    parser.add_argument('--source_eval', type=str, default='')
    parser.add_argument('--target_eval', type=str, default='')
    parser.add_argument('--source_train', type=str, default='')
    parser.add_argument('--target_train', type=str, default='')

    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=8)
    # An optimizer step is taken only after this many examples have been seen.
    parser.add_argument('--effective_batch_size', type=int, default=64)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--bsz', type=int, default=20)   # batch size passed to load_dataset
    parser.add_argument('--bptt', type=int, default=40)  # sequence length passed to load_dataset

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    model_type = 'gpt2'

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG (python, numpy, torch CPU and all GPUs) for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # FIX: fall back to CPU instead of unconditionally requiring CUDA
    # (the original hard-coded torch.device(type='cuda') and .cuda() calls,
    # which crash on CPU-only machines).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)

    bptt = args.bptt
    bsz = args.bsz

    # load_dataset returns (batches, label tensors, number of batches).
    batches_eval, labels_eval, nbatch_eval = load_dataset(args.source_eval, args.target_eval, tokenizer, bptt, bsz)
    batches_train, labels_train, nbatch_train = load_dataset(args.source_train, args.target_train, tokenizer, bptt, bsz)

    # Prepare optimizer: biases and LayerNorm parameters get no weight decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    num_train_optimization_steps = nbatch_train * args.num_train_epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    eval_loss_min = None  # best (lowest) eval loss seen so far

    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    model.train()
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0

        # FIX: this is the training loop, so label the progress bar "Training"
        # (the original said "Evaluating epoch ...").
        for i_batch in tqdm(list(range(nbatch_train)), desc='Training epoch {}'.format(epoch_i)):
            batch = batches_train[i_batch]

            batch = batch.to(device)
            lm_labels = labels_train[i_batch].to(device)
            if batch.numel() == 0:
                break

            # Compute the LM loss manually so positions whose label is -1
            # (masked-out positions) can be excluded from the average.
            loss_fct = CrossEntropyLoss(reduction='none')
            lm_logits, _ = model(batch)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = batch[:, 1:].contiguous()

            shift_labels_mask = (lm_labels[:, 1:].contiguous().view(-1) != -1).float()

            loss_mat = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                                shift_labels.view(-1))
            loss = (loss_mat * shift_labels_mask).view(-1).sum() / shift_labels_mask.sum()  # avg over non-masked indices

            loss.backward()

            # Gradient accumulation: only step the optimizer once
            # 'effective_batch_size' examples have been processed.
            if (i_batch * args.train_batch_size) % args.effective_batch_size == 0 and i_batch != 0:
                optimizer.step()
                optimizer.zero_grad()

            tr_loss += loss.item()

            # Exponential moving average of the loss, for display only.
            exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
            nb_tr_steps += 1

            ###
            # Evaluations
            ###

            if i_batch % 1000 == 0:  # get eval score
                eval_loss = eval_model(model, nbatch_eval, batches_eval, labels_eval, bsz)

                # If eval loss improves, checkpoint the model.
                if eval_loss_min is None or eval_loss < eval_loss_min:
                    eval_loss_min = eval_loss

                    model_to_save = model
                    # If we save using the predefined names, we can load using `from_pretrained`
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

                    torch.save(model_to_save.state_dict(), output_model_file)
                    to_json_file(model_to_save.config, output_config_file)

                # FIX: original was `print('eval_loss {}',format(eval_loss))` —
                # a comma instead of a dot, so `.format` was never applied and
                # print received two separate arguments.
                print('eval_loss {}'.format(eval_loss))
                model.train()

            if i_batch % 200 == 0:  # try generating from model
                print("Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0]))

                model.eval()
                if model_type == 'gpt':
                    encode = lambda a: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(a))
                    decode = tokenizer.decode
                elif model_type == 'gpt2':
                    encode = tokenizer.encode
                    decode = tokenizer.decode

                generate_from_model(encode, decode, model=model, model_type=model_type)
                model.train()
Example #3
0
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=8,
                gpu_id=0,
                lr=1e-4,
                load_dir='decoder_model',
                decoder_model='original_pretrained_model_for_bertGPT.pth'):
    """Train a BertGPT encoder-decoder model with gradient accumulation.

    Loads pre-trained weights from ``decoder_model``, trains on
    ``train_data.pth`` and validates on ``validate_data.pth`` (both expected
    in the working directory), reports per-epoch loss and validation
    perplexity, and saves a checkpoint per epoch under ``load_dir``.

    :param epochs: number of training epochs
    :param num_gradients_accumulation: batches to accumulate per optimizer step
    :param batch_size: dataloader batch size for both train and validation
    :param gpu_id: CUDA device index to train on (CUDA is required)
    :param load_dir: directory (relative to cwd) where checkpoints are written
    :param decoder_model: path of the pre-trained state dict to load
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    model.load_state_dict(torch.load(decoder_model))
    # model = nn.DataParallel(model, device_ids = [0])
    model = model.to(device)
    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = MyDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size,
                                  num_workers=2,
                                  collate_fn=collate_fn)
    # NOTE(review): the validation loader also shuffles; harmless for the
    # averaged perplexity below, but confirm it is intended.
    val_data = torch.load("validate_data.pth")
    val_dataset = MyDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size,
                                num_workers=2,
                                collate_fn=collate_fn)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    # Total optimizer steps = examples * epochs / (batch size * accumulation),
    # used by OpenAIAdam for its warmup schedule.
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    # Biases and LayerNorm parameters are excluded from weight decay; frozen
    # parameters (requires_grad == False) are excluded entirely.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay':
        0.0
    }]
    print('train')
    print(len(optimizer_grouped_parameters[0]['params']))

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    #------------------------END SET OPTIMIZER--------------

    #------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in tqdm(train_dataloader, desc='dirs'):
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            logits = model(encoder_input, mask_encoder_input, decoder_input,
                           mask_decoder_input)

            # Shift for next-token prediction: logits at position t predict
            # the decoder token at position t+1.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            # presumably the AllenNLP-style masked cross entropy; "token"
            # averages over unmasked tokens -- confirm against util's docs.
            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            # Gradient accumulation. NOTE(review): the comparison against
            # num_gradients_accumulation - 1 makes the first step happen one
            # batch early relative to the usual `% n == 0` convention.
            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                logits = model(encoder_input, mask_encoder_input,
                               decoder_input, mask_decoder_input)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")
                # Averages per-batch exp(loss), not exp of the mean loss.
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        # Save one checkpoint per epoch (e.g. "3model.pth") under load_dir.
        direct_path = os.path.join(os.path.abspath('.'), load_dir)
        if not os.path.exists(direct_path):
            os.mkdir(direct_path)

        torch.save(model.state_dict(),
                   os.path.join(direct_path,
                                str(epoch) + "model.pth"))
Example #4
0
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--max_seq_length', type=int, default=110)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned
    special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>', '<END>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(file_path):
        '''
        This method tokenizes the input data and encodes it using the OpenAIGPTTokenizer
        :param file_path: Path of the input file, dtype: str
        :return: encoded dataset  dtype: list
        '''
        with open(file_path, 'r') as in_fp:
            lines = in_fp.read().splitlines()

        tokenized_dataset = lines
        for i, line in enumerate(tqdm(lines)):
            token = tokenizer.tokenize(line)[:512]
            tokenized_dataset[i] = tokenizer.convert_tokens_to_ids(token)
        return tokenized_dataset

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.train_dataset)
    eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)
    # Compute the mex input length for the Transformer
    train_dataset = [
        x for x in train_dataset
        if len(x) <= args.max_seq_length and start_token_id in x
    ]  # Remove all sentence longer than max_seq_length
    eval_dataset = [
        x for x in eval_dataset
        if len(x) <= args.max_seq_length and start_token_id in x
    ]
    input_length = max(max(len(t) for t in train_dataset),
                       max(len(q) for q in eval_dataset))
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions
                           )  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))

    def pre_process_dataset(encoded_dataset, input_length, start_token_id):
        """
        This method is to create torch tensor of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :param start_token_id: id of the '<START>' token, dtype: int
        :return: torch.tensor of size [len(encoded_dataset), 2]
        """

        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length),
                            fill_value=-1,
                            dtype=np.int64)

        for i, tokens in enumerate(encoded_dataset):
            try:
                #tokens = tokens[:input_length]
                start_id_index = tokens.index(start_token_id)
                input_ids[i, :len(tokens)] = tokens
                start_id_index = tokens.index(start_token_id)
                lm_labels[i, start_id_index:len(tokens) -
                          1] = tokens[start_id_index + 1:len(tokens)]
                # LM loss calculate only for tokens after <START> token in the sentence
                #lm_labels[i, :len(tokens)-1] = tokens[1:]
            except ValueError:
                print("Index {} doesn't have start token".format(i))

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        #tensor_dataset.append(torch.tensor(d) for d in all_inputs)

        return tensor_dataset

    # Prepare input tensors and dataloders
    train_tensor_dataset = pre_process_dataset(train_dataset,
                                               input_length,
                                               start_token_id=start_token_id)
    eval_tensor_dataset = pre_process_dataset(eval_dataset,
                                              input_length,
                                              start_token_id=start_token_id)

    print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(
        train_tensor_dataset[1][0]))
    time.sleep(10)
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(
                args.output_dir,
                "pytorch_model_zero_grad_{}.bin".format(epoch + 1))
            config = model.module.config if hasattr(model,
                                                    'module') else model.config
            torch.save(model_to_save.state_dict(), output_model_file)

            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTLMHeadModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)

            eval_loss += lm_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune and/or evaluate an OpenAI GPT sequence classifier.

    Pipeline: parse CLI arguments, configure device and seeds (single-GPU,
    forced-single-GPU, or NCCL-distributed), optionally train
    ``OpenAIGPTForClassification`` (``--do_train``), save model/config/
    tokenizer to ``--output_dir``, reload the saved model, and optionally
    evaluate (``--do_eval``), writing metrics to
    ``<output_dir>/eval_results.txt`` (plus an MNLI-MM pass for the ``mnli``
    task).
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/user4/gpt_classification/dataset/ag_news',
                        type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--task_name",
                        default='ag_news',
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/user4/gpt_classification/experiment/ag_news',
                        type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # BUG FIX: was declared without `type`, so any value supplied on the
    # command line arrived as `str` and broke gradient clipping in the
    # optimizer; parse it as float (default value unchanged).
    parser.add_argument("--max_grad_norm",
                        type=float,
                        default=1)
    parser.add_argument('--weight_decay', type=float, default=0.0)

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default='/hdd/user4/gpt_classification/pretrained',
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    # NOTE(review): `action='store_true'` combined with `default=True` means
    # these two flags are always True and cannot be disabled from the CLI.
    # Defaults are kept unchanged for backward compatibility.
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=9.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        default=True,
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Device selection: local_rank == -1 means non-distributed.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1  # deliberately forced to one GPU even when more are visible
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # Per-forward-pass batch size once accumulation is accounted for.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG source for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](args.data_dir)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    # Extra tokens used to frame each classification example.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    model = OpenAIGPTForClassification.from_pretrained(args.model_name,
                                                       num_special_tokens=len(special_tokens),
                                                       num_labels=num_labels)
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples()
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit;
        # catch only what a missing or corrupt cache file can raise.
        except (OSError, EOFError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        # Label dtype depends on the task head: long for classification,
        # float for regression.
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Prepare optimizer: no weight decay on biases and LayerNorm weights.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, _, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model.forward(input_ids, input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                # Step the optimizer only once per accumulation window.
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

        # BUG FIX: close() was unguarded, but only ranks -1/0 ever create
        # tb_writer — non-zero ranks raised NameError here.
        if args.local_rank in [-1, 0]:
            tb_writer.close()

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

    # Load a trained model and vocabulary that you have fine-tuned.
    # NOTE(review): this reload is unconditional, so --do_eval without a prior
    # training run requires a model already saved in --output_dir.
    model = OpenAIGPTForClassification.from_pretrained(args.output_dir,
                                                       num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples()
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        # BUG FIX: narrowed from a bare `except:` (see train-cache handling).
        except (OSError, EOFError, pickle.UnpicklingError):
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model.forward(input_ids, input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits/labels across batches into single arrays.
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            # Persist each example's top-5 predicted class indices.
            # NOTE(review): the file is written to the current working
            # directory, not args.output_dir — confirm intended.
            # (Removed an unused, hard-coded `file_path` constant here.)
            output_odp = []
            for arr in preds:
                t = (-arr).argsort()[:5]
                output_odp.append(t.tolist())
            with open('gpt_top5.pkl','wb') as f:
                pickle.dump(output_odp,f)

            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)
        print('preds:',preds,'label:',out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
# Exemple #6
# 0
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='decoder_model'):
    """Fine-tune the decoder LM on encoder contexts, validating each epoch.

    Loads pre-trained weights from "encoder.pth"/"decoder.pth" and tensor
    datasets from "train_data.pth"/"validate_data.pth" (all relative to the
    current working directory), optimizes only the decoder with OpenAIAdam
    under gradient accumulation, prints per-epoch loss and validation
    perplexity, and checkpoints the decoder to <load_dir>/<epoch>decoder.pth.

    :param epochs: number of training epochs
    :param num_gradients_accumulation: batches accumulated per optimizer step
    :param batch_size: mini-batch size for both train and validation loaders
    :param gpu_id: CUDA device index to run on (CUDA is required)
    :param lr: learning rate for OpenAIAdam
    :param load_dir: directory (relative to CWD) where checkpoints are saved;
                     must already exist
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    encoder = TransformerEncoder()
    decoder = TransformerDecoderLM()

    # Checkpoint filenames are hard-coded relative to the CWD.
    encoder.load_state_dict(torch.load("encoder.pth"))
    decoder.load_state_dict(torch.load("decoder.pth"))

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    # Each .pth file holds a tuple of aligned tensors:
    # (encoder_input, decoder_input, mask_encoder_input, mask_decoder_input).
    train_data = torch.load("train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    # Total optimizer updates = examples * epochs / batch / accumulation;
    # used by OpenAIAdam for its warmup schedule (t_total).
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    # Only decoder parameters are optimized; biases and LayerNorm weights
    # are excluded from weight decay.
    # NOTE(review): encoder outputs are not detached, so encoder gradients
    # accumulate across batches but are never applied or zeroed — confirm
    # this is intended (wastes memory/compute if the encoder is frozen).
    param_optimizer = list(decoder.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    #------------------------END SET OPTIMIZER--------------

    #------------------------START TRAINING-------------------
    update_count = 0  # global batch counter driving gradient accumulation

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        decoder.train()
        losses = 0
        times = 0
        for batch in train_dataloader:
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            # Encoder produces the `past` key/value context for the decoder.
            _, past = encoder(encoder_input, mask_encoder_input)

            # Decoder attends over the concatenated encoder+decoder mask.
            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = decoder(decoder_input, mask, past=past, past_length=0)

            # Shift by one for next-token prediction; loss is masked and
            # averaged per token.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            # NOTE(review): steps when update_count % k == k-1, so the first
            # update fires after k-1 batches (not k), and gradients left over
            # at epoch end carry into the next epoch — confirm intended.
            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        decoder.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in val_dataloader:
                batch = [item.to(device) for item in batch]

                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                _, past = encoder(encoder_input, mask_encoder_input)

                mask = torch.cat([mask_encoder_input, mask_decoder_input],
                                 dim=1)
                logits, _ = decoder(decoder_input,
                                    mask,
                                    past=past,
                                    past_length=0)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")
                # Mean of per-batch exp(loss), not exp(mean loss) — a slight
                # overestimate of corpus perplexity by Jensen's inequality.
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        # Checkpoint the decoder after every epoch; load_dir must exist.
        torch.save(
            decoder.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir,
                         str(epoch) + "decoder.pth"))
def main():
    """Fine-tune an OpenAI GPT double-heads classifier on ATIS intent/slot data.

    Parses command-line arguments, seeds all RNG sources, loads and encodes
    the dataset, then optionally trains the model (``--do_train``) and/or
    evaluates it on the dev set (``--do_eval``).  A per-epoch metrics CSV
    (``log.csv``), the trained model/config/vocabulary, per-example
    predictions (``prediction.txt``) and the final eval results
    (``eval_results.txt``) are written under ``--output_dir``.

    Raises:
        ValueError: if neither ``--do_train`` nor ``--do_eval`` is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument('--task', type=str, default='intent',
                        choices=['intent', 'slot'], help="Intent or slot prediction")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.0)
    parser.add_argument('--probabilistic_masks', action='store_true')
    parser.add_argument('--attn_bias', action='store_true')
    parser.add_argument('--linearize', action='store_true')
    args = parser.parse_args()
    print(args)

    # Seed every RNG source so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # One label per line in the task-specific label file.
    label_list = list()
    for line in open(LABEL_FILES[args.task]):
        label_list.append(line.strip())

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings called
    # `special tokens`; these new embeddings will be fine-tuned.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(args.model_name, num_labels=len(label_list), num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets.
    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object (str / int / ndarray / iterable)."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        elif isinstance(obj, np.ndarray):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_atis_dataset(args.train_dataset, label_list, tokenizer, args.probabilistic_masks, args.linearize)
    eval_dataset = load_atis_dataset(args.eval_dataset, label_list, tokenizer, args.probabilistic_masks, args.linearize, plot=False)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer (+2 for the special
    # start/classify tokens added around each utterance).
    max_length = model.config.n_positions - 2
    input_length = max(len(utt[:max_length]) + 2
                       for dataset in encoded_datasets for utt, _, _, _, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders.
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids, len(label_list))
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        results = []
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            # FIX: re-enter train mode every epoch.  The per-epoch eval pass
            # below switches to eval mode, which previously leaked into every
            # epoch after the first (dropout stayed disabled).
            model.train()
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)
                # loss = args.lm_coef * losses[0] + losses[1]
                # NOTE(review): only the classification-head loss is optimized;
                # the LM loss (losses[0]) and --lm_coef are currently unused here.
                loss = losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

            # Per-epoch evaluation on the dev set.
            model.eval()
            eval_loss = 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_labels = [], []
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                with torch.no_grad():
                    # First call (with labels) returns losses; second call
                    # (without labels) returns the raw logits.
                    _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)
                    _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)

                mc_logits = mc_logits.detach().cpu().numpy()
                mc_labels = mc_labels.to('cpu').numpy()

                eval_loss += mc_loss.mean().item()
                all_logits.append(mc_logits)
                all_labels.append(mc_labels)

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            all_logits = np.concatenate(all_logits, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            eval_f1 = f1(all_logits, all_labels)
            eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
            train_loss = tr_loss/nb_tr_steps if args.do_train else None
            result = {'eval_loss': eval_loss,
                      'eval_f1': eval_f1,
                      'eval_accuracy': eval_acc,
                      'train_loss': train_loss}
            print(result)
            results.append(result)

        # Persist the per-epoch metric history.
        with open(os.path.join(args.output_dir, "log.csv"), "w") as csvfile:
            writer = csv.DictWriter(
                csvfile,
                ["train_loss", "eval_loss", "eval_accuracy", "eval_f1"]
            )
            writer.writeheader()
            writer.writerows(results)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(
            args.output_dir, num_labels=len(label_list), num_special_tokens=len(special_tokens))
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        all_logits, all_labels = [], []
        fw = open("prediction.txt", "w")
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)
                _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()

            # Dump per-example predictions: text, predicted multi-hot vector,
            # gold multi-hot vector (threshold 0.5 on the logits).
            for i, (o, l) in enumerate(zip((mc_logits >= 0.5).astype(np.int32), mc_labels.astype(np.int32))):
                pred = o
                true = l
                fw.write(f"{eval_dataset[nb_eval_examples+i][0]}\n{pred}\n{true}\n\n")

            eval_loss += mc_loss.mean().item()
            all_logits.append(mc_logits)
            all_labels.append(mc_labels)

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        fw.close()
        eval_loss = eval_loss / nb_eval_steps
        all_logits = np.concatenate(all_logits, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        eval_f1 = f1(all_logits, all_labels)
        eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_f1': eval_f1,
                  'eval_accuracy': eval_acc,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# Example #8
# 0
        elif i > MAX:
            pass

        else:
            #optimizer.zero_grad()
            y_pred = model(x_batch.to(device),
                           attention_mask=(x_batch > 0).to(device),
                           labels=None)
            #loss =  F.binary_cross_entropy_with_logits(y_pred,y_batch.to(device))
            loss = custom_loss(y_pred, y_batch.to(device))

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            if (i + 1
                ) % accumulation_steps == 0:  # Wait for several backward steps
                optimizer.step()  # Now we can do an optimizer step
                optimizer.zero_grad()
            if lossf:
                lossf = 0.98 * lossf + 0.02 * loss.item()
            else:
                lossf = loss.item()
            tk0.set_postfix(loss=lossf)
            avg_loss += loss.item() / len(train_loader)
            avg_accuracy += torch.mean(
                ((torch.sigmoid(y_pred[:, 0]) > 0.5)
                 == (y_batch[:, 0] > 0.5).to(device)).to(
                     torch.float)).item() / len(train_loader)

            if i % 1000 == 0:
                print('saving model checkpoint at iteration={}'.format(i))
                #torch.save(model.module.state_dict(), '{}/bert_large_fold_{}_epoch_2_ckpt_iteration_{}.bin'.format(OUTDIR, I, i))
    model_A.train()
    # model_B.train()
    
    for batch in pbar:
        batch = batch[0]
        # without relative position
        # if sum([len(item) for item in batch[1]]) > 1024:
        #     continue
            
        record_loss = train_one_iter(batch, update_count, fp16=False)
        
        update_count += 1

        if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
            # update for gradient accumulation
            optimizer.step()
            optimizer.zero_grad()
            
            # speed measure
            end = time.time()
            speed = batch_size * num_gradients_accumulation / (end - start)
            start = end
            
            # show progress
            pbar.set_postfix(loss=record_loss, speed=speed)

    "Evaluation"
    model_A.eval()
    # model_B.eval()
    val_acc, val_f1 = validate(val_dataloader)
    print(f"val f1: {val_f1}, valid acc: {val_acc}")
# Example #10
# 0
def run_model():
    """Train/evaluate an OpenAI GPT LM on CommonsenseQA explanation data.

    Parses command-line arguments, builds dataloaders from ``--data``,
    then depending on flags: trains the LM (``--do_train``, with per-epoch
    dev evaluation and best-BLEU checkpointing), reloads the best checkpoint
    and evaluates on the dev set (``--do_eval``), and/or generates
    predictions on the test set (``--do_test``).  Results and predictions
    are written under ``--output_dir``.

    Raises:
        ValueError: if none of do_train/do_eval/do_test is given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name',
        type=str,
        default='openai-gpt',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument('--setting', type=str, default='explain_predict')
    parser.add_argument('--eval_preds_prefix', type=str, default='preds_')
    # FIX: this flag was read below (do_test branch) but never defined,
    # so --do_test crashed with AttributeError.
    parser.add_argument('--test_preds_prefix', type=str, default='test_preds')
    parser.add_argument("--n_train_print", type=int, default=10)
    parser.add_argument("--n_gen", type=int, default=20)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    parser.add_argument("--temperature", type=int, default=1)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_eval_train",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--num_eval_print', type=int, default=15)
    parser.add_argument('--train_batch_size', type=int, default=36)
    parser.add_argument('--eval_batch_size', type=int, default=60)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=1e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--data',
                        type=str,
                        default='/stage/examples/commonsenseqa/')

    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1

    # Seed every RNG source for reproducibility.
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError(
            "At least one of `do_train` or `do_eval`  or do_test must be True."
        )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Special tokens delimiting question / answer / explanation segments.
    special_tokens = [
        '_start_</w>', 'or</w>', '_answer_</w>', '_classify_</w>', '_end_</w>'
    ]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    datasets = parse_cqa(args.data, args.setting)
    numericalized = [
        CommonsenseExample.numericalize_list(
            CommonsenseExample.tokenize_list(d, tokenizer), tokenizer)
        for d in datasets
    ]

    tensor_datasets = pre_process_datasets(numericalized, *special_tokens_ids)

    # Build only the dataloaders the requested modes need.
    train_sampler, train_data = None, None
    if args.do_train or args.do_eval_train:
        train_tensor_dataset = tensor_datasets[0]
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        if args.do_eval_train:
            # Deterministic order when the train set is used for evaluation.
            train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval:
        if args.do_eval_train:
            eval_data = train_data
            eval_sampler = train_sampler
        else:
            eval_tensor_dataset = tensor_datasets[1]
            eval_data = TensorDataset(*eval_tensor_dataset)
            eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

    if args.do_test:
        test_tensor_dataset = tensor_datasets[-1]
        test_data = TensorDataset(*test_tensor_dataset)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_data) * args.num_train_epochs // args.train_batch_size
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    def trim_unks(x):
        """Truncate a token list at the first '_end_</w>' marker, if any."""
        try:
            unk_id = x.index('_end_</w>')
            return x[:unk_id]
        except ValueError:  # no end marker present; keep the full sequence
            return x

    def detokenize(x):
        """Join BPE tokens back into a readable string."""
        y = ''.join(trim_unks(x))
        y = y.replace('</w>', ' ')
        y = y.replace(' .', '.')
        y = y.replace(' ,', ',')
        y = y.replace(' ?', '?')
        y = y.replace(' !', '!')
        y = y.replace(' \' ', '\'')
        y = y.replace(' \'re', '\'re')
        y = y.replace(' \'s', '\'s')
        y = y.replace(' n\'t', 'n\'t')
        return y

    def detok_batch(x):
        """Detokenize a batch of id sequences, dropping padding ids (< 0)."""
        if not isinstance(x, list):
            x = x.tolist()
        return [
            detokenize(
                tokenizer.convert_ids_to_tokens([z for z in y if z >= 0]))
            for y in x
        ]

    if args.do_train:
        best_eval = 0
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in range(int(args.num_train_epochs)):
            # FIX: re-enter train mode every epoch -- the dev-eval pass below
            # switches to eval mode and previously leaked into later epochs.
            model.train()
            tr_loss, train_ppl, n_train_examples = 0, 0, 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            train_pred_strs, train_lab_strs = [], []
            for step, batch in enumerate(tqdm_bar):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)
                loss = model(inputs, lm_labels=labels)
                train_ppl += loss.item() * inputs.size(0)
                n_train_examples += inputs.size(0)
                loss.backward()
                optimizer.step()
                # FIX: gradients were never cleared, so they accumulated
                # across every step of the entire run.
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for logging.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                if args.n_train_print > 0:
                    # Sample a few generations to monitor training quality.
                    with torch.no_grad():
                        preds = sample(model, batch[2], 10, device)

                    pred_str = detok_batch(preds)
                    label_str = detok_batch(labels)
                    train_lab_strs.extend(label_str)
                    train_pred_strs.extend(pred_str)
                    input_str = detok_batch(inputs)
                    for print_idx in range(
                            min(args.n_train_print, inputs.size(0))):
                        print('INPT: ', input_str[print_idx])
                        print('GOLD: ', label_str[print_idx])
                        print('PRED: ', pred_str[print_idx])
                        print()

            train_bleu = None
            if args.n_train_print > 0:
                train_bleu = computeBLEU(train_pred_strs,
                                         [[x] for x in train_lab_strs])
                train_ppl = math.exp(train_ppl / n_train_examples)

            if args.do_eval:
                # Per-epoch dev evaluation; checkpoint on best BLEU.
                model.eval()
                eval_loss, eval_em, eval_ppl = 0, 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                label_strs, prediction_strs = [], []
                for batch in eval_dataloader:
                    inputs = batch[0].to(device)
                    labels = batch[1].to(device)

                    with torch.no_grad():
                        loss = model(inputs, lm_labels=labels)
                        preds = sample(model, batch[2], args.n_gen, device)

                    eval_loss += loss.item()
                    eval_ppl += loss.item() * inputs.size(0)
                    nb_eval_examples += inputs.size(0)
                    nb_eval_steps += 1
                    pred_str = detok_batch(preds)
                    label_str = detok_batch(labels)
                    label_strs.extend(label_str)
                    prediction_strs.extend(pred_str)
                    input_str = detok_batch(inputs)
                    # Exact-match count between generated and gold strings.
                    eval_em += sum(
                        [x == y for x, y in zip(pred_str, label_str)])
                    for print_idx in range(
                            min(inputs.size(0), args.num_eval_print)):
                        print('INPT: ', input_str[print_idx])
                        print('GOLD: ', label_str[print_idx])
                        print('PRED: ', pred_str[print_idx])
                        print()

                eval_bleu = computeBLEU(prediction_strs,
                                        [[x] for x in label_strs])
                eval_ppl = math.exp(eval_ppl / nb_eval_examples)
                eval_em = eval_em / nb_eval_examples
                eval_loss = eval_loss / nb_eval_steps
                train_loss = tr_loss / nb_tr_steps if args.do_train else None
                result = {
                    'eval_loss': eval_loss,
                    'eval_em': eval_em,
                    'eval_bleu': eval_bleu,
                    'eval_ppl': eval_ppl,
                    'train_loss': train_loss,
                    'train_bleu': train_bleu,
                    'train_ppl': train_ppl
                }

                output_eval_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                if eval_bleu > best_eval:
                    best_eval = eval_bleu

                    # Save a trained model (best BLEU so far).
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(model.config)
        model.load_state_dict(model_state_dict)
        # uncomment to try out the default not finue-tuned model
        #        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens), cache_dir=os.path.dirname(args.data))
        model.to(device)
        model.eval()
        eval_loss, eval_em, eval_ppl = 0, 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        label_strs, prediction_strs = [], []
        for batch in eval_dataloader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            with torch.no_grad():
                loss = model(inputs, lm_labels=labels)
                preds = sample(model, batch[2], args.n_gen, device)

            eval_loss += loss.item()
            eval_ppl += loss.item() * inputs.size(0)
            nb_eval_examples += inputs.size(0)
            nb_eval_steps += 1
            pred_str = detok_batch(preds)
            label_str = detok_batch(labels)
            label_strs.extend(label_str)
            prediction_strs.extend(pred_str)
            input_str = detok_batch(inputs)
            eval_em += sum([x == y for x, y in zip(pred_str, label_str)])
            for print_idx in range(min(inputs.size(0), args.num_eval_print)):
                print('INPT: ', input_str[print_idx])
                print('GOLD: ', label_str[print_idx])
                print('PRED: ', pred_str[print_idx])
                print()

        eval_bleu = computeBLEU(prediction_strs, [[x] for x in label_strs])
        eval_ppl = math.exp(eval_ppl / nb_eval_examples)
        eval_em = eval_em / nb_eval_examples
        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_em': eval_em,
            'eval_bleu': eval_bleu,
            'eval_ppl': eval_ppl,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Best Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        output_preds_file = os.path.join(
            args.output_dir, f"{args.eval_preds_prefix}_{args.setting}.txt")
        with open(output_preds_file, 'w') as writer:
            logger.info("Writing predictions")
            for p in prediction_strs:
                writer.write(p + '\n')

    if args.do_test:
        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(model.config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        model.eval()
        prediction_strs = []
        for batch in test_dataloader:
            inputs = batch[0].to(device)

            with torch.no_grad():
                preds = sample(model, batch[1], args.n_gen, device)

            pred_str = detok_batch(preds)
            prediction_strs.extend(pred_str)

        output_preds_file = os.path.join(
            args.output_dir, f"{args.test_preds_prefix}_{args.setting}.txt")
        with open(output_preds_file, 'w') as writer:
            logger.info("Writing predictions")
            for p in prediction_strs:
                writer.write(f'"{p.strip()}"\n')
# Example #11
# 0
 def train(self):
     """Fine-tune the GPT-2 classifier with mixed precision and gradient accumulation.

     Runs ``self.epochs`` epochs over the dataloaders from ``create_dataloader()``,
     validating 5 times per epoch.  The four best checkpoints (by AUC) are kept in
     ``model1.bin`` .. ``model4.bin``; progress is appended to ``train_log.txt`` and
     the best scores to ``model_score.txt``.  If ``self.last`` is set, the final
     weights are also written to ``model_last.bin``.
     """
     if self.debug_mode: self.epochs = 1
     # Build train/validation dataloaders
     train_loader, valid_loader = self.create_dataloader()
     # Seed all RNGs for reproducibility
     self.seed_everything()
     lr = 2e-5
     # How many micro-batches make up one logical batch
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # Load the pre-trained model
     print("Load pre-trained model")
     model = GPT2NeuralNet.from_pretrained(self.gpt2_model_path,
                                           cache_dir=None)
     model.zero_grad()
     model = model.to(self.device)
     # NOTE(review): only half the training data is counted per epoch (0.5 factor) —
     # presumably matches how create_dataloader() samples; confirm.
     epoch_steps = int(self.train_len * 0.5 / self.base_batch_size /
                       accumulation_steps)
     num_train_optimization_steps = int(self.epochs * epoch_steps)
     # Validate 5 times per epoch; FIX: guard against 0 (ZeroDivisionError on tiny data)
     valid_every = max(1, math.floor(epoch_steps * accumulation_steps / 5))
     optimizer = OpenAIAdam(model.parameters(),
                            lr=lr,
                            warmup=0.05,
                            t_total=num_train_optimization_steps)
     # Mixed-precision training (apex)
     model, optimizer = amp.initialize(model,
                                       optimizer,
                                       opt_level="O1",
                                       verbosity=0)
     # Start training
     print("Train")
     best_auc_score_1 = 0
     best_auc_score_2 = 0
     best_auc_score_3 = 0
     best_auc_score_4 = 0
     # FIX: the log file was previously opened without ever being closed;
     # a context manager guarantees the buffered log lines reach disk.
     with open("train_log.txt", "w") as f_log:
         for epoch in range(self.epochs):
             model.train()
             optimizer.zero_grad()
             # Iterate micro-batches and train
             train_start_time = time.time()
             for i, batch_data in enumerate(train_loader):
                 x_batch = batch_data[0]
                 y_batch = batch_data[1]
                 target_weight_batch = batch_data[2]
                 aux_weight_batch = batch_data[3]
                 identity_weight_batch = batch_data[4]
                 np_weight_batch = batch_data[5]
                 np_identity_weight_batch = batch_data[6]
                 y_pred = model(x_batch.to(self.device))
                 target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(
                     y_pred, y_batch, epoch, target_weight_batch,
                     aux_weight_batch, identity_weight_batch, np_weight_batch)
                 loss = target_loss + aux_loss + identity_loss + np_loss
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
                 # Step only once per accumulated logical batch
                 if (i + 1) % accumulation_steps == 0:
                     optimizer.step()
                     optimizer.zero_grad()
                 # Periodic validation
                 if (i + 1) % valid_every == 0:
                     model.eval()
                     stage = int((i + 1) / valid_every)
                     train_stage_duration = int(
                         (time.time() - train_start_time) / 60)
                     valid_start_time = time.time()
                     y_pred = np.zeros((len(self.train_df) - self.train_len))
                     for j, valid_batch_data in enumerate(valid_loader):
                         x_batch = valid_batch_data[0]
                         batch_y_pred = self.sigmoid(
                             model(x_batch.to(
                                 self.device)).detach().cpu().numpy())[:, 0]
                         y_pred[j * self.base_batch_size:(j + 1) *
                                self.base_batch_size] = batch_y_pred
                     # Compute validation score
                     auc_score = self.evaluator.get_final_metric(y_pred)
                     valid_duration = int((time.time() - valid_start_time) / 60)
                     train_start_time = time.time()
                     f_log.write(
                         "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n"
                         % (epoch, stage, train_stage_duration, valid_duration,
                            auc_score))
                     print(
                         "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f"
                         % (epoch, stage, train_stage_duration, valid_duration,
                            auc_score))
                     # Keep the 4 best checkpoints in a cascade (model1 is the best)
                     if auc_score > best_auc_score_4:
                         state_dict = model.state_dict()
                         if auc_score > best_auc_score_1:
                             best_auc_score_1 = auc_score
                             torch.save(state_dict, "model1.bin")
                         elif auc_score > best_auc_score_2:
                             best_auc_score_2 = auc_score
                             torch.save(state_dict, "model2.bin")
                         elif auc_score > best_auc_score_3:
                             best_auc_score_3 = auc_score
                             torch.save(state_dict, "model3.bin")
                         else:
                             best_auc_score_4 = auc_score
                             torch.save(state_dict, "model4.bin")
                         with open("model_score.txt", "w") as f:
                             f.write(
                                 "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                                 % (best_auc_score_1, best_auc_score_2,
                                    best_auc_score_3, best_auc_score_4))
                         print(
                             "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                             % (best_auc_score_1, best_auc_score_2,
                                best_auc_score_3, best_auc_score_4))
                     model.train()
             if self.last is True:
                 state_dict = model.state_dict()
                 torch.save(state_dict, "model_last.bin")
     # FIX: the original `for variable in [...]: del variable` only unbound the
     # loop variable; delete the real references so gc can reclaim (GPU) memory.
     del train_loader, valid_loader, model, optimizer
     gc.collect()
Example #12
0
def main():
    """Fine-tune OpenAIGPTDoubleHeadsModel on SWAG/CODAH-style multiple-choice data.

    Parses CLI arguments, optionally trains on ``--train_filename`` and/or
    evaluates on ``--eval_filename`` (both relative to ``--data_dir``), and writes
    the model, eval metrics and per-example model outputs under ``--output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .csv files (or other data files) for the task."
    )
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--answer_only",
        default=False,
        action='store_true',
        help="Whether to run with answers only (blank out question).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--load_model_from",
        default=None,
        type=str,
        help=
        "The saved model file to load before doing any training or eval (if both --do_train and --do_eval are specified, the saved model will be loaded, then trained, then the trained model will be evaluated)."
    )
    parser.add_argument(
        '--train_filename',
        type=str,
        default='train.csv',
        help="Filename to load train data from (relative to data_dir)")
    parser.add_argument(
        '--eval_filename',
        type=str,
        default='val.csv',
        help="File to load eval data from (relative to data_dir)")
    parser.add_argument(
        '--data_format',
        type=str,
        choices=['swag', 'codah'],
        default='swag',
        help=
        "Format of the train and eval files (original SWAG CSV format vs our TSV format)"
    )
    parser.add_argument(
        '--model_labels_save_filename',
        type=str,
        default='model_labels.json',
        help=
        "JSON file to save model outputs/labels to (relative to output_dir)")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--eval_batch_size', type=int, default=8)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=8,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # Per-step micro-batch size; the CLI value is the effective (accumulated) batch size.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed every RNG source for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Eval-only runs default to the checkpoint a previous training run saved.
    if args.do_eval and (not args.do_train) and args.load_model_from is None:
        args.load_model_from = os.path.join(args.output_dir,
                                            'pytorch_model.bin')

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))

    # Keep the config so a fresh model with the same shape can be built when
    # loading saved weights (here and again after training, below).
    config = model.config
    if args.load_model_from:
        model_state_dict = torch.load(args.load_model_from)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
    model.to(device)

    # Load and encode the datasets
    logger.info("Loading datasets...")
    datasets = []
    # Maps 'train'/'eval' to the dataset's index in `datasets`.
    dataset_keys = dict()
    if args.do_train:
        train_dataset = read_swag_examples(os.path.join(
            args.data_dir, args.train_filename),
                                           is_training=True,
                                           answer_only=args.answer_only,
                                           data_format=args.data_format)
        train_dataset = [
            EncodedSwagExample(ex, tokenizer)
            for ex in tqdm(train_dataset, desc='Encoding train')
        ]
        dataset_keys['train'] = len(datasets)
        datasets.append(train_dataset)

    if args.do_eval:
        eval_dataset = read_swag_examples(os.path.join(args.data_dir,
                                                       args.eval_filename),
                                          is_training=True,
                                          answer_only=args.answer_only,
                                          data_format=args.data_format)
        eval_dataset = [
            EncodedSwagExample(ex, tokenizer)
            for ex in tqdm(eval_dataset, desc='Encoding eval')
        ]
        dataset_keys['eval'] = len(datasets)
        datasets.append(eval_dataset)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    # Longest (context + question start + longest ending + 3 special tokens) over all data.
    input_length = max(len(swagex.context_tokens[:max_length]) + len(swagex.start_ending_tokens[:max_length]) + max(len(ending[:max_length]) for ending in swagex.endings_tokens) + 3  \
                           for dataset in datasets for swagex in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model
    print('---')
    print('Input length: {}\n'.format(input_length))
    print('---')

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(datasets, input_length, max_length,
                                           *special_tokens_ids)
    if args.do_train:
        train_tensor_dataset = tensor_datasets[dataset_keys['train']]
    if args.do_eval:
        eval_tensor_dataset = tensor_datasets[dataset_keys['eval']]

    # Prepare optimizer
    if args.do_train:
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # No weight decay on biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        #num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
        num_train_optimization_steps = int(
            len(train_data) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                # Joint loss: language-modeling head (weighted by lm_coef) + choice head.
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        eval_data = TensorDataset(*eval_tensor_dataset)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Load a trained model that you have fine-tuned
        if args.do_train:
            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTDoubleHeadsModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
        model.eval()

        # Per-example records (logits, labels, raw example) dumped to JSON at the end.
        all_model_outputs = []
        data_index = 0

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            for i in range(input_ids.size(0)):
                output_obj = dict()
                output_obj['logits'] = [float(x) for x in mc_logits[i]]
                output_obj['true_label'] = int(mc_labels[i])
                output_obj['model_label'] = int(np.argmax(mc_logits[i]))
                output_obj['swag_data'] = datasets[
                    dataset_keys['eval']][data_index].raw_example.to_dict()
                all_model_outputs.append(output_obj)
                data_index += 1

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss is averaged per batch, accuracy per example.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(
                os.path.join(args.output_dir, args.model_labels_save_filename),
                'w') as f:
            json.dump(all_model_outputs, f)
Example #13
0
class ModelClassifier(object):
    def __init__(self, config, which_to_train, model_A, model_B, tokenizer, device1, device2):
        """Multi-task dialogue-act classifier wrapping two conversation models.

        Builds three SequenceSummary classification heads (acts for side A,
        acts for side B, and a binary TF head), places them on ``device1``,
        and loads the training/validation data.

        Args:
            config: model config shared by the three SequenceSummary heads.
            which_to_train: which heads to optimize. NOTE(review): unused here;
                only ``train()`` takes this decision — confirm it is intentional.
            model_A: dialogue model for side A (already constructed/placed).
            model_B: dialogue model for side B (already constructed/placed).
            tokenizer: shared tokenizer; its ``cls_token_id`` is cached.
            device1: device the classification heads are moved to.
            device2: second device; stored but not used in this constructor.
        """
        # config.num_labels = le.classes_.shape[0]
        # label encode
        # super().__init__()
        self.config = config
        # Label encoders fitted offline; their class counts size the A/B heads.
        self.le_A = load_pkl("training/data/labelencoder_A.pkl")
        self.le_B = load_pkl("training/data/labelencoder_B.pkl")

        self.clf_A = SequenceSummary(num_labels=self.le_A.classes_.shape[0], config=config)
        self.clf_B = SequenceSummary(num_labels=self.le_B.classes_.shape[0], config=config)
        self.clf_TF = SequenceSummary(num_labels=2, config=config)
        
        # self.apply(self.init_weight)
        # Cached generation state; cleared by reload().
        self.past = None
        self.history = []

        # model
        self.model_A = model_A
        self.model_B = model_B
        self.tokenizer = tokenizer
        self.cls_token_id = tokenizer.cls_token_id
        self.device1 = device1
        self.device2 = device2

        # Heads live on device1; the two models' placement is managed elsewhere.
        self.to_device(self.device1)

        # define loss
        self.criterion = nn.CrossEntropyLoss()
        
        # optimizer parameters
        self.num_gradients_accumulation = 1
        self.batch_size = 1
        self.batch_size_TF = 8

        # load training data
        self.load_data()

    def reload(self):
        """Clear cached generation state so a new dialogue starts fresh."""
        self.history = []
        self.past = None

    def to_device(self, device):
        """Move the three classification heads to *device*, recording the
        device on each head (the wrapped models are placed elsewhere)."""
        for attr in ("clf_A", "clf_B", "clf_TF"):
            head = getattr(self, attr).to(device)
            head.device = device
            setattr(self, attr, head)

    def load_data(self):
        """Load pickled train/val conversations plus the TF demonstration data
        and build the four dataloaders used by train()/validate().

        Side effects: sets train_data/val_data, the *_TF tensors, the four
        datasets and the four dataloaders on ``self``.
        """
        # load training data
        self.train_data = load_pkl("training/data/train_data.pkl")
        self.val_data = load_pkl("training/data/val_data.pkl")
        self.train_data_TF, self.val_data_TF = torch.load("demonstration/old_model/demonstration_train_with_text_only.pkl", map_location="cpu"), \
                                                torch.load("demonstration/old_model/demonstration_val_with_text_only.pkl", map_location="cpu")

        self.train_dataset = PersuadeDataset(self.train_data, self.tokenizer)
        self.val_dataset = PersuadeDataset(self.val_data, self.tokenizer)

        self.train_dataset_TF, self.val_dataset_TF = TFDataset(self.train_data_TF, self.tokenizer), \
                                                        TFDataset(self.val_data_TF, self.tokenizer)

        self.train_dataloader = DataLoader(dataset=self.train_dataset, 
                                            shuffle=True, 
                                            batch_size=self.batch_size, 
                                            collate_fn=self.train_dataset.collate)
        # FIX: the validation loader previously used train_dataset.collate;
        # use the validation dataset's own collate, consistent with the TF loaders.
        self.val_dataloader = DataLoader(dataset=self.val_dataset, 
                                            shuffle=False, 
                                            batch_size=self.batch_size, 
                                            collate_fn=self.val_dataset.collate)

        self.train_dataloader_TF = DataLoader(dataset=self.train_dataset_TF, 
                                            shuffle=True, 
                                            batch_size=self.batch_size_TF, 
                                            collate_fn=self.train_dataset_TF.collate)
        self.val_dataloader_TF = DataLoader(dataset=self.val_dataset_TF, 
                                            shuffle=False, 
                                            batch_size=self.batch_size_TF, 
                                            collate_fn=self.val_dataset_TF.collate)


    def load_model(self, all_model_dir=None, clf_A_dir=None, clf_B_dir=None, clf_TF_dir=None):
        """Restore saved weights.

        If *all_model_dir* is given it must contain a 5-tuple of state dicts
        (model_A, model_B, clf_A, clf_B, clf_TF) and everything is loaded at
        once; otherwise each classifier head is loaded individually from its
        own path when that path is provided.
        """
        if all_model_dir is not None:
            states = torch.load(all_model_dir)
            model_A_state, model_B_state, clf_A_state, clf_B_state, clf_TF_state = states
            self.model_A.load_state_dict(model_A_state)
            self.model_B.load_state_dict(model_B_state)
            self.clf_A.load_state_dict(clf_A_state)
            self.clf_B.load_state_dict(clf_B_state)
            self.clf_TF.load_state_dict(clf_TF_state)
            print("all models loaded")
            return

        # Table-driven per-head loading keeps the three branches identical.
        for path, head, label in ((clf_A_dir, self.clf_A, "clf_A"),
                                  (clf_B_dir, self.clf_B, "clf_B"),
                                  (clf_TF_dir, self.clf_TF, "clf_TF")):
            if path:
                head.load_state_dict(torch.load(path))
                print(label + " loaded")


    def train(self, which_to_train, num_epochs=10):
        """Jointly fine-tune model_A/model_B plus the selected classification
        heads for *num_epochs* epochs.

        Args:
            which_to_train: string/collection of task tags; heads named in it
                ("A", "B", "TF") are added to the optimizer.
            num_epochs: number of epochs to run (default 10).

        Validates after every epoch and saves a full checkpoint under
        ``Checkpoint_act_clf/`` whenever any task reaches a new best F1.
        """
        # Both dialogue models are always optimized; heads are opt-in.
        param_optimizer = list(self.model_A.named_parameters()) + \
                          list(self.model_B.named_parameters())
        if "A" in which_to_train:
            print("clf_A to optimize")
            param_optimizer += list(self.clf_A.named_parameters()) 
        if "B" in which_to_train:
            print("clf_B to optimize")
            param_optimizer += list(self.clf_B.named_parameters()) 
        if "TF" in which_to_train:
            print("clf_TF to optimize")
            param_optimizer += list(self.clf_TF.named_parameters()) 

        # No weight decay on biases and LayerNorm parameters.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        num_train_optimization_steps = len(self.train_dataset) * num_epochs // self.batch_size // self.num_gradients_accumulation

        self.optimizer = OpenAIAdam(optimizer_grouped_parameters,
                            lr=2e-5,
                            warmup=0.1,
                            max_grad_norm=1.0,
                            weight_decay=0.01,
                            t_total=num_train_optimization_steps)

        update_count = 0
        progress_bar = tqdm.tqdm
        start = time.time()
        best_acc_A = -float('Inf')
        best_f1_A = -float('Inf')
        best_acc_B = -float('Inf')
        best_f1_B = -float('Inf')
        best_acc_TF = -float('Inf')
        best_f1_TF = -float('Inf')

        for ep in tqdm.tqdm(range(num_epochs)):
            # set train mode
            self.model_A.train()
            self.model_B.train()
            self.clf_A.train()
            self.clf_B.train()
            self.clf_TF.train()

            "Training"
            pbar = progress_bar(self.train_dataloader)
            # TF batches are cycled alongside the (typically longer) main loader.
            train_dataloader_TF_list = list(self.train_dataloader_TF)

            for i, batch in enumerate(pbar):
                batch = batch[0]
                batch_TF = train_dataloader_TF_list[i%len(train_dataloader_TF_list)]

                record_loss = self.train_one_iter(batch, batch_TF, update_count, which_to_train, fp16=False)
                update_count += 1

                if update_count % self.num_gradients_accumulation == self.num_gradients_accumulation - 1:
                    # update for gradient accumulation
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                    # speed measure
                    end = time.time()
                    speed = self.batch_size * self.num_gradients_accumulation / (end - start)
                    start = end

                    # show progress
                    pbar.set_postfix(loss=record_loss, speed=speed)

            "Evaluation"
            self.model_A.eval()
            self.model_B.eval()
            self.clf_A.eval()
            self.clf_B.eval()
            self.clf_TF.eval()

            (val_acc_A, val_f1_A), (val_acc_B, val_f1_B), (val_acc_TF, val_f1_TF) = self.validate(self.val_dataloader, self.val_dataloader_TF, ep, which_to_train)
            print(f"A: val f1: {val_f1_A}, valid acc: {val_acc_A}")
            print(f"B: val f1: {val_f1_B}, valid acc: {val_acc_B}")
            print(f"TF: val f1: {val_f1_TF}, valid acc: {val_acc_TF}")
            is_best_so_far_TF = val_f1_TF > best_f1_TF
            is_best_so_far_A = val_f1_A > best_f1_A
            # FIX: originally compared val_f1_TF against best_f1_B (copy-paste bug).
            is_best_so_far_B = val_f1_B > best_f1_B

            if is_best_so_far_TF:
                best_acc_TF = val_acc_TF
                best_f1_TF = val_f1_TF
            if is_best_so_far_A:
                best_acc_A = val_acc_A
                best_f1_A = val_f1_A
            if is_best_so_far_B:
                best_acc_B = val_acc_B
                best_f1_B = val_f1_B
            # The three original save branches were byte-identical and guarded by
            # a SAVED flag so at most one fired; save once if any task improved.
            if is_best_so_far_TF or is_best_so_far_A or is_best_so_far_B:
                torch.save((self.model_A.state_dict(), self.model_B.state_dict(), 
                            self.clf_A.state_dict(), self.clf_B.state_dict(), 
                            self.clf_TF.state_dict()),                             
                            f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth")

        print("finally")
        print("A: \nbest acc: {}, best f1: {}".format(best_acc_A, best_f1_A))
        print("B: \nbest acc: {}, best f1: {}".format(best_acc_B, best_f1_B))
        print("TF: \nbest acc: {}, best f1: {}".format(best_acc_TF, best_f1_TF))

    def validate(self, dataloader, dataloader_TF, ep, which_to_train):
        """Run one validation pass over the act-classification tasks A, B and TF.

        Replays `dataloader` (paired with `dataloader_TF`, cycled when shorter)
        through `train_one_iter` in validation mode, accumulates per-sentence
        predictions for each task, writes them to CSV under Checkpoint_act_clf/,
        and prints accuracy/F1 per task.

        Returns ((acc_A, f1_A), (acc_B, f1_B), (acc_TF, f1_TF)).
        """
        from sklearn.metrics import f1_score

        # evaluation mode: disable dropout on both LMs and all classifier heads
        self.model_A.eval()
        self.model_B.eval()
        self.clf_A.eval()
        self.clf_B.eval()
        self.clf_TF.eval()

        def get_numbers_for_one_task(sents, logits, acts, x, y_true, y_pred, total, correct):
            # Accumulate sentences, gold labels and argmax predictions for one
            # task.  `acts` is shaped (1, N), hence the .tolist()[0].
            _, predicted_acts = torch.max(logits, 1)

            x.extend(sents)
            y_true.extend(acts.tolist()[0])
            y_pred.extend(predicted_acts.tolist())

            total += len(acts.tolist()[0])
            correct += (predicted_acts == acts).sum().item()

            return x, y_true, y_pred, total, correct

        progress_bar = tqdm.tqdm

        with torch.no_grad():
            pbar = progress_bar(dataloader)
            dataloader_TF_list = list(dataloader_TF)
            x_A, y_true_A, y_pred_A, correct_A, total_A = [], [], [], 0, 0
            x_B, y_true_B, y_pred_B, correct_B, total_B = [], [], [], 0, 0
            x_TF, y_true_TF, y_pred_TF, correct_TF, total_TF = [], [], [], 0, 0

            for i, batch in enumerate(pbar):
                batch = batch[0]
                # cycle the TF dataloader alongside the main one
                batch_TF = dataloader_TF_list[i%len(dataloader_TF_list)]

                sents_A, logits_A, acts_A,\
                sents_B, logits_B, acts_B,\
                sents_TF, logits_TF, acts_TF = self.train_one_iter(batch, batch_TF, None, which_to_train, fp16=False,
                                                   is_validation=True)

                x_A, y_true_A, y_pred_A, total_A, correct_A = get_numbers_for_one_task(sents_A, logits_A, acts_A,\
                                                                                       x_A, y_true_A, y_pred_A, total_A, correct_A)
                x_B, y_true_B, y_pred_B, total_B, correct_B = get_numbers_for_one_task(sents_B, logits_B, acts_B,\
                                                                                       x_B, y_true_B, y_pred_B, total_B, correct_B)
                x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF = get_numbers_for_one_task(sents_TF, logits_TF, acts_TF,\
                                                                                       x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF)

            # A/B are multi-class (weighted F1); TF is a binary pick-or-not task
            f1_A = f1_score(y_true_A, y_pred_A, average="weighted")
            f1_B = f1_score(y_true_B, y_pred_B, average="weighted")
            f1_TF = f1_score(y_true_TF, y_pred_TF, average="binary")

            pd.DataFrame(zip(x_A, self.le_A.inverse_transform(y_true_A).tolist(), self.le_A.inverse_transform(y_pred_A).tolist()),
                        columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/A/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"A: Epoch {ep} Validation accuracy: {correct_A/total_A}, f1: {f1_A}")

            pd.DataFrame(zip(x_B, self.le_B.inverse_transform(y_true_B).tolist(), self.le_B.inverse_transform(y_pred_B).tolist()),
                        columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/B/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"B: Epoch {ep} Validation accuracy: {correct_B/total_B}, f1: {f1_B}")

            # TF labels are already 0/1, so no label-encoder inverse transform
            pd.DataFrame(zip(x_TF, y_true_TF, y_pred_TF),
                        columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/TF/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"TF: Epoch {ep} Validation accuracy: {correct_TF/total_TF}, f1: {f1_TF}")
            return (correct_A/total_A, f1_A), (correct_B/total_B, f1_B), (correct_TF/total_TF, f1_TF)

    def set_past(self, sent, which_task):
        """Feed one whole utterance through the task's LM and update self.past.

        sent: str, a whole sentence.  Any leading "A:"/"B:" speaker prefix is
        stripped, since the task-appropriate prefix is re-added here.
        which_task: "A", "B" or "TF" ("TF" shares model_A and the "A:" prefix).

        Raises ValueError for any other task name.
        """
        # Strip a speaker prefix if the caller passed one; we add our own below.
        # (Previously dropped into pdb.set_trace() here — debug artifact removed.)
        if sent.startswith("A:") or sent.startswith("B:"):
            sent = sent[2:]

        if which_task == "A":
            lm_model = self.model_A
            prefix = "A:"
        elif which_task == "B":
            lm_model = self.model_B
            prefix = "B:"
        elif which_task == "TF":
            # the True/False pick task reuses the A-side language model
            lm_model = self.model_A
            prefix = "A:"
        else:
            raise ValueError(f"unknown task {which_task!r}; expected 'A', 'B' or 'TF'")
        device = lm_model.device

        # record the prefixed utterance, then encode it with the turn ending
        self.history.append(prefix+sent)
        sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent) + self.train_dataset.turn_ending
        sent = torch.LongTensor(sent).unsqueeze(0).to(device)

        # run the LM to extend the cached conversation state
        past = self.move_to_device(self.past, lm_model)
        _, past, _ = lm_model(sent, past)

        self.past = past

    def predict(self, separate_sents, which_task):
        """Classify dialog acts for `separate_sents` under task A, B or TF.

        separate_sents: list of sentences with no speaker prefix.

        For "A"/"B": classifies each sentence via a [CLS] probe on the task's
        LM and returns (list of predicted act labels, updated past).
        For "TF": joins the sentences into one candidate utterance and returns
        (0/1 pick decision, updated past).

        Raises ValueError for any other task name.
        """
        past = self.past

        if which_task == "A":
            lm_model = self.model_A
            clf_head = self.clf_A
            le = self.le_A
            prefix = "A:"
        elif which_task == "B":
            lm_model = self.model_B
            clf_head = self.clf_B
            le = self.le_B
            prefix = "B:"
        elif which_task == "TF":
            # TF shares the A-side LM; the whole turn is scored as one candidate
            lm_model = self.model_A
            clf_head = self.clf_TF
            prefix = "A:"
            candidate_sent = " ".join(separate_sents)
        else:
            raise ValueError(f"unknown task {which_task!r}; expected 'A', 'B' or 'TF'")
        device = lm_model.device

        # evaluation mode
        self.model_A.eval()
        self.model_B.eval()
        self.clf_A.eval()
        self.clf_B.eval()
        self.clf_TF.eval()

        with torch.no_grad():
            if which_task in ["A", "B"]:
                all_logits = []
                for i, sent in enumerate(separate_sents):
                    # the speaker prefix is only prepended to the first sentence
                    if i == 0:
                        sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent)
                    else:
                        sent = self.tokenizer.encode(" "+sent)

                    sent = torch.LongTensor(sent).unsqueeze(0).to(device)
                    past = self.move_to_device(past, lm_model)
                    logits, past, hidden_states = lm_model(sent, past)

                    # probe with [CLS]: classify from the last hidden layer
                    cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device)
                    _, _, hidden_states = lm_model(cls_token_tensor, past)
                    hidden_states = self.move_to_device(hidden_states, clf_head)
                    mc_logits = clf_head(hidden_states[-1], cls_index=None).squeeze(-1)

                    all_logits.append(mc_logits)

                # finish tail: consume the turn-ending tokens to keep past aligned
                end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(device)
                past = self.move_to_device(past, lm_model)
                _, past, _ = lm_model(end_input, past)

                # map argmax indices back to human-readable act labels
                all_logits = torch.cat(all_logits, dim=0)
                _, predicted_acts = torch.max(all_logits, 1)
                predicted_acts = predicted_acts.tolist()
                predicted_acts = le.inverse_transform(predicted_acts).tolist()

                return predicted_acts, past
            elif which_task == "TF":
                # encode the whole candidate turn at once
                candidate = self.tokenizer.encode(prefix) + self.tokenizer.encode(candidate_sent)
                candidate = torch.LongTensor(candidate).unsqueeze(0).to(device)
                past = self.move_to_device(past, self.model_A)
                logits, past, hidden_states = self.model_A(candidate, past)
                # probe with [CLS]
                cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device)
                _, _, hidden_states = self.model_A(cls_token_tensor, past)
                hidden_states = self.move_to_device(hidden_states, self.clf_TF)
                mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1)
                _, predicted_acts = torch.max(mc_logits, 1)
                predicted_acts = predicted_acts.tolist()
                assert len(predicted_acts) == 1
                return predicted_acts[0], past

    def train_one_iter(self, batch, batch_TF, update_count, which_to_train, fp16=False, is_validation=False):
        """One combined pass over a dialog batch plus a True/False batch.

        batch: zipped (role_ids, whole_sents, separate_sents, acts) for one
        dialog; role 0 turns ("A:") run through model_A on device1, role 1
        turns ("B:") through model_B on device2.  For each sub-sentence of a
        trained side, a [CLS] token is appended and the corresponding
        classifier head (clf_A / clf_B) scores the last hidden layer.
        batch_TF: (contexts, candidate, pick_or_not) triples scored by clf_TF
        on model_A's states.

        If is_validation: returns the collected (sents, logits, acts) triples
        for A, B and TF without touching gradients.  Otherwise backpropagates
        the summed cross-entropy loss (divided by num_gradients_accumulation)
        and returns the unscaled loss value.
        """
        # role_ids, whole_sents, separate_sents, acts = batch
        past = None
        all_sents_A, all_logits_A, all_acts_A = [], [], []
        all_sents_B, all_logits_B, all_acts_B = [], [], []
        for i, (role_id, whole_sent, separate_sents, acts) in enumerate(zip(*batch)):            
            if role_id == 0:
                whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device1)
                try:
                    assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "A:"
                except:
                    pdb.set_trace()
                if "A" in which_to_train:
                    # run the whole turn once to get the "clean" past for later turns
                    past = self.move_to_device(past, self.model_A)
                    _, real_past, _ = self.model_A(whole_sent, past)
                    for act, sent in zip(acts, separate_sents):
                        all_sents_A.append(self.tokenizer.decode(sent))
                        # e.g. 'A:HI I would like to tell you About a childrens charity called Save the CHildren.'
                        sent = torch.LongTensor(sent).unsqueeze(0).to(self.device1)
                        past = self.move_to_device(past, self.model_A)
                        logits, past, hidden_states = self.model_A(sent, past)
                        
                        # append [CLS] and classify from the last hidden layer
                        cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1)
                        past = self.move_to_device(past, self.model_A)
                        _, _, hidden_states = self.model_A(cls_token_tensor, past)
                        
                        mc_logits = self.clf_A(hidden_states[-1], cls_index=None).squeeze(-1)
                        all_logits_A.append(mc_logits)
                        all_acts_A.append(act)
                    # discard the per-sentence probing state; continue from the
                    # past computed over the whole turn
                    past = real_past
                    # # finish tail
                    # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device1)
                    # _, past, _ = self.model_A(end_input, past) 
                else:
                    # side A not trained this iter: just advance the past
                    past = self.move_to_device(past, self.model_A)
                    _, past, hidden_states = self.model_A(whole_sent, past)
            else:
                whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device2)
                try:
                    assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "B:"
                except:
                    pdb.set_trace()
                if "B" in which_to_train:
                    past = self.move_to_device(past, self.model_B)
                    _, real_past, _ = self.model_B(whole_sent, past)
                    for act, sent in zip(acts, separate_sents):
                        all_sents_B.append(self.tokenizer.decode(sent))
                        # e.g. 'B:ok please do'
                        sent = torch.LongTensor(sent).unsqueeze(0).to(self.device2)
                        past = self.move_to_device(past, self.model_B)
                        logits, past, hidden_states = self.model_B(sent, past)
                        
                        # append [CLS] and classify from the last hidden layer
                        cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device2)
                        _, _, hidden_states = self.model_B(cls_token_tensor, past)
                        
                        hidden_states = self.move_to_device(hidden_states, self.clf_B)
                        mc_logits = self.clf_B(hidden_states[-1], cls_index=None).squeeze(-1)
                        all_logits_B.append(mc_logits)
                        all_acts_B.append(act)
                    past = real_past 
                    # finish tail
                    # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device2)
                    # past = self.move_to_device(past, self.model_B)
                    # _, past, _ = self.model_B(end_input, past) 
                else:
                    past = self.move_to_device(past, self.model_B)
                    _, past, hidden_states = self.model_B(whole_sent, past)
        
        all_logits_A = torch.cat(all_logits_A, dim=0)
        all_acts_A = torch.tensor(all_acts_A).unsqueeze(0).to(self.device1)
        loss_A = self.criterion(all_logits_A.view(-1, all_logits_A.size(-1)), all_acts_A.view(-1))

        all_logits_B = torch.cat(all_logits_B, dim=0)
        # NOTE(review): acts_B are placed on device1 while logits_B live on
        # clf_B's device — confirm the criterion tolerates this split.
        all_acts_B = torch.tensor(all_acts_B).unsqueeze(0).to(self.device1)

        loss_B = self.criterion(all_logits_B.view(-1, all_logits_B.size(-1)), all_acts_B.view(-1))
        
        # TF task: score each (context, candidate) pair as pick / don't pick
        all_contexts_candidate_TF = []
        all_logits_TF = []
        all_acts_TF = []
        for one_dial in batch_TF:
            past = None
            contexts, candidate, pick_or_not = one_dial
            all_contexts_candidate_TF.append((" ".join([self.tokenizer.decode(c) for c in contexts]), 
                                              self.tokenizer.decode(candidate)))
            
            # replay the context: even indices are A turns, odd are B turns
            for i, context in enumerate(contexts):
                if i%2 == 0:
                    # e.g. 'A:Would you like to know more about the charity Save the Children?\n\n\n'
                    context = torch.LongTensor(context).unsqueeze(0).to(self.device1)
                    past = self.move_to_device(past, self.model_A)
                    logits, past, hidden_states = self.model_A(context, past)
                else:
                    # e.g. 'B:hello I am great.\n\n\n'
                    context = torch.LongTensor(context).unsqueeze(0).to(self.device2)
                    past = self.move_to_device(past, self.model_B)
                    logits, past, hidden_states = self.model_B(context, past)
            
            # encode candidate with model_A, then probe with [CLS]
            candidate = torch.LongTensor(candidate).unsqueeze(0).to(self.device1)
            past = self.move_to_device(past, self.model_A)
            logits, past, hidden_states = self.model_A(candidate, past)
            cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1)
            _, _, hidden_states = self.model_A(cls_token_tensor, past)
            
            mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1)
            all_logits_TF.append(mc_logits)
            all_acts_TF.append(pick_or_not)

        all_logits_TF = torch.cat(all_logits_TF, dim=0)
        all_acts_TF = torch.tensor(all_acts_TF).unsqueeze(0).to(self.device1)

        loss_TF = self.criterion(all_logits_TF.view(-1, all_logits_TF.size(-1)), all_acts_TF.view(-1))
        
        if is_validation:
            return all_sents_A, all_logits_A, all_acts_A,\
                   all_sents_B, all_logits_B, all_acts_B,\
                   all_contexts_candidate_TF, all_logits_TF, all_acts_TF

        # combine the three task losses on device1 and scale for accumulation
        loss = loss_A.to(self.device1) + loss_B.to(self.device1) + loss_TF.to(self.device1)

        loss /= self.num_gradients_accumulation
        
        if fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        
        # report the pre-scaling loss for logging
        record_loss = loss.item() * self.num_gradients_accumulation
        
        return record_loss#, perplexity

    def move_to_device(self, past, target):
        """Return `past` with its tensors on `target`'s device.

        Leaves `past` untouched (same object) when it is None or already
        colocated with `target`; otherwise returns a new list of moved tensors.
        """
        if past is None:
            return past
        if past[0].device == target.device:
            return past
        return [tensor.to(target.device) for tensor in past]
Exemple #14
0
def main():
    """Train a GPT-2 LM from freshly re-initialized weights.

    Parses CLI args, seeds RNGs, resolves device/fp16 support, sets up
    TensorBoard logging and the data loader, builds an OpenAIAdam optimizer
    (no weight decay on biases/LayerNorm), then runs the training loop with
    per-step timing/memory logging, periodic sampling, and a checkpoint at
    the end of every epoch.
    """
    global global_example_count, global_token_count, event_writer, logdir

    args = parse_args()

    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.device = device
    use_cuda = (str(device) == 'cuda')

    # fp16 requires CUDA and apex; fall back to fp32 otherwise
    if not use_cuda:
        print(
            f'WARNING: --fp16 requires --cuda, have {device}, ignoring --fp16 option'
        )
        args.fp16 = False
    else:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except ImportError:  # was a bare `except:`; only a missing apex should disable fp16
            print('WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

    logdir = f'{args.logdir_root}/{args.run_name}-{current_timestamp()}'
    # os.makedirs is portable and avoids shelling out via os.system('mkdir -p')
    os.makedirs(logdir, exist_ok=True)
    os.makedirs(args.output_dir, exist_ok=True)
    assert os.path.exists(args.data), f"Didn't find {args.data}"

    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)

    if args.fp16:
        model = model.half()

    model.to(device)

    # setup TensorBoard logging
    global_example_count = 0
    global_token_count = 0
    print(f"Logging to {logdir}")
    event_writer = SummaryWriter(logdir)
    log_tb("first", time.time())

    data_loader = get_data_loader(args.data, enc, args.batch_size, args)

    # Prepare optimizer (OpenAIAdam, as in run_openai_gpt):
    # no weight decay on biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(data_loader) * args.num_train_epochs

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if use_cuda and args.fp16:
        # If args.dynamic_loss_scale is False, static_loss_scale will be used.
        # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale.
        optimizer = FP16_Optimizer(optimizer,
                                   static_loss_scale=1,
                                   dynamic_loss_scale=0,
                                   dynamic_loss_args={'init_scale': 2**16})

    # Reset all model weights so we can train from scratch.
    model.apply(model.init_weights)
    model.train()

    for current_epoch in range(args.num_train_epochs):
        data_loader_iter = iter(data_loader)
        for step in range(len(data_loader)):
            start_batch_ts = time.time()
            with timeit('dataloader'):
                batch = next(data_loader_iter)
            with timeit('batch.to'):
                batch = batch.to(device)
            with timeit('loss'):
                # LM objective: the batch is both input and target
                loss = model(batch, lm_labels=batch)
            with timeit('loss.backward'):
                loss.backward()
            with timeit('optimizer.step'):
                optimizer.step()
            optimizer.zero_grad()
            end_batch_ts = time.time()

            # time to do single batch
            batch_time = end_batch_ts - start_batch_ts

            total_tokens = args.context_length * args.batch_size
            time_per_token = batch_time / total_tokens
            time_per_sample = batch_time / args.batch_size
            log_tb('times/tokens_per_sec', 1 / time_per_token)
            log_tb('times/samples_per_sec', 1 / time_per_sample)
            log_tb('times/step', 1000 * batch_time)

            if step % args.print_freq == 0:
                log_tb("memory/allocated_gb",
                       torch.cuda.memory_allocated() / 1e9)
                log_tb("memory/max_allocated_gb",
                       torch.cuda.max_memory_allocated() / 1e9)
                log_tb("memory/cached_gb", torch.cuda.memory_cached() / 1e9)
                log_tb("memory/max_cached_gb",
                       torch.cuda.max_memory_cached() / 1e9)

                print('loss', loss.item())
                # FP16Optimizer doesn't support get_lr
                #                print('lr', optimizer.get_lr()[0])
                log_tb('loss', loss.item())
                #                log_tb('lr', optimizer.get_lr()[0])

                with timeit('sample'):
                    sample = print_samples(
                        model,
                        enc,
                        args,
                        # Context is a random sample from the dataset.
                        context_tokens=next(iter(data_loader)),
                        batch_size=1,
                        length=20,
                        nsamples=1,
                        temperature=1,
                        top_k=40)
                event_writer.add_text('sample', sample, global_example_count)

            # TODO: replace with len(batch)
            global_example_count += args.batch_size
            global_token_count += total_tokens

        # checkpoint at the end of each epoch
        print("Checkpointing at epoch ", current_epoch)
        checkpoint(model, args)
Exemple #15
0
def main():
    """Fine-tune OpenAIGPTDoubleHeadsModel on CommonsenseQA-style data.

    Parses CLI args, seeds RNGs, loads tokenizer/model with special tokens,
    splits the training CSV 90/10 into train/dev, encodes and tensorizes the
    datasets, then (with --do_train) runs epochs with early stopping after 10
    epochs without dev-accuracy improvement, saving the best model to
    --output_dir/pytorch_model.bin.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # seed every RNG for reproducibility
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens
    ]
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        # NOTE(review): downloaded but never read below — presumably kept for
        # its caching side effect; confirm before removing.
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return [tokenize_and_encode(o) for o in obj]

    logger.info("Encoding dataset...")
    train_dataset = load_csqa_dataset(args.train_dataset)

    print("Splitting train 90-10 into train-dev.")
    dev_dataset = train_dataset[int(len(train_dataset) * 0.9):]
    train_dataset = train_dataset[:int(len(train_dataset) * 0.9)]
    test_dataset = load_csqa_dataset(args.eval_dataset)
    datasets = (train_dataset, dev_dataset, test_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(question[:max_length]) +
        max(len(answer1[:max_length]), len(answer2[:max_length]),
            len(answer3[:max_length])) + 3 for dataset in encoded_datasets
        for question, answer1, answer2, answer3, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    dev_tensor_dataset = tensor_datasets[1]
    test_tensor_dataset = tensor_datasets[2]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    dev_data = TensorDataset(*dev_tensor_dataset)
    dev_sampler = RandomSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=args.train_batch_size)

    test_data = TensorDataset(*test_tensor_dataset)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases and LayerNorm parameters
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        best_dev_accuracy = 0
        test_acc_best_dev = 0
        best_dev_epoch = 0
        no_up = 0  # epochs since the dev accuracy last improved
        tqdm_epoch = tqdm(range(args.num_train_epochs), desc="Epoch")
        for epoch in tqdm_epoch:
            model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                # weighted LM loss plus multiple-choice classification loss
                loss = args.lm_coef * losses[0] + losses[1]

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                tr_loss += loss.item()
                # exponential moving average of the loss for the progress bar
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            # train_loss, train_accuracy = evaluate(model, device, train_dataloader, desc="Evaluate Train")
            dev_loss, dev_accuracy = evaluate(model,
                                              device,
                                              dev_dataloader,
                                              desc="Evaluate Dev")
            test_loss, test_accuracy = evaluate(model,
                                                device,
                                                test_dataloader,
                                                desc="Evaluate Test")

            train_loss = tr_loss / nb_tr_steps if args.do_train else None

            if dev_accuracy >= best_dev_accuracy:
                # New best model.
                best_dev_accuracy = dev_accuracy
                test_acc_best_dev = test_accuracy
                best_dev_epoch = epoch + 1
                no_up = 0

                # Save the new best model.
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model it-self
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
            else:
                no_up += 1

            tqdm.write("\t ***** Eval results (Epoch %s) *****" %
                       str(epoch + 1))
            # tqdm.write("\t train_accuracy = %s" % str(train_accuracy))
            tqdm.write("\t dev_accuracy = %s" % str(dev_accuracy))
            tqdm.write("")
            tqdm.write("\t best_dev_accuracy = %s" % str(best_dev_accuracy))
            tqdm.write("\t test_acc_best_dev = %s" % str(test_acc_best_dev))
            tqdm.write("\t best_dev_epoch = %s" % str(best_dev_epoch))
            tqdm.write("\t no_up = %s" % str(no_up))
            tqdm.write("")

            # early stopping: 10 epochs with no dev improvement
            if no_up >= 10:
                tqdm_epoch.close()
                break
def main():
    """Fine-tune and/or evaluate an OpenAI-GPT language model on ConvAI2 data.

    Pipeline: build an argparse CLI (currently bypassed by a hard-coded
    argument list — see NOTE below), load pickled ConvAI2 conversations,
    tokenize/encode them, optionally train the LM head, then (with
    ``--do_eval``) reload the saved checkpoint and write perplexity to
    ``<output_dir>/eval_results.txt``.
    """
    # Pre-train model: eval_ppl = 104.29582476475977
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    # args = parser.parse_args()
    # NOTE(review): the argument list is hard-coded here, so any real
    # command-line arguments are silently ignored — looks like debug
    # leftover; confirm before shipping.  Also note '--dataset' is given
    # twice: argparse keeps only the LAST value
    # ('data/convai2/convai2_data.models'); the first one is dead.
    args = parser.parse_args([  #'--do_train',
        '--do_eval', '--dataset=../data/convai2/train_both_original.txt',
        '--dataset=data/convai2/convai2_data.models',
        '--output_dir=./language-quality-subreward/gpt_output/'
    ])
    print(args)

    # This commented code was used for parsing and pickling data from the original data file.
    '''
    data = Parser(persona_limit=None, set_relation=None)
    print('Parsing...')
    data.parse(args.dataset)
    file_utils.save_model('data/convai2', data, '.models', 'convai2_data')
    '''
    # Load the pickled Parser object (project helper; presumably unpickles
    # args.dataset) and flatten all conversations into one utterance list.
    data = file_utils.read_model('', args.dataset, '')
    data = list(chain(*data.conversation))
    #data = data[: 10]
    # 90/10 train/eval split by position (no shuffling before the split).
    train_data_org = data[:int(0.9 * len(data))]
    eval_data_org = data[int(0.9 * len(data)):]
    del data
    print('')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        # NOTE(review): args.server_port is a str; verify ptvsd accepts a
        # string port here — it may expect an int.
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG we rely on for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, cache_dir="./cache/", special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name,
        cache_dir="./cache/",
        num_special_tokens=len(special_tokens))
    model.to(device)
    '''
    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        # Recurses through arbitrarily nested lists/tuples; strings become
        # token-id lists, ints pass through unchanged.
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = train_data_org
    eval_dataset = eval_data_org

    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    # max_length leaves room for 2 special tokens within half the model's
    # position budget; input_length is the longest (truncated) sentence + 2,
    # capped at n_positions.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(sent[:max_length]) + 2  \
                           for dataset in encoded_datasets for sent in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    # Standard two-group split: no weight decay on biases/LayerNorm params.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                # Old pytorch_pretrained_bert API: passing lm_labels makes
                # the forward return the LM loss directly.
                loss = model(input_ids, lm_labels=lm_labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        # NOTE(review): `config` is assigned here but only referenced by the
        # commented-out reload snippet below — dead store as written.
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)

        # Yue: save the config:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model that you have fine-tuned
        '''
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        '''

    if args.do_eval:
        # Rebuild the model from the on-disk config + weights, so eval uses
        # the saved checkpoint (requires a prior training run or
        # pre-existing files in output_dir).
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = OpenAIGPTConfig(output_config_file)

        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

        model.eval()

        eval_ppl = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        # NOTE(review): no torch.no_grad() here, so eval builds autograd
        # graphs; also perplexity is computed as the mean of per-batch
        # exp(loss) rather than exp(mean loss) — confirm this is intended.
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            loss = model(input_ids, lm_labels=lm_labels)
            eval_ppl += math.exp(loss.item())
            # NOTE(review): nb_eval_examples is accumulated but never used.
            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1
        eval_ppl = eval_ppl / nb_eval_steps
        # Short-circuit keeps this safe when do_train is False (tr_loss and
        # nb_tr_steps are only defined inside the do_train branch).
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_ppl': eval_ppl, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))