Example #1
0
def create_optim(model, args, rl=False):
    """Build an ``OpenAIAdam`` optimizer for *model*.

    :param model: module whose parameters are optimized
    :param args: namespace providing ``lr`` and ``reinforce_lr``
    :param rl: when True use the REINFORCE learning rate (``args.reinforce_lr``)
        instead of the supervised one (``args.lr``)
    :return: configured ``OpenAIAdam`` instance
    """
    learning_rate = args.reinforce_lr if rl else args.lr
    return OpenAIAdam(model.parameters(), lr=learning_rate)
Example #2
0
def get_optimizer(model: GPT2LMHeadModel, data_loader: Any, num_epochs: int,
                  lr: float):
    """Create an ``OpenAIAdam`` for *model* with selective weight decay.

    Biases and LayerNorm parameters are excluded from weight decay; all
    remaining parameters decay at 0.01. ``t_total`` is sized as one
    optimizer step per batch per epoch.
    """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decayed, undecayed = [], []
    for name, param in model.named_parameters():
        if any(marker in name for marker in no_decay):
            undecayed.append(param)
        else:
            decayed.append(param)
    grouped_parameters = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': undecayed, 'weight_decay': 0.0},
    ]
    total_steps = len(data_loader) * num_epochs

    return OpenAIAdam(
        grouped_parameters,
        lr=lr,
        t_total=total_steps,

        # the following group of parameters is taken from train_gpt2.py
        warmup=0.002,
        max_grad_norm=1.0,
        weight_decay=0.01,
        schedule="warmup_linear",
        b2=.99)
Example #3
0
 def test_openai_sched_init(self):
     """schedule=None and schedule="none" both yield a constant LR;
     the default schedule is warmup-linear."""
     m = torch.nn.Linear(50, 50)
     for sched in (None, "none"):
         optim = OpenAIAdam(m.parameters(),
                            lr=0.001,
                            warmup=.1,
                            t_total=1000,
                            schedule=sched)
         self.assertTrue(
             isinstance(optim.param_groups[0]["schedule"], ConstantLR))
     # Omitting schedule entirely selects the warmup-linear default.
     optim = OpenAIAdam(m.parameters(), lr=0.001, warmup=.01, t_total=1000)
     self.assertTrue(
         isinstance(optim.param_groups[0]["schedule"],
                    WarmupLinearSchedule))
Example #4
0
def get_optimizer(model, args, data_loader):
    """Return the optimizer selected by ``args.optimizer``.

    'openai' selects ``OpenAIAdam`` with warmup/clipping taken from *args*;
    anything else selects a plain ``torch.optim.Adam`` patched with a
    ``get_lr`` helper so both choices expose the same accessor.
    """
    # We use OpenAIAdam because that's what run_openai_gpt used
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    named = list(model.named_parameters())
    decayed = [p for n, p in named if not any(nd in n for nd in no_decay)]
    undecayed = [p for n, p in named if any(nd in n for nd in no_decay)]
    grouped_parameters = [
        {'params': decayed, 'weight_decay': 0.01},
        {'params': undecayed, 'weight_decay': 0.0},
    ]
    total_steps = len(data_loader) * args.num_train_epochs

    if args.optimizer == 'openai':
        return OpenAIAdam(
            grouped_parameters,
            lr=args.learning_rate,
            warmup=args.warmup_proportion,
            max_grad_norm=args.max_grad_norm,
            weight_decay=args.weight_decay,
            schedule=args.lr_schedule,
            b2=.99,  # instead of .999
            t_total=total_steps)

    optimizer = torch.optim.Adam(grouped_parameters,
                                 lr=args.learning_rate,
                                 betas=(0.9, 0.99),
                                 eps=1e-08,
                                 weight_decay=args.weight_decay,
                                 amsgrad=False)
    # Mirror OpenAIAdam's get_lr() interface on the stock Adam instance.
    optimizer.get_lr = lambda: [p['lr'] for p in optimizer.param_groups]
    return optimizer
Example #5
0
def prep_optimizer(model, epochs, learning_rate, warmup_proportion, max_grad_norm, weight_decay,
                   data_loader=None):
    """Create an ``OpenAIAdam`` optimizer with selective weight decay.

    Biases and LayerNorm parameters get no weight decay; everything else
    decays at 0.01 (group-level setting, independent of *weight_decay*).

    :param model: module whose named parameters are optimized
    :param epochs: number of training epochs (scales ``t_total``)
    :param learning_rate: base learning rate
    :param warmup_proportion: fraction of ``t_total`` spent warming up
    :param max_grad_norm: gradient clipping threshold
    :param weight_decay: decay rate passed to OpenAIAdam itself
    :param data_loader: loader used to size ``t_total``. When omitted,
        falls back to the module-level ``train_dataloader`` — the original
        implementation read that global implicitly, which hid the
        dependency; the new parameter makes it explicit while staying
        backward compatible.
    :return: configured ``OpenAIAdam``
    """
    if data_loader is None:
        # Backward-compatible fallback to the implicit global used before.
        data_loader = train_dataloader
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    num_train_optimization_steps = len(data_loader) * epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                            lr=learning_rate,
                            warmup=warmup_proportion,
                            max_grad_norm=max_grad_norm,
                            weight_decay=weight_decay,
                            t_total=num_train_optimization_steps)

    return optimizer
Example #6
0
    def __init__(self, model, opt):
        """Partition *model*'s trainable parameters into optimizer groups.

        Parameters whose names contain "transformer" are treated as the
        pre-trained backbone and trained with ``opt['gpt_lr']``; within
        that group, biases and LayerNorm parameters get no weight decay.
        All other trainable parameters use ``opt['lr']``. The resulting
        ``OpenAIAdam`` is stored on ``self.optimizer``.

        :param model: module providing ``named_parameters()``
        :param opt: dict read for ``weight_decay``, ``gpt_lr``, ``lr``,
            ``warmup_proportion``, ``gradient_clip`` and optionally
            ``optimizer_step`` (defaults to -1 when absent)
        """
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        parameters_with_decay = []
        parameters_with_decay_names = []
        parameters_without_decay = []
        parameters_without_decay_names = []
        base_parameters = []
        base_parameters_names = []

        # Frozen parameters (requires_grad == False) are skipped entirely.
        for n, p in model.named_parameters():
            if p.requires_grad:
                # fine-tune BERT
                if any(t in n for t in ["transformer"]):
                    if any(t in n for t in no_decay):
                        parameters_without_decay.append(p)
                        parameters_without_decay_names.append(n)
                    else:
                        parameters_with_decay.append(p)
                        parameters_with_decay_names.append(n)
                else:
                    base_parameters.append(p)
                    base_parameters_names.append(n)

        weight_decay = opt['weight_decay']
        bert_learning_rate = opt['gpt_lr']
        base_learning_rate = opt['lr']
        # Per-group 'lr' entries override the top-level lr= passed below.
        optimizer_grouped_parameters = [
            {'params': parameters_with_decay, 'weight_decay': weight_decay, 'lr': bert_learning_rate},
            {'params': parameters_without_decay, 'weight_decay': 0.0, 'lr': bert_learning_rate},
            {'params': base_parameters, 'weight_decay': weight_decay, 'lr': base_learning_rate}
        ]
        #
        # Log a truncated (first 5) listing of each group for sanity checks.
        print('The following parameters will be optimized WITH decay:')
        print(_ellipse(parameters_with_decay_names, 5, ' , '))
        print('The following parameters will be optimized WITHOUT decay:')
        print(_ellipse(parameters_without_decay_names, 5, ' , '))
        print('The following parameters will be optimized NORMALLY:')
        print(_ellipse(base_parameters_names, 5, ' , '))

        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=opt['gpt_lr'],
                               warmup=opt['warmup_proportion'],
                               max_grad_norm=opt['gradient_clip'],
                               t_total=opt.get('optimizer_step', -1))
        self.optimizer = optimizer
    def buildOptimizer(self,
                       neural,
                       epochs,
                       batch_size,
                       accumulation_steps,
                       lr=2e-5,
                       warmup=0.05):
        """Build the fine-tuning optimizer for *neural*.

        Biases and LayerNorm parameters are exempt from weight decay.
        ``self.optimizer`` selects the class: 'BertAdam' uses BertAdam,
        anything else uses OpenAIAdam. The total step count is derived
        from ``self.sentences``, the batch size and gradient accumulation.
        """
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

        decayed, undecayed = [], []
        for name, param in neural.named_parameters():
            bucket = undecayed if any(nd in name for nd in no_decay) else decayed
            bucket.append(param)

        grouped_parameters = [
            {'params': decayed, 'weight_decay': 0.01},
            {'params': undecayed, 'weight_decay': 0.0},
        ]

        total_steps = int(epochs * len(self.sentences) /
                          batch_size / accumulation_steps)

        optimizer_cls = BertAdam if self.optimizer == 'BertAdam' else OpenAIAdam
        return optimizer_cls(grouped_parameters,
                             lr=lr,
                             warmup=warmup,
                             t_total=total_steps)
Example #8
0
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=4,
                gpu_id=0,
                lr=1e-5,
                load_dir='decoder_model'):
    """Fine-tune the decoder of an encoder/decoder dialogue model.

    Loads pre-trained weights from ``encoder.pth``/``decoder.pth`` and
    tensors from ``train_data.pth``/``validate_data.pth``, trains only
    the decoder with OpenAIAdam and gradient accumulation, prints
    validation perplexity each epoch, and checkpoints the decoder into
    *load_dir* after every epoch.

    :param epochs: number of training epochs
    :param num_gradients_accumulation: batches accumulated per optimizer step
    :param batch_size: mini-batch size for both loaders
    :param gpu_id: CUDA device index to train on
    :param lr: learning rate for OpenAIAdam
    :param load_dir: directory receiving per-epoch decoder checkpoints
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    encoder = TransformerEncoder()
    decoder = TransformerDecoderLM()

    encoder.load_state_dict(torch.load("encoder.pth"))
    decoder.load_state_dict(torch.load("decoder.pth"))

    encoder = encoder.to(device)
    decoder = decoder.to(device)

    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = TensorDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size)
    val_data = torch.load("validate_data.pth")
    val_dataset = TensorDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    # One optimizer step per accumulated group of batches.
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    # Only the decoder is optimized; the encoder stays frozen at its
    # loaded weights (its parameters are never given to the optimizer).
    param_optimizer = list(decoder.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    #------------------------END SET OPTIMIZER--------------

    #------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        decoder.train()
        losses = 0
        times = 0
        for batch in train_dataloader:
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            _, past = encoder(encoder_input, mask_encoder_input)

            mask = torch.cat([mask_encoder_input, mask_decoder_input], dim=1)
            logits, _ = decoder(decoder_input, mask, past=past, past_length=0)

            # Shift for next-token prediction: position t scores token t+1.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()

            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            # NOTE(review): steps when count % k == k - 1, i.e. on the 3rd,
            # 7th, ... update rather than every k-th — confirm intended.
            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        decoder.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in val_dataloader:
                batch = [item.to(device) for item in batch]

                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                _, past = encoder(encoder_input, mask_encoder_input)

                mask = torch.cat([mask_encoder_input, mask_decoder_input],
                                 dim=1)
                logits, _ = decoder(decoder_input,
                                    mask,
                                    past=past,
                                    past_length=0)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")
                # Per-batch perplexity, averaged over batches below.
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        torch.save(
            decoder.state_dict(),
            os.path.join(os.path.abspath('.'), load_dir,
                         str(epoch) + "decoder.pth"))
# optimizer
num_epochs = 10
num_gradients_accumulation = 1
# Fixed: the assignment target was duplicated ("x = x = ...") — harmless
# but a clear copy/paste slip; a single binding is intended.
num_train_optimization_steps = len(train_dataset) * num_epochs // batch_size // num_gradients_accumulation

param_optimizer = list(model_A.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
# Biases and LayerNorm parameters get no weight decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]


optimizer = OpenAIAdam(optimizer_grouped_parameters,
                       lr=2e-5,
                       warmup=0.1,
                       max_grad_norm=1.0,
                       weight_decay=0.01,
                       t_total=num_train_optimization_steps)


# In[12]:


# support fp16
# [model_A, model_B], optimizer = amp.initialize([model_A, model_B], optimizer, opt_level="O1")


# In[13]:

import tqdm
update_count = 0
def main():
    """Fine-tune an OpenAI-GPT LM head model on a recipes dataset.

    Parses CLI arguments, seeds RNGs, loads tokenizer/model with three
    special tokens, encodes the training dataset, builds an OpenAIAdam
    optimizer, and (when --do_train) iterates the training loop. The
    actual backward/step code is commented out in this snippet.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets

    '''
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_recipes_dataset(args.train_dataset)
    #eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset,)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    # (-2 / +2 accounts for the start and classify special tokens)
    max_length = model.config.n_positions - 2
    input_length = max(len(story[:max_length]) + 2 for dataset in encoded_datasets for story in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    '''
    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
    '''

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                print(batch)
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                # Only the forward pass runs; the update below is disabled.
                loss = model(input_ids, mc_token_ids, lm_labels = lm_labels)
                print(loss)
                '''
                loss.backward()
                optimizer.step()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])
                '''
    # Save a trained model
    # NOTE(review): snippet appears truncated — the ''' below is never closed.
    '''
Example #11
0
###
# Collect every trainable parameter (model + criterion) for bookkeeping.
params = list(filter(lambda x: x.requires_grad, model.parameters())) + list(criterion.parameters())
total_params = sum(p.data.nelement() for p in params if p.requires_grad)
if args.mode == 'GPT':
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'ln_'] # Add 'ln_1' to test if it's better
    # Only 'transformer' parameters go to the GPT optimizer; biases and
    # LayerNorm parameters get no weight decay.
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay) and 'transformer' in n],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay) and 'transformer' in n],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = train_data.size(0) * args.epochs // args.batch_size
    optimizer_gpt = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)
    # Rebind `params` to the non-transformer remainder — these are handled
    # by a different optimizer than optimizer_gpt (defined elsewhere).
    params = [p for n, p in param_optimizer if 'transformer' not in n]

tools.print_log(args.save, args)
tools.print_log(args.save, 'Model total parameters:{}'.format(total_params))
if args.mode == 'GPT':
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    # Pre-built token index mapping for GPT mode.
    with open('GPT_index.pkl', 'rb') as handle:
        gptdic = pickle.load(handle)
Example #12
0
def main():
    """Train and/or evaluate a GPT-2 double-heads model on ROCStories.

    Parses CLI arguments, seeds RNGs, loads the GPT-2 tokenizer/model
    with three special tokens, encodes the train/eval datasets, trains
    with OpenAIAdam when --do_train, saves and reloads the fine-tuned
    model, and reports eval loss/accuracy when --do_eval.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='gpt2',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('{} is on use...'.format(device))
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name,
                                              special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    # model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model = GPT2DoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    #     GPT2DoubleHeadsModel.set_num_special_tokens(model, len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    # NOTE(review): // 2 - 2 halves the position budget and the +3 below
    # accounts for the three special tokens — confirm against
    # pre_process_datasets' packing layout.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        # Biases and LayerNorm parameters get no weight decay.
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                # Weighted sum of the two head losses (lm_coef scales the LM loss).
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        # Loss averaged per step; accuracy averaged per example.
        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune and/or evaluate OpenAIGPTForClassification on a text-classification task.

    Parses CLI arguments, sets up single-process or distributed (NCCL) training,
    fine-tunes with OpenAIAdam, saves model/config/vocab to ``--output_dir``,
    reloads the saved weights, evaluates on the dev set, and writes metrics to
    ``eval_results.txt``.
    """
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default='/hdd/user4/gpt_classification/dataset/ag_news',
                        type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--task_name",
                        default='ag_news',
                        type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default='/hdd/user4/gpt_classification/experiment/ag_news',
                        type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")

    # Fix: declare type=float so a value given on the command line is parsed
    # numerically (argparse applies `type` only to CLI strings; the default
    # stays the number 1 either way).
    parser.add_argument("--max_grad_norm",
                        default=1,
                        type=float)
    parser.add_argument('--weight_decay', type=float, default=0.0)

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default='/hdd/user4/gpt_classification/pretrained',
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    # NOTE(review): default=True combined with action='store_true' means these
    # flags can never be switched off from the CLI. Kept for backward
    # compatibility -- confirm whether that is intended.
    parser.add_argument("--do_train",
                        default=True,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=True,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=9.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir',
                        default=True,
                        action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Device / distributed setup.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    # The per-step batch is the requested batch split across accumulation steps.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](args.data_dir)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    model = OpenAIGPTForClassification.from_pretrained(args.model_name,
                                                       num_special_tokens=len(special_tokens),
                                                       num_labels=num_labels)
    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader: reuse cached features when available.
        train_examples = processor.get_train_examples()
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        # Fix: catch only cache-miss/corruption errors instead of a bare
        # `except:` that also swallowed KeyboardInterrupt and real bugs.
        except (OSError, EOFError, pickle.UnpicklingError):
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Prepare optimizer: no weight decay on biases and LayerNorm weights.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        # Fix: one optimizer step happens every gradient_accumulation_steps
        # batches, so divide before scaling by the epoch count, and keep the
        # schedule horizon an int.
        num_train_optimization_steps = int(
            len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs)
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, _, label_ids = batch

                # Fix: call the module (not .forward) so DataParallel/DDP
                # wrapping and forward hooks actually run.
                logits = model(input_ids, input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    # NOTE(review): OpenAIAdam has no .backward(); this path
                    # presumably expects an apex FP16_Optimizer wrapper that is
                    # never constructed here -- confirm before using --fp16.
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)

        # Fix: tb_writer only exists on rank -1/0; closing it unguarded raised
        # NameError on the other ranks in distributed runs.
        if args.local_rank in [-1, 0]:
            tb_writer.close()

    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

    # Load a trained model and vocabulary that you have fine-tuned
    model = OpenAIGPTForClassification.from_pretrained(args.output_dir,
                                                       num_labels=num_labels)

    model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples()
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        # Fix: narrow exception (see the train-cache load above).
        except (OSError, EOFError, pickle.UnpicklingError):
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                # Fix: call the module rather than .forward (see training loop).
                logits = model(input_ids, input_mask)

            # create eval loss and other metric required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            # Accumulate logits/labels across batches for the final metrics.
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            # Dump the top-5 predicted label indices per example for later
            # analysis. (Removed an unused, hard-coded local path here.)
            output_odp = []
            for arr in preds:
                t = (-arr).argsort()[:5]
                output_odp.append(t.tolist())
            with open('gpt_top5.pkl', 'wb') as f:
                pickle.dump(output_odp, f)

            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)
        print('preds:', preds, 'label:', out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    # NOTE(review): this BERT-style call signature does not
                    # match the OpenAIGPTForClassification usage above --
                    # confirm this branch is ever exercised for GPT.
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(
                        preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(
                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)

            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
def main():
    """Fine-tune and/or evaluate OpenAIGPTDoubleHeadsClsModel on ATIS-style
    intent/slot classification.

    Trains with OpenAIAdam (evaluating after every epoch and logging per-epoch
    metrics to ``log.csv`` in ``--output_dir``), saves the fine-tuned model
    there, then with ``--do_eval`` runs a final pass that writes per-example
    predictions to ``prediction.txt`` and summary metrics to
    ``eval_results.txt``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument('--task', type=str, default='intent',
                        choices=['intent', 'slot'], help="Intent or slot prediction")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    # NOTE(review): type=int for a gradient-clipping norm looks accidental
    # (a fractional CLI value would be rejected) -- confirm.
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    # NOTE(review): the LM-loss term is commented out in the training loop
    # below, so --lm_coef currently has no effect.
    parser.add_argument('--lm_coef', type=float, default=0.0)
    parser.add_argument('--probabilistic_masks', action='store_true')
    parser.add_argument('--attn_bias', action='store_true')
    parser.add_argument('--linearize', action='store_true')
    args = parser.parse_args()
    print(args)

    # Seed every RNG so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # One label name per line in the task's label file.
    label_list = list()
    for line in open(LABEL_FILES[args.task]):
        label_list.append(line.strip())

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(args.model_name, num_labels=len(label_list), num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        # Strings become token-id lists; ints and ndarrays pass through
        # unchanged; any other iterable is recursed into element-wise.
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        elif isinstance(obj, np.ndarray):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_atis_dataset(args.train_dataset, label_list, tokenizer, args.probabilistic_masks, args.linearize)
    eval_dataset = load_atis_dataset(args.eval_dataset, label_list, tokenizer, args.probabilistic_masks, args.linearize, plot=False)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    # (+2 leaves room for the start and classify special tokens).
    max_length = model.config.n_positions - 2
    input_length = max(len(utt[:max_length]) + 2  \
                           for dataset in encoded_datasets for utt, _, _, _, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids, len(label_list))
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        # No weight decay on biases and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        results = []
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                # NOTE(review): attn_bias is presumably a model-specific
                # attention-bias tensor; it is only passed when --attn_bias
                # is set -- confirm semantics against the model definition.
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)
                # loss = args.lm_coef * losses[0] + losses[1]
                # Only the classification-head loss is optimized; the LM loss
                # (losses[0]) is deliberately left out above.
                loss = losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7*exp_average_loss+0.3*loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

            # Per-epoch evaluation on the dev set.
            model.eval()
            eval_loss = 0
            nb_eval_steps, nb_eval_examples = 0, 0
            all_logits, all_labels = [], []
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
                with torch.no_grad():
                    # Two forward passes: one with labels to get the loss,
                    # one without to get the raw classification logits.
                    _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)
                    _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)

                mc_logits = mc_logits.detach().cpu().numpy()
                mc_labels = mc_labels.to('cpu').numpy()

                eval_loss += mc_loss.mean().item()
                all_logits.append(mc_logits)
                all_labels.append(mc_labels)

                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1

            eval_loss = eval_loss / nb_eval_steps
            all_logits = np.concatenate(all_logits, axis=0)
            all_labels = np.concatenate(all_labels, axis=0)
            eval_f1 = f1(all_logits, all_labels)
            # NOTE(review): dividing by nb_eval_examples assumes accuracy()
            # returns a summed correct-count over the batch -- confirm.
            eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
            train_loss = tr_loss/nb_tr_steps if args.do_train else None
            result = {'eval_loss': eval_loss,
                      'eval_f1': eval_f1,
                      'eval_accuracy': eval_acc,
                      'train_loss': train_loss}
            print(result)
            results.append(result)

        # One CSV row of metrics per epoch.
        with open(os.path.join(args.output_dir, "log.csv"), "w") as csvfile:
            writer = csv.DictWriter(
                csvfile,
                ["train_loss", "eval_loss", "eval_accuracy", "eval_f1"]
            )
            writer.writeheader()
            writer.writerows(results)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsClsModel.from_pretrained(
            args.output_dir,num_labels=len(label_list), num_special_tokens=len(special_tokens))
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        all_logits, all_labels = [], []
        # Per-example predictions (thresholded logits) and gold labels.
        fw = open("prediction.txt", "w")
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, pos_ids, attn_bias, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)
                _, mc_logits = model(input_ids, mc_token_ids, position_ids=pos_ids, attn_bias=attn_bias if args.attn_bias else None)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()

            # Multi-label decision: a label is predicted when its logit >= 0.5.
            for i, (o, l) in enumerate(zip((mc_logits>=0.5).astype(np.int32), mc_labels.astype(np.int32))):
                # if np.any(o != l):
                # pred = [label_list[idx] for idx, val in enumerate(o) if val == 1]
                # true = [label_list[idx] for idx, val in enumerate(l) if val == 1]
                pred = o
                true = l
                fw.write(f"{eval_dataset[nb_eval_examples+i][0]}\n{pred}\n{true}\n\n")

            eval_loss += mc_loss.mean().item()
            all_logits.append(mc_logits)
            all_labels.append(mc_labels)

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        fw.close()
        eval_loss = eval_loss / nb_eval_steps
        all_logits = np.concatenate(all_logits, axis=0)
        all_labels = np.concatenate(all_labels, axis=0)
        eval_f1 = f1(all_logits, all_labels)
        # NOTE(review): same accuracy()/nb_eval_examples assumption as above.
        eval_acc = accuracy(all_logits, all_labels) / nb_eval_examples
        train_loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_f1': eval_f1,
                  'eval_accuracy': eval_acc,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
# Example #15
# 0
def _grouped_weight_decay_params(model):
    """Split *model*'s parameters into decayed / non-decayed groups.

    Biases and LayerNorm parameters are excluded from weight decay,
    following the standard BERT/GPT fine-tuning recipe. Returns the
    parameter-group list expected by BertAdam / OpenAIAdam.
    """
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    named_params = list(model.named_parameters())
    return [{
        'params': [
            p for n, p in named_params
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.01
    }, {
        'params': [
            p for n, p in named_params
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay': 0.0
    }]


def _num_optimization_steps(train_loader, config):
    """Total number of optimizer updates across all epochs.

    Ceil over (dataset size / batch size / accumulation steps), times the
    number of epochs — used as ``t_total`` for warmup scheduling.
    """
    return np.ceil(
        len(train_loader.dataset) / config.batch_size /
        config.accumulation_steps) * config.epochs


def train(seed, depth, maxlen, batch_size, accumulation_steps, model_name):
    """Fine-tune a multi-output toxicity classifier.

    Mutates the module-level ``config`` with the run hyper-parameters,
    builds data loaders, the model and optimizer for ``model_name``
    ('bert', 'gpt2' or 'xlnet'), then trains via catalyst's ModelRunner
    with apex O1 mixed precision and DataParallel.

    :param seed: RNG seed for numpy / torch / cudnn determinism.
    :param depth: transformer depth; 24 selects the large-hidden weights.
    :param maxlen: max sequence length (also selects the feature dir).
    :param batch_size: per-step batch size.
    :param accumulation_steps: gradient accumulation factor.
    :param model_name: one of 'bert', 'gpt2', 'xlnet'.
    :raises NotImplementedError: for any other model name.
    """
    config.seed = seed
    config.max_sequence_length = maxlen
    config.batch_size = batch_size
    config.accumulation_steps = accumulation_steps
    # 24-layer checkpoints ship with the larger hidden size / head count.
    if depth != 24:
        config.bert_weight = f"../bert_weight/uncased_L-{depth}_H-768_A-12/"
    else:
        config.bert_weight = f"../bert_weight/uncased_L-{depth}_H-1024_A-16/"
    if model_name == 'bert':
        config.features = f"../bert_features_{maxlen}/"
    elif model_name == 'gpt2':
        config.features = f"../features_{maxlen}_gpt/"
    else:
        config.features = f"../features_{maxlen}_xlnet/"
    config.experiment = f"{depth}layers"
    config.checkpoint = f"{config.logdir}/{config.today}/{model_name}_{config.experiment}_" \
                        f"{config.batch_size}bs_{config.accumulation_steps}accum_{config.seed}seed_{config.max_sequence_length}/"

    print_config(config)

    # Reproducibility: seed every RNG the run touches.
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    torch.cuda.manual_seed(config.seed)
    torch.backends.cudnn.deterministic = True

    # Data loaders
    train_loader, valid_loader, valid_df, loss_weight = get_data_loaders(
        config)
    loaders = {"train": train_loader, "valid": valid_loader}

    # Criterion
    criterion = CustomLoss(loss_weight)

    # Model and optimizer (parameter grouping and step count are shared
    # across the three backbones — see the helpers above).
    if model_name == 'bert':
        print("BERT MODEL")
        model = BertForTokenClassificationMultiOutput2.from_pretrained(
            config.bert_weight,
            cache_dir=None,
            num_aux_labels=config.n_aux_targets)
        optimizer = BertAdam(
            _grouped_weight_decay_params(model),
            lr=config.lr,
            warmup=0.01,
            t_total=_num_optimization_steps(train_loader, config))

    elif model_name == 'gpt2':
        print("GPT2 MODEL")
        model = GPT2ClassificationMultioutput.from_pretrained(
            config.gpt2_weight,
            cache_dir=None,
            num_aux_labels=config.n_aux_targets)
        optimizer = OpenAIAdam(
            _grouped_weight_decay_params(model),
            lr=config.lr,
            warmup=0.01,
            t_total=_num_optimization_steps(train_loader, config))

    elif model_name == 'xlnet':
        model = XLNetWithMultiOutput.from_pretrained(
            config.xlnet_weight,
            clf_dropout=0.4,
            n_class=6
            # num_aux_labels=config.n_aux_targets
        )
        optimizer = OpenAIAdam(
            _grouped_weight_decay_params(model),
            lr=config.lr,
            warmup=0.01,
            t_total=_num_optimization_steps(train_loader, config))
    else:
        # BUG FIX: previously `raise ("Model is not implemented")`, which
        # raises a TypeError (str is not an exception) instead of a
        # meaningful error.
        raise NotImplementedError(f"Model '{model_name}' is not implemented")

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer)
    model = model.cuda()

    from apex import amp
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

    # if distributed_rank > -1:
    # from apex.parallel import DistributedDataParallel
    # model = DistributedDataParallel(model)
    model = torch.nn.DataParallel(model)

    if config.resume:
        checkpoint = torch.load(config.checkpoint + "/checkpoints/best.pth")
        # Saved weights are for the bare model; re-key them for the
        # DataParallel wrapper before loading.
        # (Removed leftover `pdb.set_trace()` that halted every resume.)
        new_state_dict = {}
        old_state_dict = checkpoint['model_state_dict']
        for k, v in old_state_dict.items():
            new_state_dict["module." + k] = v
        model.load_state_dict(new_state_dict)
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        criterion.load_state_dict(checkpoint['criterion_state_dict'])
        print("!!! Loaded checkpoint ",
              config.checkpoint + "/checkpoints/best.pth")

    # Bias-AUC validation callback over the identity subgroups.
    identity_valid = valid_df[config.identity_columns].copy()
    target_valid = valid_df.target.values
    auc_callback = AucCallback(identity=identity_valid, target=target_valid)

    checkpoint_callback = IterationCheckpointCallback(
        save_n_last=2000,
        num_iters=10000,
    )

    # model runner
    runner = ModelRunner()

    # model training
    runner.train(model=model,
                 criterion=criterion,
                 optimizer=optimizer,
                 scheduler=scheduler,
                 loaders=loaders,
                 main_metric='auc',
                 minimize_metric=False,
                 logdir=config.checkpoint,
                 num_epochs=config.epochs,
                 verbose=True,
                 fp16={"opt_level": "O1"},
                 callbacks=[auc_callback, checkpoint_callback])
Example #16
0
def main():
    """Fine-tune and/or evaluate an OpenAI GPT double-heads model on
    SWAG/CODAH-style multiple-choice data.

    With ``--do_train``, trains on ``--train_filename`` and saves the model
    to ``output_dir/pytorch_model.bin``; with ``--do_eval``, evaluates on
    ``--eval_filename`` and writes ``eval_results.txt`` plus a JSON dump of
    per-example logits/labels to ``--model_labels_save_filename``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .csv files (or other data files) for the task."
    )
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--answer_only",
        default=False,
        action='store_true',
        help="Whether to run with answers only (blank out question).")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument(
        "--load_model_from",
        default=None,
        type=str,
        help=
        "The saved model file to load before doing any training or eval (if both --do_train and --do_eval are specified, the saved model will be loaded, then trained, then the trained model will be evaluated)."
    )
    parser.add_argument(
        '--train_filename',
        type=str,
        default='train.csv',
        help="Filename to load train data from (relative to data_dir)")
    parser.add_argument(
        '--eval_filename',
        type=str,
        default='val.csv',
        help="File to load eval data from (relative to data_dir)")
    parser.add_argument(
        '--data_format',
        type=str,
        choices=['swag', 'codah'],
        default='swag',
        help=
        "Format of the train and eval files (original SWAG CSV format vs our TSV format)"
    )
    parser.add_argument(
        '--model_labels_save_filename',
        type=str,
        default='model_labels.json',
        help=
        "JSON file to save model outputs/labels to (relative to output_dir)")
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=32)
    parser.add_argument('--eval_batch_size', type=int, default=8)
    # BUG FIX: was type=int, which rejected fractional clipping norms such
    # as 0.5; OpenAIAdam's max_grad_norm is a float quantity.
    parser.add_argument('--max_grad_norm', type=float, default=1.0)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    # NOTE(review): --lr_schedule and --n_valid are parsed but unused in
    # this function — presumably consumed elsewhere or leftover; confirm.
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=8,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))
    # The requested size is the *effective* batch; each forward pass uses
    # train_batch_size / gradient_accumulation_steps examples.
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Eval-only runs default to the model saved by a previous training run.
    if args.do_eval and (not args.do_train) and args.load_model_from is None:
        args.load_model_from = os.path.join(args.output_dir,
                                            'pytorch_model.bin')

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))

    config = model.config
    if args.load_model_from:
        model_state_dict = torch.load(args.load_model_from)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
    model.to(device)

    # Load and encode the datasets
    logger.info("Loading datasets...")
    datasets = []
    dataset_keys = dict()
    if args.do_train:
        train_dataset = read_swag_examples(os.path.join(
            args.data_dir, args.train_filename),
                                           is_training=True,
                                           answer_only=args.answer_only,
                                           data_format=args.data_format)
        train_dataset = [
            EncodedSwagExample(ex, tokenizer)
            for ex in tqdm(train_dataset, desc='Encoding train')
        ]
        dataset_keys['train'] = len(datasets)
        datasets.append(train_dataset)

    if args.do_eval:
        eval_dataset = read_swag_examples(os.path.join(args.data_dir,
                                                       args.eval_filename),
                                          is_training=True,
                                          answer_only=args.answer_only,
                                          data_format=args.data_format)
        eval_dataset = [
            EncodedSwagExample(ex, tokenizer)
            for ex in tqdm(eval_dataset, desc='Encoding eval')
        ]
        dataset_keys['eval'] = len(datasets)
        datasets.append(eval_dataset)

    # Compute the max input length for the Transformer:
    # context + start-of-ending + longest ending + 3 special tokens,
    # each piece truncated to half the position budget minus two.
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(swagex.context_tokens[:max_length]) + len(swagex.start_ending_tokens[:max_length]) + max(len(ending[:max_length]) for ending in swagex.endings_tokens) + 3  \
                           for dataset in datasets for swagex in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model
    print('---')
    print('Input length: {}\n'.format(input_length))
    print('---')

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(datasets, input_length, max_length,
                                           *special_tokens_ids)
    if args.do_train:
        train_tensor_dataset = tensor_datasets[dataset_keys['train']]
    if args.do_eval:
        eval_tensor_dataset = tensor_datasets[dataset_keys['eval']]

    # Prepare optimizer
    if args.do_train:
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        # Exclude biases and LayerNorm parameters from weight decay.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        #num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
        num_train_optimization_steps = int(
            len(train_data) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                # Combined objective: weighted LM loss + multiple-choice loss.
                loss = args.lm_coef * losses[0] + losses[1]

                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_steps += 1
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

    # Save a trained model
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self
        torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        eval_data = TensorDataset(*eval_tensor_dataset)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        # Load a trained model that you have fine-tuned
        if args.do_train:
            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTDoubleHeadsModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
        model.eval()

        all_model_outputs = []
        data_index = 0

        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                # Two forward passes: one with labels for the loss, one
                # without for the raw classification logits.
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            # Record per-example outputs for later error analysis.
            for i in range(input_ids.size(0)):
                output_obj = dict()
                output_obj['logits'] = [float(x) for x in mc_logits[i]]
                output_obj['true_label'] = int(mc_labels[i])
                output_obj['model_label'] = int(np.argmax(mc_logits[i]))
                output_obj['swag_data'] = datasets[
                    dataset_keys['eval']][data_index].raw_example.to_dict()
                all_model_outputs.append(output_obj)
                data_index += 1

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        with open(
                os.path.join(args.output_dir, args.model_labels_save_filename),
                'w') as f:
            json.dump(all_model_outputs, f)
Example #17
0
    def train(self, which_to_train, num_epochs=10):
        """Jointly fine-tune model_A/model_B plus the selected classifier heads.

        Args:
            which_to_train: collection of head names to optimize; any of
                "A", "B", "TF". The two dialogue models are always tuned.
            num_epochs: number of training epochs.

        After each epoch the heads are validated; whenever any head reaches
        a new best validation F1, one combined checkpoint of all five state
        dicts is written under Checkpoint_act_clf/.
        """
        # Optimizer: dialogue models plus whichever heads were requested.
        param_optimizer = list(self.model_A.named_parameters()) + \
                          list(self.model_B.named_parameters())
        if "A" in which_to_train:
            print("clf_A to optimize")
            param_optimizer += list(self.clf_A.named_parameters())
        if "B" in which_to_train:
            print("clf_B to optimize")
            param_optimizer += list(self.clf_B.named_parameters())
        if "TF" in which_to_train:
            print("clf_TF to optimize")
            param_optimizer += list(self.clf_TF.named_parameters())

        # Standard recipe: no weight decay on biases / LayerNorm params.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        num_train_optimization_steps = len(self.train_dataset) * num_epochs // self.batch_size // self.num_gradients_accumulation

        self.optimizer = OpenAIAdam(optimizer_grouped_parameters,
                            lr=2e-5,
                            warmup=0.1,
                            max_grad_norm=1.0,
                            weight_decay=0.01,
                            t_total=num_train_optimization_steps)

        update_count = 0
        progress_bar = tqdm.tqdm
        start = time.time()
        # Best-so-far validation metrics per head.
        best_acc_A = -float('Inf')
        best_f1_A = -float('Inf')
        best_acc_B = -float('Inf')
        best_f1_B = -float('Inf')
        best_acc_TF = -float('Inf')
        best_f1_TF = -float('Inf')

        for ep in tqdm.tqdm(range(num_epochs)):

            # set train mode
            self.model_A.train()
            self.model_B.train()
            self.clf_A.train()
            self.clf_B.train()
            self.clf_TF.train()

            # Training
            pbar = progress_bar(self.train_dataloader)
            train_dataloader_TF_list = list(self.train_dataloader_TF)

            for i, batch in enumerate(pbar):
                batch = batch[0]
                # Cycle through the (smaller) TF loader alongside the main one.
                batch_TF = train_dataloader_TF_list[i % len(train_dataloader_TF_list)]

                record_loss = self.train_one_iter(batch, batch_TF, update_count, which_to_train, fp16=False)
                update_count += 1

                if update_count % self.num_gradients_accumulation == self.num_gradients_accumulation - 1:
                    # update for gradient accumulation
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                    # speed measure
                    end = time.time()
                    speed = self.batch_size * self.num_gradients_accumulation / (end - start)
                    start = end

                    # show progress
                    pbar.set_postfix(loss=record_loss, speed=speed)

            # Evaluation
            self.model_A.eval()
            self.model_B.eval()
            self.clf_A.eval()
            self.clf_B.eval()
            self.clf_TF.eval()

            (val_acc_A, val_f1_A), (val_acc_B, val_f1_B), (val_acc_TF, val_f1_TF) = self.validate(self.val_dataloader, self.val_dataloader_TF, ep, which_to_train)
            print(f"A: val f1: {val_f1_A}, valid acc: {val_acc_A}")
            print(f"B: val f1: {val_f1_B}, valid acc: {val_acc_B}")
            print(f"TF: val f1: {val_f1_TF}, valid acc: {val_acc_TF}")
            is_best_so_far_TF = val_f1_TF > best_f1_TF
            is_best_so_far_A = val_f1_A > best_f1_A
            # BUG FIX: previously compared val_f1_TF against best_f1_B, so
            # head B's best was tracked with the wrong metric.
            is_best_so_far_B = val_f1_B > best_f1_B

            if is_best_so_far_TF:
                best_acc_TF = val_acc_TF
                best_f1_TF = val_f1_TF
            if is_best_so_far_A:
                best_acc_A = val_acc_A
                best_f1_A = val_f1_A
            if is_best_so_far_B:
                best_acc_B = val_acc_B
                best_f1_B = val_f1_B

            # Save a single combined checkpoint when any head improved
            # (the original SAVED flag already saved at most once per epoch,
            # with an identical payload and filename in all three branches).
            if is_best_so_far_TF or is_best_so_far_A or is_best_so_far_B:
                torch.save((self.model_A.state_dict(), self.model_B.state_dict(),
                            self.clf_A.state_dict(), self.clf_B.state_dict(),
                            self.clf_TF.state_dict()),
                            f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth")

        print("finally")
        print("A: \nbest acc: {}, best f1: {}".format(best_acc_A, best_f1_A))
        print("B: \nbest acc: {}, best f1: {}".format(best_acc_B, best_f1_B))
        print("TF: \nbest acc: {}, best f1: {}".format(best_acc_TF, best_f1_TF))
Example #18
0
class ModelClassifier(object):
    def __init__(self, config, which_to_train, model_A, model_B, tokenizer, device1, device2):
        """Build three classification heads on top of two dialogue language models.

        Args:
            config: transformer config shared by the SequenceSummary heads.
            which_to_train: accepted for symmetry with train(); not used here.
            model_A: language model for speaker A (also reused by the TF task).
            model_B: language model for speaker B.
            tokenizer: shared tokenizer; must expose cls_token_id.
            device1: device that hosts all three classifier heads.
            device2: device for model_B's inputs (see train_one_iter).
        """
        # config.num_labels = le.classes_.shape[0]
        # label encode
        # super().__init__()
        self.config = config
        # Label encoders mapping dialogue-act strings <-> class indices.
        self.le_A = load_pkl("training/data/labelencoder_A.pkl")
        self.le_B = load_pkl("training/data/labelencoder_B.pkl")

        # One head per task; clf_TF is a binary pick/don't-pick head.
        self.clf_A = SequenceSummary(num_labels=self.le_A.classes_.shape[0], config=config)
        self.clf_B = SequenceSummary(num_labels=self.le_B.classes_.shape[0], config=config)
        self.clf_TF = SequenceSummary(num_labels=2, config=config)
        
        # self.apply(self.init_weight)
        # Cached LM past and dialogue history used by set_past()/predict().
        self.past = None
        self.history = []

        # model
        self.model_A = model_A
        self.model_B = model_B
        self.tokenizer = tokenizer
        self.cls_token_id = tokenizer.cls_token_id
        self.device1 = device1
        self.device2 = device2

        # All classifier heads live on device1; the LMs keep their own devices.
        self.to_device(self.device1)

        # define loss
        self.criterion = nn.CrossEntropyLoss()
        
        # optimizer parameters
        self.num_gradients_accumulation = 1
        self.batch_size = 1
        self.batch_size_TF = 8

        # load training data
        self.load_data()

    def reload(self):
        self.past = None
        self.history = []

    def to_device(self, device):
        # to device
        self.clf_A = self.clf_A.to(device)
        self.clf_B = self.clf_B.to(device)
        self.clf_TF = self.clf_TF.to(device)

        self.clf_A.device = device
        self.clf_B.device = device
        self.clf_TF.device = device
        # self.model_A = self.model_A.to(self.device)
        # self.model_B = self.model_B.to(self.device)

    def load_data(self):
        """Load the act-classification and TF (pick/don't-pick) corpora and build loaders.

        Creates train/val PersuadeDataset loaders (batch_size) and train/val
        TFDataset loaders (batch_size_TF); training loaders shuffle, validation
        loaders do not.
        """
        # load training data
        self.train_data = load_pkl("training/data/train_data.pkl")
        self.val_data = load_pkl("training/data/val_data.pkl")
        self.train_data_TF, self.val_data_TF = torch.load("demonstration/old_model/demonstration_train_with_text_only.pkl", map_location="cpu"), \
                                                torch.load("demonstration/old_model/demonstration_val_with_text_only.pkl", map_location="cpu")

        self.train_dataset = PersuadeDataset(self.train_data, self.tokenizer)
        self.val_dataset = PersuadeDataset(self.val_data, self.tokenizer)

        self.train_dataset_TF, self.val_dataset_TF = TFDataset(self.train_data_TF, self.tokenizer), \
                                                        TFDataset(self.val_data_TF, self.tokenizer)

        self.train_dataloader = DataLoader(dataset=self.train_dataset, 
                                            shuffle=True, 
                                            batch_size=self.batch_size, 
                                            collate_fn=self.train_dataset.collate)
        # FIX: this loader previously used train_dataset.collate; use the
        # validation dataset's own collate, consistent with the TF loaders.
        self.val_dataloader = DataLoader(dataset=self.val_dataset, 
                                            shuffle=False, 
                                            batch_size=self.batch_size, 
                                            collate_fn=self.val_dataset.collate)

        self.train_dataloader_TF = DataLoader(dataset=self.train_dataset_TF, 
                                            shuffle=True, 
                                            batch_size=self.batch_size_TF, 
                                            collate_fn=self.train_dataset_TF.collate)
        self.val_dataloader_TF = DataLoader(dataset=self.val_dataset_TF, 
                                            shuffle=False, 
                                            batch_size=self.batch_size_TF, 
                                            collate_fn=self.val_dataset_TF.collate)


    def load_model(self, all_model_dir=None, clf_A_dir=None, clf_B_dir=None, clf_TF_dir=None):
        if all_model_dir is None:
            if clf_A_dir:
                clf_A_state = torch.load(clf_A_dir)
                self.clf_A.load_state_dict(clf_A_state)
                print(f"clf_A loaded")

            if clf_B_dir:
                clf_B_state = torch.load(clf_B_dir)
                self.clf_B.load_state_dict(clf_B_state)
                print(f"clf_B loaded")

            if clf_TF_dir:
                clf_TF_state = torch.load(clf_TF_dir)
                self.clf_TF.load_state_dict(clf_TF_state)
                print(f"clf_TF loaded")
        else:
            model_A_state, model_B_state, clf_A_state, clf_B_state, clf_TF_state = torch.load(all_model_dir)
            self.model_A.load_state_dict(model_A_state)
            self.model_B.load_state_dict(model_B_state)
            self.clf_A.load_state_dict(clf_A_state)
            self.clf_B.load_state_dict(clf_B_state)
            self.clf_TF.load_state_dict(clf_TF_state)
            print(f"all models loaded")


    def train(self, which_to_train, num_epochs=10):
        """Jointly fine-tune both LMs plus the selected classifier heads.

        Args:
            which_to_train: container/string; membership of "A", "B", "TF"
                decides which classifier heads join the optimizer (both LMs
                are always optimized).
            num_epochs: passes over the dialogue-act training set.

        After each epoch the model is validated; a combined checkpoint of all
        five state dicts is written whenever any task reaches a new best F1.
        """
        # Both LMs always receive gradients; heads are opt-in per task.
        param_optimizer = list(self.model_A.named_parameters()) + \
                          list(self.model_B.named_parameters())
        if "A" in which_to_train:
            print("clf_A to optimize")
            param_optimizer += list(self.clf_A.named_parameters())
        if "B" in which_to_train:
            print("clf_B to optimize")
            param_optimizer += list(self.clf_B.named_parameters())
        if "TF" in which_to_train:
            print("clf_TF to optimize")
            param_optimizer += list(self.clf_TF.named_parameters())

        # Standard transformer fine-tuning: no weight decay on bias/LayerNorm.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
            ]

        num_train_optimization_steps = len(self.train_dataset) * num_epochs // self.batch_size // self.num_gradients_accumulation

        self.optimizer = OpenAIAdam(optimizer_grouped_parameters,
                            lr=2e-5,
                            warmup=0.1,
                            max_grad_norm=1.0,
                            weight_decay=0.01,
                            t_total=num_train_optimization_steps)

        update_count = 0
        progress_bar = tqdm.tqdm
        start = time.time()
        best_acc_A = -float('Inf')
        best_f1_A = -float('Inf')
        best_acc_B = -float('Inf')
        best_f1_B = -float('Inf')
        best_acc_TF = -float('Inf')
        best_f1_TF = -float('Inf')

        for ep in tqdm.tqdm(range(num_epochs)):

            # Training: set every sub-module to train mode.
            self.model_A.train()
            self.model_B.train()
            self.clf_A.train()
            self.clf_B.train()
            self.clf_TF.train()

            pbar = progress_bar(self.train_dataloader)
            # The TF loader is smaller: cycle through it alongside the act data.
            train_dataloader_TF_list = list(self.train_dataloader_TF)

            for i, batch in enumerate(pbar):
                batch = batch[0]
                batch_TF = train_dataloader_TF_list[i % len(train_dataloader_TF_list)]

                record_loss = self.train_one_iter(batch, batch_TF, update_count, which_to_train, fp16=False)
                update_count += 1

                if update_count % self.num_gradients_accumulation == self.num_gradients_accumulation - 1:
                    # One optimizer step per accumulation window.
                    self.optimizer.step()
                    self.optimizer.zero_grad()

                    # Examples/sec since the last optimizer step.
                    end = time.time()
                    speed = self.batch_size * self.num_gradients_accumulation / (end - start)
                    start = end

                    pbar.set_postfix(loss=record_loss, speed=speed)

            # Evaluation.
            self.model_A.eval()
            self.model_B.eval()
            self.clf_A.eval()
            self.clf_B.eval()
            self.clf_TF.eval()

            (val_acc_A, val_f1_A), (val_acc_B, val_f1_B), (val_acc_TF, val_f1_TF) = self.validate(self.val_dataloader, self.val_dataloader_TF, ep, which_to_train)
            print(f"A: val f1: {val_f1_A}, valid acc: {val_acc_A}")
            print(f"B: val f1: {val_f1_B}, valid acc: {val_acc_B}")
            print(f"TF: val f1: {val_f1_TF}, valid acc: {val_acc_TF}")
            is_best_so_far_TF = val_f1_TF > best_f1_TF
            is_best_so_far_A = val_f1_A > best_f1_A
            # BUG FIX: this previously compared val_f1_TF against best_f1_B,
            # so task B's best tracking followed the TF score instead of B's.
            is_best_so_far_B = val_f1_B > best_f1_B

            if is_best_so_far_TF:
                best_acc_TF = val_acc_TF
                best_f1_TF = val_f1_TF
            if is_best_so_far_A:
                best_acc_A = val_acc_A
                best_f1_A = val_f1_A
            if is_best_so_far_B:
                best_acc_B = val_acc_B
                best_f1_B = val_f1_B

            # The original three branches saved an identical payload to an
            # identical path under a SAVED guard (at most one save per epoch);
            # a single combined check is equivalent.
            if is_best_so_far_TF or is_best_so_far_A or is_best_so_far_B:
                torch.save((self.model_A.state_dict(), self.model_B.state_dict(),
                            self.clf_A.state_dict(), self.clf_B.state_dict(),
                            self.clf_TF.state_dict()),
                           f"Checkpoint_act_clf/epoch{ep}_multitask_TF_best_acc_{val_acc_TF}_f1_{val_f1_TF}_A_acc_{val_acc_A}_f1_{val_f1_A}_B_acc_{val_acc_B}_f1_{val_f1_B}.pth")

        print("finally")
        print("A: \nbest acc: {}, best f1: {}".format(best_acc_A, best_f1_A))
        print("B: \nbest acc: {}, best f1: {}".format(best_acc_B, best_f1_B))
        print("TF: \nbest acc: {}, best f1: {}".format(best_acc_TF, best_f1_TF))

    def validate(self, dataloader, dataloader_TF, ep, which_to_train):
        """Evaluate all three tasks (acts A, acts B, TF pick) for one epoch.

        Writes per-task CSVs of (sent, y_true, y_pred) under Checkpoint_act_clf/
        and returns ((acc_A, f1_A), (acc_B, f1_B), (acc_TF, f1_TF)).
        """
        from sklearn.metrics import f1_score
        from sklearn.metrics import confusion_matrix
        from utils import print_cm

        # evaluation mode
        self.model_A.eval()
        self.model_B.eval()
        self.clf_A.eval()
        self.clf_B.eval()
        self.clf_TF.eval()

        def get_numbers_for_one_task(sents, logits, acts, x, y_true, y_pred, total, correct):
            # Accumulate one task's predictions/labels into the running lists.
            # NOTE(review): acts is assumed to carry a leading batch dim of 1
            # (hence tolist()[0]) — confirm against train_one_iter's unsqueeze(0).
            _, predicted_acts = torch.max(logits, 1)
        
            x.extend(sents)
            y_true.extend(acts.tolist()[0])
            y_pred.extend(predicted_acts.tolist())

            total += len(acts.tolist()[0])
            correct += (predicted_acts == acts).sum().item()

            return x, y_true, y_pred, total, correct

        progress_bar = tqdm.tqdm

        with torch.no_grad():
            pbar = progress_bar(dataloader)
            dataloader_TF_list = list(dataloader_TF)
            # correct/total below are never read; the per-task counters follow.
            correct = 0
            total = 0
            x_A, y_true_A, y_pred_A, correct_A, total_A = [], [], [], 0, 0
            x_B, y_true_B, y_pred_B, correct_B, total_B = [], [], [], 0, 0
            x_TF, y_true_TF, y_pred_TF, correct_TF, total_TF = [], [], [], 0, 0

            for i, batch in enumerate(pbar):
                batch = batch[0]
                # Cycle the (smaller) TF loader alongside the act loader.
                batch_TF = dataloader_TF_list[i%len(dataloader_TF_list)]
                # if sum([len(item) for item in batch[1]]) > 1024:
                #     continue

                # train_one_iter in validation mode returns raw sents/logits/acts
                # for all three tasks instead of a loss.
                sents_A, logits_A, acts_A,\
                sents_B, logits_B, acts_B,\
                sents_TF, logits_TF, acts_TF = self.train_one_iter(batch, batch_TF, None, which_to_train, fp16=False, 
                                                   is_validation=True)
                
                x_A, y_true_A, y_pred_A, total_A, correct_A = get_numbers_for_one_task(sents_A, logits_A, acts_A,\
                                                                                       x_A, y_true_A, y_pred_A, total_A, correct_A)
                x_B, y_true_B, y_pred_B, total_B, correct_B = get_numbers_for_one_task(sents_B, logits_B, acts_B,\
                                                                                       x_B, y_true_B, y_pred_B, total_B, correct_B)
                x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF = get_numbers_for_one_task(sents_TF, logits_TF, acts_TF,\
                                                                                       x_TF, y_true_TF, y_pred_TF, total_TF, correct_TF)

            # Weighted F1 for the multi-class act tasks; binary F1 for TF.
            f1_A = f1_score(y_true_A, y_pred_A, average="weighted")
            f1_B = f1_score(y_true_B, y_pred_B, average="weighted")
            f1_TF = f1_score(y_true_TF, y_pred_TF, average="binary")
            # pdb.set_trace()
            
            # Dump per-example results; act indices are mapped back to strings.
            pd.DataFrame(zip(x_A, self.le_A.inverse_transform(y_true_A).tolist(), self.le_A.inverse_transform(y_pred_A).tolist()),
                        columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/A/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"A: Epoch {ep} Validation accuracy: {correct_A/total_A}, f1: {f1_A}")
            
            pd.DataFrame(zip(x_B, self.le_B.inverse_transform(y_true_B).tolist(), self.le_B.inverse_transform(y_pred_B).tolist()),
                        columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/B/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"B: Epoch {ep} Validation accuracy: {correct_B/total_B}, f1: {f1_B}")
            
            pd.DataFrame(zip(x_TF, y_true_TF, y_pred_TF),
                        columns=['sent', 'y_true', 'y_pred']).to_csv(f"Checkpoint_act_clf/TF/act_classifier_val_results_epoch{ep}.csv", index=None)
            print(f"TF: Epoch {ep} Validation accuracy: {correct_TF/total_TF}, f1: {f1_TF}")
            # print_cm(confusion_matrix(y_true, y_pred, labels=range(len(le.classes_))), labels=[l[:] for l in le.classes_.tolist()])
            return (correct_A/total_A, f1_A), (correct_B/total_B, f1_B), (correct_TF/total_TF, f1_TF)

    def set_past(self, sent, which_task):
        """Feed one finished utterance through the task's LM to extend self.past.

        Args:
            sent: str, a whole utterance; any leading "A:"/"B:" prefix is
                stripped and the correct one re-attached per which_task.
            which_task: "A", "B" or "TF" ("TF" reuses model_A and prefix "A:").
        """
        # BUG FIX: this branch previously contained a leftover pdb.set_trace()
        # breakpoint, which froze any non-interactive run that passed a
        # prefixed sentence.  Strip the caller-supplied role prefix; the
        # correct one is re-added below.
        if sent.startswith("A:") or sent.startswith("B:"):
            sent = sent[2:]

        if which_task == "A":
            lm_model = self.model_A
            prefix = "A:"
            device = lm_model.device
        elif which_task == "B":
            lm_model = self.model_B
            prefix = "B:"
            device = lm_model.device
        elif which_task == "TF":
            # TF shares speaker A's language model and prefix.
            lm_model = self.model_A
            prefix = "A:"
            device = lm_model.device

        # Encode the prefixed sentence plus the turn-ending tokens.
        self.history.append(prefix + sent)
        sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent) + self.train_dataset.turn_ending
        sent = torch.LongTensor(sent).unsqueeze(0).to(device)

        # Run the LM once purely to advance the cached past.
        past = self.move_to_device(self.past, lm_model)
        _, past, _ = lm_model(sent, past)

        self.past = past
        

    def predict(self, separate_sents, which_task):
        """Predict labels for the given sentences under task "A", "B" or "TF".

        separate_sents: list of sentences with no role prefix.
        For "A"/"B": returns (list of predicted act label strings, updated past).
        For "TF": joins the sentences into one candidate and returns
        (0/1 pick decision, updated past).  Works on a local copy of
        self.past; the attribute itself is not advanced here.
        """
        past = self.past
        
        if which_task == "A":
            lm_model = self.model_A
            clf_head = self.clf_A
            le = self.le_A
            prefix = "A:"
            device = lm_model.device
        elif which_task == "B":
            lm_model = self.model_B
            clf_head = self.clf_B
            le = self.le_B
            prefix = "B:"
            device = lm_model.device
        elif which_task == "TF":
            # TF reuses speaker A's LM; le is not needed on this path.
            lm_model = self.model_A
            clf_head = self.clf_TF
            prefix = "A:"
            candidate_sent = " ".join(separate_sents)
            device = lm_model.device
        
        # evaluation mode
        self.model_A.eval()
        self.model_B.eval()
        self.clf_A.eval()
        self.clf_B.eval()
        self.clf_TF.eval()

        with torch.no_grad():
            if which_task in ["A", "B"]:
                all_logits = []
                for i, sent in enumerate(separate_sents):
                    # Only the first sentence of a turn carries the role prefix;
                    # later ones are joined with a leading space.
                    if i == 0:
                        sent = self.tokenizer.encode(prefix) + self.tokenizer.encode(sent)
                    else:
                        sent = self.tokenizer.encode(" "+sent)

                    # pdb.set_trace()
                    sent = torch.LongTensor(sent).unsqueeze(0).to(device)
                    past = self.move_to_device(past, lm_model)
                    logits, past, hidden_states = lm_model(sent, past)
                    
                    # encode [CLS]
                    # The [CLS] step is probed off `past` and its past is
                    # discarded, so it does not pollute the running context.
                    cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device)
                    _, _, hidden_states = lm_model(cls_token_tensor, past)
                    hidden_states = self.move_to_device(hidden_states, clf_head)
                    mc_logits = clf_head(hidden_states[-1], cls_index=None).squeeze(-1)
                    
                    all_logits.append(mc_logits)

                # finish tail
                end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(device)
                past = self.move_to_device(past, lm_model)
                _, past, _ = lm_model(end_input, past) 

                # get labels
                all_logits = torch.cat(all_logits, dim=0)
                # pdb.set_trace()
                _, predicted_acts = torch.max(all_logits, 1)
                predicted_acts = predicted_acts.tolist()
                # Map class indices back to human-readable act strings.
                predicted_acts = le.inverse_transform(predicted_acts).tolist()

                return predicted_acts, past
            elif which_task == "TF":
                # encode candidate
                candidate = self.tokenizer.encode(prefix) + self.tokenizer.encode(candidate_sent)
                # pdb.set_trace()
                candidate = torch.LongTensor(candidate).unsqueeze(0).to(device)
                past = self.move_to_device(past, self.model_A)
                logits, past, hidden_states = self.model_A(candidate, past)
                # encode [CLS]
                cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(device)
                _, _, hidden_states = self.model_A(cls_token_tensor, past)
                hidden_states = self.move_to_device(hidden_states, self.clf_TF)
                mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1)
                # pdb.set_trace()
                _, predicted_acts = torch.max(mc_logits, 1)
                predicted_acts = predicted_acts.tolist()
                # Single candidate in, single decision out.
                assert len(predicted_acts) == 1
                return predicted_acts[0], past

    def train_one_iter(self, batch, batch_TF, update_count, which_to_train, fp16=False, is_validation=False):
        """Process one dialogue batch (acts A/B) plus one TF batch.

        In training mode, sums the three cross-entropy losses, backprops, and
        returns the scalar loss; in validation mode, returns the raw
        (sents, logits, acts) triples for all three tasks instead.

        NOTE(review): the torch.cat calls near the end assume which_to_train
        enables both "A" and "B" (otherwise the logit lists are empty) —
        confirm intended usage.
        """
        # role_ids, whole_sents, separate_sents, acts = batch
        past = None
        all_sents_A, all_logits_A, all_acts_A = [], [], []
        all_sents_B, all_logits_B, all_acts_B = [], [], []
        # role_id 0 => speaker A turn (model_A, device1); else speaker B turn.
        for i, (role_id, whole_sent, separate_sents, acts) in enumerate(zip(*batch)):            
            if role_id == 0:
                whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device1)
                try:
                    assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "A:"
                except:
                    pdb.set_trace()
                if "A" in which_to_train:
                    # real_past: context after the WHOLE turn; the per-sentence
                    # loop below re-reads the turn piecewise for classification.
                    past = self.move_to_device(past, self.model_A)
                    _, real_past, _ = self.model_A(whole_sent, past)
                    for act, sent in zip(acts, separate_sents):
                        all_sents_A.append(self.tokenizer.decode(sent))
                        # pdb.set_trace()
                        # 'A:HI I would like to tell you About a childrens charity called Save the CHildren.'
                        sent = torch.LongTensor(sent).unsqueeze(0).to(self.device1)
                        past = self.move_to_device(past, self.model_A)
                        logits, past, hidden_states = self.model_A(sent, past)
                        
                        # pdb.set_trace()
                        # encode [CLS]
                        # Probe with [CLS] off `past`; its past is discarded.
                        cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1)
                        past = self.move_to_device(past, self.model_A)
                        _, _, hidden_states = self.model_A(cls_token_tensor, past)
                        
                        mc_logits = self.clf_A(hidden_states[-1], cls_index=None).squeeze(-1)
                        all_logits_A.append(mc_logits)
                        all_acts_A.append(act)
                    # pdb.set_trace()
                    # Continue the dialogue from the whole-turn past.
                    past = real_past
                    # # finish tail
                    # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device1)
                    # _, past, _ = self.model_A(end_input, past) 
                else:
                    past = self.move_to_device(past, self.model_A)
                    _, past, hidden_states = self.model_A(whole_sent, past)
            else:
                whole_sent = torch.LongTensor(whole_sent).unsqueeze(0).to(self.device2)
                try:
                    assert self.tokenizer.decode(whole_sent[0][:2].tolist()) == "B:"
                except:
                    pdb.set_trace()
                if "B" in which_to_train:
                    # Mirror of the A branch, on model_B / device2.
                    past = self.move_to_device(past, self.model_B)
                    _, real_past, _ = self.model_B(whole_sent, past)
                    for act, sent in zip(acts, separate_sents):
                        all_sents_B.append(self.tokenizer.decode(sent))
                        # pdb.set_trace()
                        #'B:ok please do'
                        sent = torch.LongTensor(sent).unsqueeze(0).to(self.device2)
                        past = self.move_to_device(past, self.model_B)
                        logits, past, hidden_states = self.model_B(sent, past)
                        
                        # encode [CLS]
                        cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device2)
                        _, _, hidden_states = self.model_B(cls_token_tensor, past)
                        
                        # clf heads live on device1 (see to_device in __init__).
                        hidden_states = self.move_to_device(hidden_states, self.clf_B)
                        mc_logits = self.clf_B(hidden_states[-1], cls_index=None).squeeze(-1)
                        all_logits_B.append(mc_logits)
                        all_acts_B.append(act)
                    past = real_past 
                    # finish tail
                    # end_input = torch.LongTensor(self.train_dataset.turn_ending).unsqueeze(0).to(self.device2)
                    # past = self.move_to_device(past, self.model_B)
                    # _, past, _ = self.model_B(end_input, past) 
                else:
                    past = self.move_to_device(past, self.model_B)
                    _, past, hidden_states = self.model_B(whole_sent, past)
        
        all_logits_A = torch.cat(all_logits_A, dim=0)
        all_acts_A = torch.tensor(all_acts_A).unsqueeze(0).to(self.device1)
        # pdb.set_trace()
        loss_A = self.criterion(all_logits_A.view(-1, all_logits_A.size(-1)), all_acts_A.view(-1))

        all_logits_B = torch.cat(all_logits_B, dim=0)
        # B logits come off clf_B, which was moved to device1 in __init__,
        # so the labels are placed on device1 as well.
        all_acts_B = torch.tensor(all_acts_B).unsqueeze(0).to(self.device1)

        loss_B = self.criterion(all_logits_B.view(-1, all_logits_B.size(-1)), all_acts_B.view(-1))
        
        # TF task
        # Each TF example is (contexts, candidate, pick_or_not); each dialogue
        # gets a fresh past, alternating model_A/model_B over context turns.
        all_contexts_candidate_TF = []
        all_logits_TF = []
        all_acts_TF = []
        for one_dial in batch_TF:
            past = None
            contexts, candidate, pick_or_not = one_dial
            all_contexts_candidate_TF.append((" ".join([self.tokenizer.decode(c) for c in contexts]), 
                                              self.tokenizer.decode(candidate)))
            
            # get past
            for i, context in enumerate(contexts):
                if i%2 == 0:
                    # pdb.set_trace()
                    #'A:Would you like to know more about the charity Save the Children?\n\n\n'
                    context = torch.LongTensor(context).unsqueeze(0).to(self.device1)
                    past = self.move_to_device(past, self.model_A)
                    logits, past, hidden_states = self.model_A(context, past)
                else:
                    # pdb.set_trace()
                    #'B:hello I am great.\n\n\n'
                    context = torch.LongTensor(context).unsqueeze(0).to(self.device2)
                    past = self.move_to_device(past, self.model_B)
                    logits, past, hidden_states = self.model_B(context, past)
            
            # encode candidate
            # pdb.set_trace()
            # "A:Save the Children is an international non-governmental organization that promotes children's rights, provides relief and helps support children in developing countries."
            candidate = torch.LongTensor(candidate).unsqueeze(0).to(self.device1)
            past = self.move_to_device(past, self.model_A)
            logits, past, hidden_states = self.model_A(candidate, past)
            # encode [CLS]
            cls_token_tensor = torch.LongTensor([self.cls_token_id]).unsqueeze(0).to(self.device1)
            _, _, hidden_states = self.model_A(cls_token_tensor, past)
            
            mc_logits = self.clf_TF(hidden_states[-1], cls_index=None).squeeze(-1)
            all_logits_TF.append(mc_logits)
            all_acts_TF.append(pick_or_not)

        all_logits_TF = torch.cat(all_logits_TF, dim=0)
        all_acts_TF = torch.tensor(all_acts_TF).unsqueeze(0).to(self.device1)

        loss_TF = self.criterion(all_logits_TF.view(-1, all_logits_TF.size(-1)), all_acts_TF.view(-1))
        
        if is_validation:
            # No backward pass: hand raw outputs back to validate().
            return all_sents_A, all_logits_A, all_acts_A,\
                   all_sents_B, all_logits_B, all_acts_B,\
                   all_contexts_candidate_TF, all_logits_TF, all_acts_TF

        loss = loss_A.to(self.device1) + loss_B.to(self.device1) + loss_TF.to(self.device1)

        # Scale for gradient accumulation; un-scale for reporting below.
        loss /= self.num_gradients_accumulation
        
        if fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        
        record_loss = loss.item() * self.num_gradients_accumulation
        
        return record_loss#, perplexity

    def move_to_device(self, past, target):
        if past is not None and target.device != past[0].device:
            past = [p.to(target.device) for p in past]
        return past
def train():
    """Load config, tokenizer and model, then report validation MC accuracy.

    Despite the name, this entry point does no optimizer stepping: it builds
    the model/optimizer (optionally FP16/distributed), prepares the data
    loaders, and prints multiple-choice accuracy over the validation set.
    """
    config_file = "configs/train_full_pipeline_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadLMEmotionRecognitionModel
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Evaluate multiple-choice accuracy on the validation set.
    model.eval()
    num_correct = 0
    # BUG FIX: the denominator used to be len(val_loader) — the number of
    # BATCHES — while the numerator counted correct EXAMPLES, so accuracy
    # was wrong for any batch size > 1.  Count examples on both sides.
    num_all = 0
    for batch in val_loader:
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids = batch

            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[
                1]  # So we can also use GPT2 outputs

            indices = torch.argmax(mc_logits, dim=1)

            correct = torch.eq(indices, mc_labels).view(-1)
            num_correct += torch.sum(correct).item()
            num_all += correct.numel()

    print(num_correct / num_all)
Example #20
0
def train(
    *,
    model,
    criterion,
    x_train,
    y_train,
    epochs,
    yield_steps,
    bucket,
    lr,
    batch_size: int,
    accumulation_steps: int,
    pad_idx: int,
):
    """Fine-tune *model* on (x_train, y_train), yielding periodic checkpoints.

    This is a generator. It yields ``(model, optimizer, epoch_pbar,
    smoothed_loss, examples_seen)`` once before any training, every
    ``yield_steps`` batches, and again at the end of each epoch.

    :param model: a ``BertForSequenceClassification`` or
        ``GPT2ClassificationHeadModel`` instance; anything else raises
        ``ValueError``.
    :param criterion: loss callable taking ``(y_pred, y_batch)``.
    :param x_train: token-id matrix convertible to a long tensor.
    :param y_train: target matrix convertible to a float tensor.
    :param epochs: number of passes over the training data.
    :param yield_steps: checkpoint interval, in batches.
    :param bucket: if true, use length-bucketed batching plus per-batch
        tensor trimming (via module-level ``BucketBatchSampler`` /
        ``trim_tensors``).
    :param lr: learning rate.
    :param batch_size: samples per batch.
    :param accumulation_steps: batches accumulated before each
        ``optimizer.step()``.
    :param pad_idx: padding token id used by the bucket sampler/trimmer.
    """
    train_dataset = TensorDataset(torch.tensor(x_train, dtype=torch.long),
                                  torch.tensor(y_train, dtype=torch.float))

    model.zero_grad()
    model = model.to(device)
    param_optimizer = list(model.named_parameters())

    # Total number of optimizer.step() calls over the whole run
    # (accumulation reduces the step count by that factor).
    num_train_optimization_steps = int(epochs * len(train_dataset) /
                                       (batch_size * accumulation_steps))
    if isinstance(model, BertForSequenceClassification):
        # Standard BERT recipe: no weight decay on biases and LayerNorm params.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.01
            },
            {
                'params': [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                'weight_decay':
                0.0
            },
        ]
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=lr,
                             warmup=0.05,
                             t_total=num_train_optimization_steps)
    elif isinstance(model, GPT2ClassificationHeadModel):
        optimizer = OpenAIAdam([p for _, p in param_optimizer],
                               lr=lr,
                               warmup=0.1,
                               t_total=num_train_optimization_steps)
    else:
        raise ValueError

    # Apex mixed-precision training ('O1' = conservative mixed precision).
    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level='O1',
                                      verbosity=0)
    model.train()

    if bucket:
        # Group similarly-sized sequences into batches to minimize padding.
        sampler = RandomSampler(train_dataset)
        batch_sampler = BucketBatchSampler(sampler,
                                           batch_size,
                                           drop_last=False,
                                           pad_idx=pad_idx)
        train_loader = DataLoader(train_dataset, batch_sampler=batch_sampler)
    else:
        train_loader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

    smoothed_loss = None
    step = 0
    epoch_pbar = tqdm.trange(epochs)

    def _state():
        # Snapshot handed to the caller; step * batch_size approximates the
        # number of training examples seen so far.
        return model, optimizer, epoch_pbar, smoothed_loss, step * batch_size

    print(f'Starting training for '
          f'{num_train_optimization_steps * accumulation_steps:,} steps, '
          f'checkpoint interval {yield_steps:,}')

    # Initial yield: lets the caller inspect/save state before training.
    yield _state()

    torch.cuda.empty_cache()
    for _ in epoch_pbar:
        # NOTE(review): zero_grad here discards any gradients left over from
        # a partial accumulation window at the end of the previous epoch —
        # confirm this is intended.
        optimizer.zero_grad()
        pbar = tqdm.tqdm(train_loader, leave=False)
        for x_batch, y_batch in pbar:
            step += 1
            if bucket:
                x_batch, y_batch = trim_tensors([x_batch, y_batch], pad_idx)
            x_batch = x_batch.to(device)
            y_batch = y_batch.to(device)
            try:
                y_pred = model(x_batch,
                               attention_mask=x_batch > 0,
                               labels=None)
                loss = criterion(y_pred, y_batch)
                # NOTE(review): the loss is not divided by
                # accumulation_steps, so accumulated gradients are summed,
                # not averaged — confirm intended.
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if step % accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
            except RuntimeError as e:
                # On CUDA OOM, drop this batch and keep training instead of
                # crashing; re-raise any other runtime error.
                if 'CUDA out of memory' in str(e):
                    print('ignoring', e)
                    torch.cuda.empty_cache()
                    continue
                raise

            # Exponential moving average of the loss for the progress bar.
            if smoothed_loss is not None:
                smoothed_loss = 0.98 * smoothed_loss + 0.02 * loss.item()
            else:
                smoothed_loss = loss.item()
            pbar.set_postfix(loss=f'{smoothed_loss:.4f}')

            if step % yield_steps == 0:
                yield _state()

        yield _state()
        torch.cuda.empty_cache()
def train():
    """Evaluate an OpenAI-GPT emotion-detection model on the validation set.

    Despite its name, this function only runs evaluation: it loads the
    config, tokenizer and pretrained model, optionally prepares FP16 and
    distributed execution, then prints overall validation accuracy and
    per-class precision (true positives / predicted positives).

    Bug fixed: accuracy was previously computed as
    ``num_correct / len(val_loader)`` — correct *samples* divided by the
    number of *batches* — which is only right when the batch size is 1.
    The denominator now counts the evaluated samples, which is identical
    at batch size 1 and correct for any batch size.
    """
    config_file = "configs/train_daily_dialog_emotion_detection_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", config.local_rank)
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = OpenAIGPTForEmotionDetection
    model = model_class.from_pretrained(config.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    model.eval()
    n_emotions = 0  # NOTE(review): never updated below; printed as 0.
    num_correct = 0
    positives = 0
    all_true_positives = 0
    num_all = 0  # number of evaluated samples (was: number of batches)
    for batch in val_loader:
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[
                1]  # So we can also use GPT2 outputs
            indices = torch.argmax(mc_logits, dim=1)

            # Per-sample correctness for overall accuracy.
            correct = torch.eq(indices, mc_labels).view(-1)
            num_correct += torch.sum(correct).item()
            num_all += correct.numel()  # count samples, not batches

            # One-hot confusion pieces for per-class precision.
            num_classes = mc_logits.size(1)
            mc_labels = to_onehot(mc_labels.view(-1), num_classes=num_classes)
            indices = torch.argmax(mc_logits, dim=1).view(-1)
            mc_logits = to_onehot(indices, num_classes=num_classes)
            mc_labels = mc_labels.type_as(mc_logits)
            correct = mc_labels * mc_logits
            all_positives = mc_logits.sum(dim=0).type(
                torch.DoubleTensor)  # Convert from int cuda/cpu to double cpu

            if correct.sum() == 0:
                true_positives = torch.zeros_like(all_positives)
            else:
                true_positives = correct.sum(dim=0)

            true_positives = true_positives.type(torch.DoubleTensor)
            positives += all_positives
            all_true_positives += true_positives

    print(num_correct / num_all)  # overall accuracy
    print(all_true_positives / positives)  # per-class precision
    print(n_emotions)
Example #22
0
def train_model(epochs=10,
                num_gradients_accumulation=4,
                batch_size=8,
                gpu_id=0,
                lr=1e-4,
                load_dir='decoder_model',
                decoder_model='original_pretrained_model_for_bertGPT.pth'):
    """Fine-tune a BertGPT encoder-decoder on pre-tensorized dialogue data.

    Loads pretrained weights from *decoder_model*, trains for *epochs*
    epochs with gradient accumulation, prints per-epoch training loss and
    validation perplexity, and saves one checkpoint per epoch under
    *load_dir*. Reads ``train_data.pth`` / ``validate_data.pth`` from the
    working directory.

    :param epochs: number of training epochs.
    :param num_gradients_accumulation: batches accumulated per optimizer step.
    :param batch_size: samples per batch for both loaders.
    :param gpu_id: CUDA device index to train on.
    :param lr: learning rate for OpenAIAdam.
    :param load_dir: output directory for per-epoch checkpoints.
    :param decoder_model: path of the initial state_dict to load.
    """
    # make sure your model is on GPU
    device = torch.device(f"cuda:{gpu_id}")

    #------------------------LOAD MODEL-----------------
    print('load the model....')
    model = BertGPT()
    model.load_state_dict(torch.load(decoder_model))
    # model = nn.DataParallel(model, device_ids = [0])
    model = model.to(device)
    print('load success')
    #------------------------END LOAD MODEL--------------

    #------------------------LOAD TRAIN DATA------------------
    train_data = torch.load("train_data.pth")
    train_dataset = MyDataset(*train_data)
    train_dataloader = DataLoader(dataset=train_dataset,
                                  shuffle=True,
                                  batch_size=batch_size,
                                  num_workers=2,
                                  collate_fn=collate_fn)
    val_data = torch.load("validate_data.pth")
    val_dataset = MyDataset(*val_data)
    val_dataloader = DataLoader(dataset=val_dataset,
                                shuffle=True,
                                batch_size=batch_size,
                                num_workers=2,
                                collate_fn=collate_fn)
    #------------------------END LOAD TRAIN DATA--------------

    #------------------------SET OPTIMIZER-------------------
    # Total optimizer.step() calls (accumulation divides the batch count).
    num_train_optimization_steps = len(
        train_dataset) * epochs // batch_size // num_gradients_accumulation

    param_optimizer = list(model.named_parameters())
    # Standard recipe: no weight decay on biases/LayerNorm; frozen
    # (requires_grad=False) parameters are excluded entirely.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in param_optimizer
            if any(nd in n for nd in no_decay) and p.requires_grad
        ],
        'weight_decay':
        0.0
    }]
    print('train')
    print(len(optimizer_grouped_parameters[0]['params']))

    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=lr,
                           warmup=0.01,
                           max_grad_norm=1.0,
                           weight_decay=0.01,
                           t_total=num_train_optimization_steps)
    #------------------------END SET OPTIMIZER--------------

    #------------------------START TRAINING-------------------
    update_count = 0

    start = time.time()
    print('start training....')
    for epoch in range(epochs):
        #------------------------training------------------------
        model.train()
        losses = 0
        times = 0
        for batch in tqdm(train_dataloader, desc='dirs'):
            batch = [item.to(device) for item in batch]

            encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

            logits = model(encoder_input, mask_encoder_input, decoder_input,
                           mask_decoder_input)

            # Shift by one: predict token t+1 from the prefix up to t.
            out = logits[:, :-1].contiguous()
            target = decoder_input[:, 1:].contiguous()
            target_mask = mask_decoder_input[:, 1:].contiguous()
            # NOTE(review): the loss is not divided by
            # num_gradients_accumulation, so accumulated gradients are
            # summed rather than averaged — confirm intended.
            loss = util.sequence_cross_entropy_with_logits(out,
                                                           target,
                                                           target_mask,
                                                           average="token")
            loss.backward()

            losses += loss.item()
            times += 1

            update_count += 1

            # NOTE(review): with the default accumulation of 4 this steps at
            # update_count 3, 7, 11, … — the first step fires after only 3
            # batches, and leftover gradients at epoch end carry into the
            # next epoch (no zero_grad between epochs). The conventional
            # check is `% k == 0`; confirm this off-by-one is intended.
            if update_count % num_gradients_accumulation == num_gradients_accumulation - 1:
                optimizer.step()
                optimizer.zero_grad()
        end = time.time()
        print('-' * 20 + f'epoch {epoch}' + '-' * 20)
        print(f'time: {(end - start)}')
        print(f'loss: {losses / times}')
        start = end

        #------------------------validate------------------------
        model.eval()

        perplexity = 0
        batch_count = 0
        print('start calculate the perplexity....')

        with torch.no_grad():
            for batch in tqdm(val_dataloader):
                batch = [item.to(device) for item in batch]
                encoder_input, decoder_input, mask_encoder_input, mask_decoder_input = batch

                logits = model(encoder_input, mask_encoder_input,
                               decoder_input, mask_decoder_input)

                out = logits[:, :-1].contiguous()
                target = decoder_input[:, 1:].contiguous()
                target_mask = mask_decoder_input[:, 1:].contiguous()

                loss = util.sequence_cross_entropy_with_logits(out,
                                                               target,
                                                               target_mask,
                                                               average="token")
                # Mean of per-batch exp(loss) — an approximation of corpus
                # perplexity (not exp of the mean loss).
                perplexity += np.exp(loss.item())
                batch_count += 1

        print(f'validate perplexity: {perplexity / batch_count}')

        # Save one checkpoint per epoch, e.g. "<load_dir>/0model.pth".
        direct_path = os.path.join(os.path.abspath('.'), load_dir)
        if not os.path.exists(direct_path):
            os.mkdir(direct_path)

        torch.save(model.state_dict(),
                   os.path.join(direct_path,
                                str(epoch) + "model.pth"))
Example #23
0
def main():
    # Parse the arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=1)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--max_seq_length', type=int, default=110)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Set the seed for random, numpy, PyTorch
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned
    special_tokens = ['<POS>', '<NEG>', '<CON_START>', '<START>', '<END>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    start_token_id = tokenizer.convert_tokens_to_ids(['<START>'])[0]
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Load and encode dataset
    def tokenize_and_encode(file_path):
        '''
        This method tokenizes the input data and encodes it using the OpenAIGPTTokenizer
        :param file_path: Path of the input file, dtype: str
        :return: encoded dataset  dtype: list
        '''
        with open(file_path, 'r') as in_fp:
            lines = in_fp.read().splitlines()

        tokenized_dataset = lines
        for i, line in enumerate(tqdm(lines)):
            token = tokenizer.tokenize(line)[:512]
            tokenized_dataset[i] = tokenizer.convert_tokens_to_ids(token)
        return tokenized_dataset

    logger.info("Encoding dataset...")
    train_dataset = tokenize_and_encode(args.train_dataset)
    eval_dataset = tokenize_and_encode(args.eval_dataset)
    print("Training samples = {}".format(len(train_dataset)))
    print("Validation samples = {}".format(len(eval_dataset)))
    print("Example = {}".format(train_dataset[0]))
    time.sleep(2)
    # Compute the mex input length for the Transformer
    train_dataset = [
        x for x in train_dataset
        if len(x) <= args.max_seq_length and start_token_id in x
    ]  # Remove all sentence longer than max_seq_length
    eval_dataset = [
        x for x in eval_dataset
        if len(x) <= args.max_seq_length and start_token_id in x
    ]
    input_length = max(max(len(t) for t in train_dataset),
                       max(len(q) for q in eval_dataset))
    if n_gpu > 1:
        input_length = min(input_length, model.module.config.n_positions)
    else:
        input_length = min(input_length, model.config.n_positions
                           )  # Max size of input for the pre-trained model
    print("Input Length = {}".format(input_length))

    def pre_process_dataset(encoded_dataset, input_length, start_token_id):
        """
        This method is to create torch tensor of input ids and lm labels
        :param encoded_dataset: Input dataset, dtype: list
        :param input_length: Maximum length of sentence from training and eval dataset, dtype: int
        :param start_token_id: id of the '<START>' token, dtype: int
        :return: torch.tensor of size [len(encoded_dataset), 2]
        """

        n_batch = len(encoded_dataset)
        input_ids = np.zeros(shape=(n_batch, input_length), dtype=np.int64)
        lm_labels = np.full(shape=(n_batch, input_length),
                            fill_value=-1,
                            dtype=np.int64)

        for i, tokens in enumerate(encoded_dataset):
            try:
                #tokens = tokens[:input_length]
                start_id_index = tokens.index(start_token_id)
                input_ids[i, :len(tokens)] = tokens
                start_id_index = tokens.index(start_token_id)
                lm_labels[i, start_id_index:len(tokens) -
                          1] = tokens[start_id_index + 1:len(tokens)]
                # LM loss calculate only for tokens after <START> token in the sentence
                #lm_labels[i, :len(tokens)-1] = tokens[1:]
            except ValueError:
                print("Index {} doesn't have start token".format(i))

        input_ids = torch.tensor(input_ids)
        lm_labels = torch.tensor(lm_labels)
        tensor_dataset = (input_ids, lm_labels)
        #tensor_dataset.append(torch.tensor(d) for d in all_inputs)

        return tensor_dataset

    # Prepare input tensors and dataloders
    train_tensor_dataset = pre_process_dataset(train_dataset,
                                               input_length,
                                               start_token_id=start_token_id)
    eval_tensor_dataset = pre_process_dataset(eval_dataset,
                                              input_length,
                                              start_token_id=start_token_id)

    print("Training Example Input ids= {}".format(train_tensor_dataset[0][0]))
    print("Training Example Language Modeling ids = {}".format(
        train_tensor_dataset[1][0]))
    time.sleep(10)
    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = RandomSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                if n_gpu > 1:
                    loss.mean().backward()
                else:
                    loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if n_gpu > 1:
                    tmp_loss = loss.mean().item()
                else:
                    tmp_loss = loss.item()
                exp_average_loss = tmp_loss if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * tmp_loss
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            model_to_save = model.module if hasattr(
                model, 'module') else model  # Only save the model it-self
            output_model_file = os.path.join(
                args.output_dir,
                "pytorch_model_zero_grad_{}.bin".format(epoch + 1))
            config = model.module.config if hasattr(model,
                                                    'module') else model.config
            torch.save(model_to_save.state_dict(), output_model_file)

            model_state_dict = torch.load(output_model_file)
            model = OpenAIGPTLMHeadModel(config)
            model.load_state_dict(model_state_dict)
            model.to(device)
            if n_gpu > 1:
                model = torch.nn.DataParallel(model)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, lm_labels = batch
            with torch.no_grad():
                lm_loss = model(input_ids, lm_labels=lm_labels)

            eval_loss += lm_loss.mean().item()

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #24
0
def run_model():
    """Fine-tune and/or evaluate an OpenAI GPT LM head model on CommonsenseQA.

    Entirely command-line driven: parses arguments, builds train/dev/test
    tensor datasets, optionally trains (``--do_train``), evaluates on the dev
    set (``--do_eval``) and writes test-set generations (``--do_test``).
    The checkpoint with the best dev BLEU is saved to
    ``<output_dir>/pytorch_model.bin`` and reloaded for final evaluation.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_name',
        type=str,
        default='openai-gpt',
        help='pretrained model name or path to local checkpoint')
    parser.add_argument('--setting', type=str, default='explain_predict')
    parser.add_argument('--eval_preds_prefix', type=str, default='preds_')
    # Fix: this prefix is referenced in the --do_test branch below but was
    # never declared, which raised AttributeError whenever --do_test ran.
    parser.add_argument('--test_preds_prefix', type=str, default='test_preds_')
    parser.add_argument("--n_train_print", type=int, default=10)
    parser.add_argument("--n_gen", type=int, default=20)
    parser.add_argument("--batch_size", type=int, default=-1)
    parser.add_argument("--length", type=int, default=-1)
    # temperature is a continuous sampling parameter; int silently truncated
    # values like 0.7 on the command line.
    parser.add_argument("--temperature", type=float, default=1.0)
    parser.add_argument("--top_k", type=int, default=0)
    parser.add_argument('--unconditional',
                        action='store_true',
                        help='If true, unconditional generation.')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_eval_train",
                        action='store_true',
                        help="Whether to run eval on the training set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--num_eval_print', type=int, default=15)
    parser.add_argument('--train_batch_size', type=int, default=36)
    parser.add_argument('--eval_batch_size', type=int, default=60)
    # Gradient-clipping threshold is conceptually a float.
    parser.add_argument('--max_grad_norm', type=float, default=1.0)
    parser.add_argument('--learning_rate', type=float, default=1e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--data',
                        type=str,
                        default='/stage/examples/commonsenseqa/')

    args = parser.parse_args()
    print(args)

    if args.batch_size == -1:
        args.batch_size = 1

    # Seed numpy and torch (CPU + CUDA) for reproducibility.
    np.random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval and not args.do_test:
        raise ValueError(
            "At least one of `do_train` or `do_eval`  or do_test must be True."
        )

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    special_tokens = [
        '_start_</w>', 'or</w>', '_answer_</w>', '_classify_</w>', '_end_</w>'
    ]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Tokenize and numericalize the raw CQA splits.
    datasets = parse_cqa(args.data, args.setting)
    numericalized = [
        CommonsenseExample.numericalize_list(
            CommonsenseExample.tokenize_list(d, tokenizer), tokenizer)
        for d in datasets
    ]

    tensor_datasets = pre_process_datasets(numericalized, *special_tokens_ids)

    train_sampler, train_data = None, None
    if args.do_train or args.do_eval_train:
        train_tensor_dataset = tensor_datasets[0]
        train_data = TensorDataset(*train_tensor_dataset)
        train_sampler = RandomSampler(train_data)
        if args.do_eval_train:
            # Deterministic order when the train split is being evaluated.
            train_sampler = SequentialSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

    if args.do_eval:
        if args.do_eval_train:
            eval_data = train_data
            eval_sampler = train_sampler
        else:
            eval_tensor_dataset = tensor_datasets[1]
            eval_data = TensorDataset(*eval_tensor_dataset)
            eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

    if args.do_test:
        test_tensor_dataset = tensor_datasets[-1]
        test_data = TensorDataset(*test_tensor_dataset)
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)

    # Prepare optimizer: no weight decay on biases / LayerNorm parameters.
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_data) * args.num_train_epochs // args.train_batch_size
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    def trim_unks(x):
        """Truncate a token list at the first '_end_</w>' marker, if any."""
        try:
            unk_id = x.index('_end_</w>')
            return x[:unk_id]
        except ValueError:  # no end marker present: keep the full sequence
            return x

    def detokenize(x):
        """Join BPE tokens back into a readable string."""
        y = ''.join(trim_unks(x))
        y = y.replace('</w>', ' ')
        y = y.replace(' .', '.')
        y = y.replace(' ,', ',')
        y = y.replace(' ?', '?')
        y = y.replace(' !', '!')
        y = y.replace(' \' ', '\'')
        y = y.replace(' \'re', '\'re')
        y = y.replace(' \'s', '\'s')
        y = y.replace(' n\'t', 'n\'t')
        return y

    def detok_batch(x):
        """Detokenize a batch of id sequences; negative ids are padding."""
        if not isinstance(x, list):
            x = x.tolist()
        return [
            detokenize(
                tokenizer.convert_ids_to_tokens([z for z in y if z >= 0]))
            for y in x
        ]

    if args.do_train:
        best_eval = 0
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss, train_ppl, n_train_examples = 0, 0, 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            train_pred_strs, train_lab_strs = [], []
            for step, batch in enumerate(tqdm_bar):
                inputs = batch[0].to(device)
                labels = batch[1].to(device)
                loss = model(inputs, lm_labels=labels)
                train_ppl += loss.item() * inputs.size(0)
                n_train_examples += inputs.size(0)
                loss.backward()
                optimizer.step()
                # Fix: clear gradients after each step; previously they were
                # never zeroed and accumulated across every batch of the run.
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss for the progress bar.
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                if args.n_train_print > 0:
                    with torch.no_grad():
                        preds = sample(model, batch[2], 10, device)

                    pred_str = detok_batch(preds)
                    label_str = detok_batch(labels)
                    train_lab_strs.extend(label_str)
                    train_pred_strs.extend(pred_str)
                    input_str = detok_batch(inputs)
                    for print_idx in range(
                            min(args.n_train_print, inputs.size(0))):
                        print('INPT: ', input_str[print_idx])
                        print('GOLD: ', label_str[print_idx])
                        print('PRED: ', pred_str[print_idx])
                        print()

            train_bleu = None
            if args.n_train_print > 0:
                train_bleu = computeBLEU(train_pred_strs,
                                         [[x] for x in train_lab_strs])
                train_ppl = math.exp(train_ppl / n_train_examples)

            if args.do_eval:
                # Per-epoch dev evaluation; the best-BLEU checkpoint is saved.
                model.eval()
                eval_loss, eval_em, eval_ppl = 0, 0, 0
                nb_eval_steps, nb_eval_examples = 0, 0
                label_strs, prediction_strs = [], []
                for batch in eval_dataloader:
                    inputs = batch[0].to(device)
                    labels = batch[1].to(device)

                    with torch.no_grad():
                        loss = model(inputs, lm_labels=labels)
                        preds = sample(model, batch[2], args.n_gen, device)

                    eval_loss += loss.item()
                    eval_ppl += loss.item() * inputs.size(0)
                    nb_eval_examples += inputs.size(0)
                    nb_eval_steps += 1
                    pred_str = detok_batch(preds)
                    label_str = detok_batch(labels)
                    label_strs.extend(label_str)
                    prediction_strs.extend(pred_str)
                    input_str = detok_batch(inputs)
                    eval_em += sum(
                        [x == y for x, y in zip(pred_str, label_str)])
                    for print_idx in range(
                            min(inputs.size(0), args.num_eval_print)):
                        print('INPT: ', input_str[print_idx])
                        print('GOLD: ', label_str[print_idx])
                        print('PRED: ', pred_str[print_idx])
                        print()

                eval_bleu = computeBLEU(prediction_strs,
                                        [[x] for x in label_strs])
                eval_ppl = math.exp(eval_ppl / nb_eval_examples)
                eval_em = eval_em / nb_eval_examples
                eval_loss = eval_loss / nb_eval_steps
                train_loss = tr_loss / nb_tr_steps if args.do_train else None
                result = {
                    'eval_loss': eval_loss,
                    'eval_em': eval_em,
                    'eval_bleu': eval_bleu,
                    'eval_ppl': eval_ppl,
                    'train_loss': train_loss,
                    'train_bleu': train_bleu,
                    'train_ppl': train_ppl
                }

                output_eval_file = os.path.join(args.output_dir,
                                                "eval_results.txt")
                with open(output_eval_file, "a") as writer:
                    for key in sorted(result.keys()):
                        logger.info("  %s = %s", key, str(result[key]))
                        writer.write("%s = %s\n" % (key, str(result[key])))

                if eval_bleu > best_eval:
                    best_eval = eval_bleu

                    # Save a trained model
                    model_to_save = model.module if hasattr(
                        model,
                        'module') else model  # Only save the model it-self
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)

    if args.do_eval:
        # Load the fine-tuned checkpoint saved above (or by a previous run).
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(model.config)
        model.load_state_dict(model_state_dict)
        # uncomment to try out the default not finue-tuned model
        #        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens), cache_dir=os.path.dirname(args.data))
        model.to(device)
        model.eval()
        eval_loss, eval_em, eval_ppl = 0, 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        label_strs, prediction_strs = [], []
        for batch in eval_dataloader:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            with torch.no_grad():
                loss = model(inputs, lm_labels=labels)
                preds = sample(model, batch[2], args.n_gen, device)

            eval_loss += loss.item()
            eval_ppl += loss.item() * inputs.size(0)
            nb_eval_examples += inputs.size(0)
            nb_eval_steps += 1
            pred_str = detok_batch(preds)
            label_str = detok_batch(labels)
            label_strs.extend(label_str)
            prediction_strs.extend(pred_str)
            input_str = detok_batch(inputs)
            eval_em += sum([x == y for x, y in zip(pred_str, label_str)])
            for print_idx in range(min(inputs.size(0), args.num_eval_print)):
                print('INPT: ', input_str[print_idx])
                print('GOLD: ', label_str[print_idx])
                print('PRED: ', pred_str[print_idx])
                print()

        eval_bleu = computeBLEU(prediction_strs, [[x] for x in label_strs])
        eval_ppl = math.exp(eval_ppl / nb_eval_examples)
        eval_em = eval_em / nb_eval_examples
        eval_loss = eval_loss / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_em': eval_em,
            'eval_bleu': eval_bleu,
            'eval_ppl': eval_ppl,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Best Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        output_preds_file = os.path.join(
            args.output_dir, f"{args.eval_preds_prefix}_{args.setting}.txt")
        with open(output_preds_file, 'w') as writer:
            logger.info("Writing predictions")
            for p in prediction_strs:
                writer.write(p + '\n')

    if args.do_test:
        # Load the fine-tuned checkpoint and generate predictions only
        # (the test split carries no gold labels here).
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(model.config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        model.eval()
        prediction_strs = []
        for batch in test_dataloader:
            with torch.no_grad():
                preds = sample(model, batch[1], args.n_gen, device)

            pred_str = detok_batch(preds)
            prediction_strs.extend(pred_str)

        output_preds_file = os.path.join(
            args.output_dir, f"{args.test_preds_prefix}_{args.setting}.txt")
        with open(output_preds_file, 'w') as writer:
            logger.info("Writing predictions")
            for p in prediction_strs:
                writer.write(f'"{p.strip()}"\n')
def main():
    """Fine-tune GPT-2 as a conditional LM on source/target text pairs.

    Parses command-line arguments, loads the datasets, trains with a manually
    masked cross-entropy loss and gradient accumulation, periodically
    evaluates, and saves the checkpoint with the lowest eval loss to
    ``output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default='tuned_gpt2', type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')

    parser.add_argument('--source_eval', type=str, default='')
    parser.add_argument('--target_eval', type=str, default='')
    parser.add_argument('--source_train', type=str, default='')
    parser.add_argument('--target_train', type=str, default='')

    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--effective_batch_size', type=int, default=64)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--bsz', type=int, default=20)
    parser.add_argument('--bptt', type=int, default=40)

    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    model_type = 'gpt2'

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Seed every RNG we rely on for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # NOTE(review): this script assumes a CUDA device throughout
    # (explicit .cuda() calls below), so the device is hard-coded.
    device = torch.device(type='cuda')
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    model.to(device)

    bptt = args.bptt
    bsz = args.bsz

    # Load pre-batched (input, label) tensors for eval and train splits.
    batches_eval, labels_eval, nbatch_eval = load_dataset(args.source_eval, args.target_eval, tokenizer, bptt, bsz)
    batches_train, labels_train, nbatch_train = load_dataset(args.source_train, args.target_train, tokenizer, bptt, bsz)

    # Prepare optimizer: no weight decay on biases / LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    num_train_optimization_steps = nbatch_train * args.num_train_epochs
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    eval_loss_min = None

    nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
    model.train()
    for epoch_i in trange(int(args.num_train_epochs), desc="Epoch"):
        tr_loss = 0
        nb_tr_steps = 0

        # Fix: this is the training loop; the bar previously said "Evaluating".
        for i_batch in tqdm(list(range(nbatch_train)), desc='Training epoch {}'.format(epoch_i)):
            batch = batches_train[i_batch]

            batch = batch.cuda()
            lm_labels = labels_train[i_batch].cuda()
            if batch.numel() == 0:
                break

            # Compute the LM loss manually so padding positions (label == -1)
            # can be masked out of the average.
            loss_fct = CrossEntropyLoss(reduction='none')
            lm_logits, _ = model(batch)
            shift_logits = lm_logits[:, :-1, :].contiguous()
            shift_labels = batch[:, 1:].contiguous()

            shift_labels_mask = (lm_labels[:, 1:].contiguous().view(-1) != -1).float()

            loss_mat = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                                shift_labels.view(-1))
            # Average only over non-masked indices.
            loss = (loss_mat * shift_labels_mask).view(-1).sum() / shift_labels_mask.sum()

            loss.backward()

            # Gradient accumulation: only step once 'effective_batch_size'
            # examples worth of gradients have been accumulated.
            # NOTE(review): any remainder batches at the end of an epoch never
            # trigger a step — confirm this is intentional.
            if (i_batch * args.train_batch_size) % args.effective_batch_size == 0 and i_batch != 0:
                optimizer.step()
                optimizer.zero_grad()

            tr_loss += loss.item()

            # Exponential moving average of the loss for progress reporting.
            exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
            nb_tr_steps += 1

            ###
            # Evaluations
            ###

            if i_batch % 1000 == 0:  # get eval score
                eval_loss = eval_model(model, nbatch_eval, batches_eval, labels_eval, bsz)

                # if eval_loss improves, save model
                if eval_loss_min is None or eval_loss < eval_loss_min:
                    eval_loss_min = eval_loss

                    # If we save using the predefined names, we can load using `from_pretrained`
                    model_to_save = model
                    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

                    torch.save(model_to_save.state_dict(), output_model_file)
                    to_json_file(model_to_save.config, output_config_file)

                # Fix: was print('eval_loss {}',format(eval_loss)) — a comma
                # instead of '.', which printed the raw template string.
                print('eval_loss {}'.format(eval_loss))
                model.train()

            if i_batch % 200 == 0:  # try generating from model
                print("Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0]))

                model.eval()
                if model_type == 'gpt':
                    encode = lambda a: tokenizer.convert_tokens_to_ids(tokenizer.tokenize(a))
                    decode = tokenizer.decode
                elif model_type == 'gpt2':
                    encode = tokenizer.encode
                    decode = tokenizer.decode

                generate_from_model(encode, decode, model=model, model_type=model_type)
                model.train()
Example #26
0
 def train(self):
     """Fine-tune the GPT-2 classifier with mixed precision (apex amp).

     Builds the dataloaders, trains for ``self.epochs`` epochs with gradient
     accumulation, validates several times per epoch, and keeps the four
     best checkpoints (by AUC) in model1..4.bin.  Scores are appended to
     train_log.txt / model_score.txt.
     """
     if self.debug_mode: self.epochs = 1
     # Build the train/valid dataloaders.
     train_loader, valid_loader = self.create_dataloader()
     # Fix all random seeds before training.
     self.seed_everything()
     lr = 2e-5
     # Number of micro-batches accumulated per optimizer step.
     accumulation_steps = math.ceil(self.batch_size / self.base_batch_size)
     # Load the pre-trained model.
     print("Load pre-trained model")
     model = GPT2NeuralNet.from_pretrained(self.gpt2_model_path,
                                           cache_dir=None)
     model.zero_grad()
     model = model.to(self.device)
     # NOTE(review): epoch_steps uses a 0.5 factor, i.e. only half the train
     # set per epoch is assumed — confirm against create_dataloader.
     epoch_steps = int(self.train_len * 0.5 / self.base_batch_size /
                       accumulation_steps)
     num_train_optimization_steps = int(self.epochs * epoch_steps)
     # Validate 5 times per epoch.
     valid_every = math.floor(epoch_steps * accumulation_steps / 5)
     optimizer = OpenAIAdam(model.parameters(),
                            lr=lr,
                            warmup=0.05,
                            t_total=num_train_optimization_steps)
     # Mixed-precision training.
     model, optimizer = amp.initialize(model,
                                       optimizer,
                                       opt_level="O1",
                                       verbosity=0)
     # Start training.
     print("Train")
     best_auc_score_1 = 0
     best_auc_score_2 = 0
     best_auc_score_3 = 0
     best_auc_score_4 = 0
     f_log = open("train_log.txt", "w")
     for epoch in range(self.epochs):
         model.train()
         optimizer.zero_grad()
         # Iterate over batches and train.
         train_start_time = time.time()
         for i, batch_data in enumerate(train_loader):
             x_batch = batch_data[0]
             y_batch = batch_data[1]
             target_weight_batch = batch_data[2]
             aux_weight_batch = batch_data[3]
             identity_weight_batch = batch_data[4]
             np_weight_batch = batch_data[5]
             np_identity_weight_batch = batch_data[6]
             y_pred = model(x_batch.to(self.device))
             target_loss, aux_loss, identity_loss, np_loss = self.custom_loss(
                 y_pred, y_batch, epoch, target_weight_batch,
                 aux_weight_batch, identity_weight_batch, np_weight_batch)
             loss = target_loss + aux_loss + identity_loss + np_loss
             # Scale the loss for mixed precision before backprop.
             with amp.scale_loss(loss, optimizer) as scaled_loss:
                 scaled_loss.backward()
             if (i + 1) % accumulation_steps == 0:
                 optimizer.step()
                 optimizer.zero_grad()
             # Periodic validation.
             if (i + 1) % valid_every == 0:
                 model.eval()
                 stage = int((i + 1) / valid_every)
                 train_stage_duration = int(
                     (time.time() - train_start_time) / 60)
                 valid_start_time = time.time()
                 y_pred = np.zeros((len(self.train_df) - self.train_len))
                 for j, valid_batch_data in enumerate(valid_loader):
                     x_batch = valid_batch_data[0]
                     batch_y_pred = self.sigmoid(
                         model(x_batch.to(
                             self.device)).detach().cpu().numpy())[:, 0]
                     y_pred[j * self.base_batch_size:(j + 1) *
                            self.base_batch_size] = batch_y_pred
                 # Compute the validation score.
                 auc_score = self.evaluator.get_final_metric(y_pred)
                 valid_duration = int((time.time() - valid_start_time) / 60)
                 train_start_time = time.time()
                 f_log.write(
                     "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f\n"
                     % (epoch, stage, train_stage_duration, valid_duration,
                        auc_score))
                 print(
                     "epoch: %d stage: %d train_stage_duration: %dmin valid_duration: %dmin auc_score: %.4f"
                     % (epoch, stage, train_stage_duration, valid_duration,
                        auc_score))
                 # Keep the 4 best checkpoints, best-first.
                 if auc_score > best_auc_score_4:
                     state_dict = model.state_dict()
                     if auc_score > best_auc_score_1:
                         best_auc_score_1 = auc_score
                         torch.save(state_dict, "model1.bin")
                     elif auc_score > best_auc_score_2:
                         best_auc_score_2 = auc_score
                         torch.save(state_dict, "model2.bin")
                     elif auc_score > best_auc_score_3:
                         best_auc_score_3 = auc_score
                         torch.save(state_dict, "model3.bin")
                     else:
                         best_auc_score_4 = auc_score
                         torch.save(state_dict, "model4.bin")
                     with open("model_score.txt", "w") as f:
                         f.write(
                             "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                             % (best_auc_score_1, best_auc_score_2,
                                best_auc_score_3, best_auc_score_4))
                     print(
                         "model1: %.4f model2: %.4f model3: %.4f model4: %.4f"
                         % (best_auc_score_1, best_auc_score_2,
                            best_auc_score_3, best_auc_score_4))
                 model.train()
         if self.last:
             state_dict = model.state_dict()
             torch.save(state_dict, "model_last.bin")
     # Fix: close the log file (it was left open previously).
     f_log.close()
     # Fix: release training inputs and model.  The old loop
     # (`for variable in [...]: del variable`) only deleted the loop
     # variable and freed nothing.
     del train_loader, valid_loader, model, optimizer
     gc.collect()
Example #27
0
def train():
    """Fine-tune a GPT/GPT-2 double-heads model on the daily-dialog
    emotion/action data, driven entirely by a JSON config file.

    Reads all hyper-parameters from ``Config.from_json_file``; supports
    optional apex fp16 and NCCL distributed training.  Training/evaluation
    run through ignite ``Engine`` loops; the main process attaches a
    progress bar, tensorboard logging and per-epoch checkpoints, and
    renames the final checkpoint to ``WEIGHTS_NAME`` so it can be reloaded
    with ``from_pretrained``.
    """
    config_file = "configs/train_daily_dialog_emotion_action_config.json"
    config = Config.from_json_file(config_file)

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if config.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", config.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(config))

    # Initialize distributed training if needed
    config.distributed = (config.local_rank != -1)
    if config.distributed:
        torch.cuda.set_device(config.local_rank)
        config.device = torch.device("cuda", config.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    # Model/tokenizer class is chosen from the checkpoint name alone.
    tokenizer_class = GPT2Tokenizer if "gpt2" in config.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(config.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in config.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(config.model_checkpoint)
    # Resize embeddings so the added special tokens get trainable vectors.
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(config.device)
    optimizer = OpenAIAdam(model.parameters(), lr=config.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if config.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config.fp16)
    if config.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[config.local_rank],
                                        output_device=config.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        config, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = tuple(
            input_tensor.to(config.device) for input_tensor in batch)
        lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels,
                                 token_type_ids, token_emotion_ids,
                                 token_action_ids)
        # Divide by the accumulation steps so the gradients summed over
        # several backward() calls average to one effective batch.
        loss = (lm_loss * config.lm_coef +
                mc_loss * config.mc_coef) / config.gradient_accumulation_steps
        if config.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           config.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_norm)
        if engine.state.iteration % config.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(config.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids, token_emotion_ids, token_action_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids,
                                  mc_token_ids,
                                  token_type_ids=token_type_ids,
                                  token_emotion_ids=token_emotion_ids,
                                  token_action_ids=token_action_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[
                1]  # So we can also use GPT2 outputs
            # Shift logits/labels by one position for next-token prediction.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if config.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if config.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if config.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, config.lr),
                                 (config.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    # NOTE(review): ignore_index=-1 presumably matches the padding value used
    # for lm_labels in get_data_loaders — confirm against the dataset code.
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-1),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], config),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], config)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if config.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=config.log_dir)
        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" take care of distributed encapsulation

        torch.save(config,
                   tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=config.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if config.local_rank in [-1, 0] and config.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
def train():
    """Load a GPT/GPT-2 double-heads checkpoint and demo sentence embeddings.

    Despite the name, this variant does not run a training loop: it parses
    hyper-parameters, loads the model (optionally wrapped for fp16 and
    distributed execution), then mean-pools hidden states for three example
    sentences and prints their pairwise cosine similarities.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="/home/rohola/logs",
                        help="Path, url or short name of the model")
    #parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--max_history",
                        type=int,
                        default=2,
                        help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=2,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=1,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=8,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr",
                        type=float,
                        default=6.25e-5,
                        help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=20,
                        help="Number of training epochs")
    parser.add_argument("--personality_permutations",
                        type=int,
                        default=1,
                        help="Number of permutations of personality sentences")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    # BUG FIX: the default was "cpu" in both branches of the conditional,
    # silently ignoring available GPUs (siblings in this file use "cuda").
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    # BUG FIX: help text was a copy-paste of the --local_rank description.
    parser.add_argument(
        "--log_dir",
        type=str,
        default="",
        help="Directory for tensorboard logs and checkpoints")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info(
        "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning"
    )
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    # Resize embeddings so the added special tokens get trainable vectors.
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(args.device)
    optimizer = OpenAIAdam(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    def tokenize_sentence(s):
        """Tokenize *s* and wrap it as a (1, 1, seq_len) int64 tensor
        framed with the bos/eos special tokens."""
        # NOTE(review): assumes SPECIAL_TOKENS[:-1] is exactly
        # (bos, eos, speaker1, speaker2) — confirm against its definition.
        bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(
            SPECIAL_TOKENS[:-1])
        token_ids = [bos] + tokenizer.convert_tokens_to_ids(
            tokenizer.tokenize(s)) + [eos]
        return torch.Tensor(token_ids).type(
            torch.int64).unsqueeze(0).unsqueeze(0)

    def compare_sentences_using_openai_embedding():
        """Print pairwise cosine similarities of mean-pooled embeddings
        for three hard-coded example sentences."""
        sentences = ["I love biking", "I want to buy a bicycle", "It is wrong"]

        def embed(sentence):
            # Mean-pool the hidden states over the sequence dimension.
            # NOTE(review): assumes model(input_ids, mc_token_ids=None)
            # returns hidden states of shape (1, 1, seq, dim) — TODO confirm.
            hidden = model(tokenize_sentence(sentence), mc_token_ids=None)
            return hidden.squeeze(0).squeeze(0).mean(dim=0)

        embeddings = [embed(s) for s in sentences]
        # FIX: the original computed the embeddings, compared nothing and
        # printed an empty line; actually report the similarities.
        for i in range(len(sentences)):
            for j in range(i + 1, len(sentences)):
                sim = torch.nn.functional.cosine_similarity(
                    embeddings[i], embeddings[j], dim=0).item()
                print("cos(%r, %r) = %.4f" % (sentences[i], sentences[j], sim))

    compare_sentences_using_openai_embedding()
Example #29
0
def train():
    """Fine-tune a GPT/GPT-2 double-heads model on the PERSONA-CHAT style
    dataset using CLI hyper-parameters.

    Supports optional apex fp16 and NCCL distributed training; runs ignite
    training/evaluation loops, logs to tensorboard, checkpoints each epoch
    and renames the final checkpoint to ``WEIGHTS_NAME`` on the main process.
    """
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    tokenizer_class = GPT2Tokenizer if "gpt2" in args.model_checkpoint else OpenAIGPTTokenizer
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    # BUG FIX: this previously selected the *LMHeadModel classes, but update()
    # below unpacks (lm_loss, mc_loss) from a batch containing mc_token_ids /
    # mc_labels and inference() reads mc_logits — outputs only the
    # double-heads classes produce.
    model_class = GPT2DoubleHeadsModel if "gpt2" in args.model_checkpoint else OpenAIGPTDoubleHeadsModel
    model = model_class.from_pretrained(args.model_checkpoint)
    tokenizer.set_special_tokens(SPECIAL_TOKENS)
    model.set_num_special_tokens(len(SPECIAL_TOKENS))
    model.to(args.device)
    optimizer = OpenAIAdam(model.parameters(), lr=args.lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        lm_loss, mc_loss = model(*batch)
        # Divide by the accumulation steps so accumulated gradients average
        # to one effective batch.
        loss = (lm_loss * args.lm_coef + mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]  # So we can also use GPT2 outputs
            # Shift logits/labels by one position for next-token prediction.
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
                    "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir=None)
        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {'mymodel': getattr(model, 'module', model)})  # "getattr" take care of distributed encapsulation

        torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #30
0
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':
    [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]

# Total optimizer steps for the warmup/decay schedule: FACTOR * 2 sweeps over
# the training tensors per epoch, divided by batch size and gradient
# accumulation.  NOTE(review): the meaning of FACTOR * 2 is not visible here —
# presumably accounts for extra passes/folds; confirm where FACTOR is defined.
num_train_optimization_steps = int(FACTOR * 2 * EPOCHS *
                                   len(train_loader.tensors[0]) / batch_size /
                                   accumulation_steps)

# OpenAIAdam spreads warmup + decay over t_total steps; weight-decay grouping
# comes from optimizer_grouped_parameters (built above).
optimizer = OpenAIAdam(optimizer_grouped_parameters,
                       lr=lr,
                       warmup=WARMUP,
                       t_total=num_train_optimization_steps)

# Mixed precision: amp must wrap model/optimizer before DataParallel below.
model, optimizer = amp.initialize(model,
                                  optimizer,
                                  opt_level="O1",
                                  verbosity=0)

#######################
# multi-gpu
#######################
# NOTE(review): hard-codes 6 visible GPUs — confirm against the runtime host.
model = nn.DataParallel(model, device_ids=[0, 1, 2, 3, 4, 5])

# Switch to training mode (enables dropout etc.).
model = model.train()

# The grouping helpers are no longer needed once the optimizer holds them.
del param_optimizer, optimizer_grouped_parameters