Example #1
        def create_and_check_double_lm_head_model(
            self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
        ):
            model = GPT2DoubleHeadsModel(config)
            model.to(torch_device)
            model.eval()

            multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
            multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

            inputs = {
                "input_ids": multiple_choice_inputs_ids,
                "mc_token_ids": mc_token_ids,
                "attention_mask": multiple_choice_input_mask,
                "token_type_ids": multiple_choice_token_type_ids,
                "lm_labels": multiple_choice_inputs_ids,
            }

            loss, lm_logits, mc_logits, _ = model(**inputs)

            result = {"loss": loss, "lm_logits": lm_logits, "mc_logits": mc_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.num_choices, self.seq_length, self.vocab_size],
            )
            self.parent.assertListEqual(list(result["mc_logits"].size()), [self.batch_size, self.num_choices])
Example #2
    def create_and_check_double_lm_head_model(
        self, config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids, *args
    ):
        model = GPT2DoubleHeadsModel(config)
        model.to(torch_device)
        model.eval()

        multiple_choice_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
        multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

        inputs = {
            "input_ids": multiple_choice_inputs_ids,
            "mc_token_ids": mc_token_ids,
            "attention_mask": multiple_choice_input_mask,
            "token_type_ids": multiple_choice_token_type_ids,
            "labels": multiple_choice_inputs_ids,
        }

        result = model(**inputs)
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(
            result.logits.shape, (self.batch_size, self.num_choices, self.seq_length, self.vocab_size)
        )
        self.parent.assertEqual(result.mc_logits.shape, (self.batch_size, self.num_choices))
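The two tests above exercise the same multiple-choice layout against two API generations: Example #1 unpacks the old tuple return (with `lm_labels`), while Example #2 reads the named fields of a ModelOutput (with `labels`). Below is a minimal standalone sketch of that input layout, assuming a transformers 4.x release where the model returns a ModelOutput by default; the tiny config values are illustrative only.

import torch
from transformers import GPT2Config, GPT2DoubleHeadsModel

# tiny randomly initialised model, just to show the tensor shapes involved
config = GPT2Config(vocab_size=100, n_positions=32, n_embd=32, n_layer=2, n_head=2)
model = GPT2DoubleHeadsModel(config).eval()

batch_size, num_choices, seq_length = 2, 3, 5
input_ids = torch.randint(0, config.vocab_size, (batch_size, num_choices, seq_length))
# index of the token whose hidden state feeds the multiple-choice head (here: the last one)
mc_token_ids = torch.full((batch_size, num_choices), seq_length - 1, dtype=torch.long)

with torch.no_grad():
    outputs = model(input_ids=input_ids, mc_token_ids=mc_token_ids)
print(outputs.logits.shape)     # (batch_size, num_choices, seq_length, vocab_size)
print(outputs.mc_logits.shape)  # (batch_size, num_choices)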
Example #3
    def test_batch_generation_2heads(self):
        model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
        model.to(torch_device)
        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        tokenizer.padding_side = "left"

        # This tokenizer has no pad token, so we have to set it in some way
        # Define PAD Token = EOS Token = 50256
        tokenizer.pad_token = tokenizer.eos_token
        model.config.pad_token_id = model.config.eos_token_id

        # use different length sentences to test batching
        sentences = [
            "Hello, my dog is a little",
            "Today, I",
        ]

        inputs = tokenizer(sentences, return_tensors="pt", padding=True)
        input_ids = inputs["input_ids"].to(torch_device)
        token_type_ids = torch.cat(
            [
                input_ids.new_full((input_ids.shape[0], input_ids.shape[1] - 1), 0),
                input_ids.new_full((input_ids.shape[0], 1), 500),
            ],
            dim=-1,
        )

        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
        )

        outputs_tt = model.generate(
            input_ids=input_ids,
            attention_mask=inputs["attention_mask"].to(torch_device),
            token_type_ids=token_type_ids,
        )

        inputs_non_padded = tokenizer(sentences[0], return_tensors="pt").input_ids.to(torch_device)
        output_non_padded = model.generate(input_ids=inputs_non_padded)

        num_paddings = inputs_non_padded.shape[-1] - inputs["attention_mask"][-1].long().sum().cpu().item()
        inputs_padded = tokenizer(sentences[1], return_tensors="pt").input_ids.to(torch_device)
        output_padded = model.generate(input_ids=inputs_padded, max_length=model.config.max_length - num_paddings)

        batch_out_sentence = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        batch_out_sentence_tt = tokenizer.batch_decode(outputs_tt, skip_special_tokens=True)
        non_padded_sentence = tokenizer.decode(output_non_padded[0], skip_special_tokens=True)
        padded_sentence = tokenizer.decode(output_padded[0], skip_special_tokens=True)

        expected_output_sentence = [
            "Hello, my dog is a little bit of a mess. I'm not sure if he's going",
            "Today, I'm going to be doing a lot of research on this. I",
        ]
        self.assertListEqual(expected_output_sentence, batch_out_sentence)
        self.assertTrue(batch_out_sentence_tt != batch_out_sentence)  # token_type_ids should change output
        self.assertListEqual(expected_output_sentence, [non_padded_sentence, padded_sentence])
Example #4
 def __init__(self, hparams: Namespace):
     super().__init__()
     self.hparams = hparams
     # GPT2 is going to be frozen and fixed!
     # because of that we hide it inside the DataModule
     self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(
         self.hparams.pretrained_model)
     self.tokenizer = Tokenizer(self.hparams.pretrained_model)
     # Resize embeddings to include the added tokens
     self.gpt2.resize_token_embeddings(self.tokenizer.vocab_size)
Example #5
def get_model_tokenizer():
    global model
    global tokenizer
    if model is None:
        # Load trained model
        model = GPT2DoubleHeadsModel.from_pretrained(trained_model_path)
        # Convert model parameter tensors to device
        model.to("cpu")
        # Load trained Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained(trained_model_path)

    return model, tokenizer
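A short hedged note on what this lazy loader assumes: `model` and `tokenizer` are module-level globals initialised to None, and `trained_model_path` points at the fine-tuned checkpoint directory (the names come from the snippet; the path value below is only illustrative).

model = None
tokenizer = None
trained_model_path = "./trained_gpt2_double_heads"  # illustrative path, not from the original

model, tokenizer = get_model_tokenizer()  # first call loads the weights, later calls reuse the globals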
Example #6
def main(args):
    """
  Execute the summarization from fine-tuned GPT2 model (given in arguments CLI)
  write the summary.txt file
  """

    model = GPT2DoubleHeadsModel.from_pretrained(args.model_directory)
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_directory)

    # Add the summarization special tokens to the vocabulary (their embeddings should also be trained!)
    special_tokens = {
        'bos_token': '<|startoftext|>',
        'eos_token': '<|endoftext|>',
        'pad_token': '<pad>',
        'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
    }
    tokenizer.add_special_tokens(special_tokens)
    assert len(tokenizer) == 50261, "tokenizer size is not 50261"
    model.resize_token_embeddings(len(tokenizer))
    print(' ')

    with open(args.input_file, 'r') as file1:
        input_text = file1.read()

    model = model.to(device)
    input_text = '<|startoftext|> ' + input_text + ' <|summarize|>'
    input_token = tokenizer.encode(input_text)
    input_token_torch = torch.tensor(input_token, dtype=torch.long)

    generated_output = model.generate(
        input_ids=input_token_torch.unsqueeze(0).to(device),
        max_length=args.max_length + len(input_token),
        min_length=args.min_length + len(input_token),
        temperature=args.temperature,
        decoder_start_token_id=tokenizer.convert_tokens_to_ids('<|summarize|>'),  # generate() expects a token id, not a string
        top_k=args.top_k,
        top_p=args.top_p,
        repetition_penalty=None,
        do_sample=True,
        num_return_sequences=args.num_return_sequences)
    batch_answer = []
    for item in generated_output:
        batch_answer.append(
            tokenizer.decode(item[len(input_token):],
                             skip_special_tokens=True))
    f = open("summary.txt", "a")
    f.writelines(batch_answer)
    f.close()
Example #7
    def __init__(self, context):
        super(TransferLearning, self).__init__(context)
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
        self.model = GPT2DoubleHeadsModel.from_pretrained('gpt2-medium')

        # ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
        # ``additional_special_tokens``
        self.special_tokens = {
            'bos_token': "<bos>",
            'eos_token': "<eos>",
            'additional_special_tokens': ["<speaker1>", "<speaker2>"],
            'pad_token': "<pad>"
        }

        self.tokenizer.add_special_tokens(self.special_tokens)
        # len(tokenizer) also counts the added special tokens; vocab_size does not
        self.model.resize_token_embeddings(len(self.tokenizer))
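One detail worth keeping in mind around these resize_token_embeddings calls: for Hugging Face tokenizers, `vocab_size` reports only the base vocabulary, while `len(tokenizer)` also counts tokens added via `add_special_tokens`. A small hedged check, assuming `gpt2-medium` and the special tokens used above:

from transformers import GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained('gpt2-medium')
tok.add_special_tokens({'bos_token': "<bos>", 'eos_token': "<eos>", 'pad_token': "<pad>",
                        'additional_special_tokens': ["<speaker1>", "<speaker2>"]})
print(tok.vocab_size, len(tok))  # 50257 vs 50262: only len() includes the five added tokens
# hence resize_token_embeddings(len(tokenizer)) guarantees embeddings exist for the new ids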
Example #8
    def __init__(self, hparams: Namespace):
        super().__init__()
        self.hparams = hparams
        # GPT2 is going to be frozen and fixed!
        # because of that we hide it inside the DataModule
        self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(
            self.hparams.pretrained_model)
        self.tokenizer = Tokenizer(self.hparams.pretrained_model)
        # Resize embeddings to include the added tokens
        self.gpt2.resize_token_embeddings(self.tokenizer.vocab_size)

        ## Quantize
        if self.hparams.quantize:
            emb_qconf = torch.quantization.float_qparams_weight_only_qconfig
            self.gpt2.transformer.wte.qconfig = emb_qconf
            self.gpt2.transformer.wpe.qconfig = emb_qconf
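            # Hedged continuation sketch (not in the original snippet): the qconfig above only
            # *tags* wte/wpe; an eager-mode prepare/convert pass (PyTorch 1.7+) is what actually
            # swaps them for torch.nn.quantized.Embedding, leaving the rest of GPT-2 in float.
            # Note that GPT-2 ties lm_head.weight to wte.weight, so the LM head keeps a float
            # copy after conversion, which is worth verifying for your use case.
            torch.quantization.prepare(self.gpt2, inplace=True)
            torch.quantization.convert(self.gpt2, inplace=True)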
Example #9
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9, help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))
    
    if args.model_checkpoint == "":
        raise ValueError("Requiring a finetuned model_checkpoint")
	
    if args.seed != 0:
    	random.seed(args.seed)
    	torch.random.manual_seed(args.seed)
    	torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    config = GPT2Config(vocab_size=50003)
    model = GPT2DoubleHeadsModel(config)
    if args.model_checkpoint:
        print("\nLoad model from", args.model_checkpoint)
        model.load_state_dict(torch.load(args.model_checkpoint), strict=False)

    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    history = ''
    print('\nPlease input a sentence to chat with the chatbot!')
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history = tokenizer.tokenize(raw_text)
        with torch.no_grad():
            out_ids = sample_sequence(history, tokenizer, model, args)
        print(tokenizer.convert_ids_to_tokens(out_ids))
Example #10
# Functions and Models Prepared

#===============================================================================================#
device = torch.device("cpu")

GPT2_directory = 'Models'
tokenizer_GPT2 = GPT2Tokenizer.from_pretrained(GPT2_directory)
special_tokens = {
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<pad>',
    'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
}
tokenizer_GPT2.add_special_tokens(special_tokens)
GPT2_generator = GPT2DoubleHeadsModel.from_pretrained(GPT2_directory)

device = torch.device("cpu")
use_GPU_GPT_generator = False
if use_GPU_GPT_generator:
    GPT2_generator = GPT2_generator.to(device)
    GPT2_input_torch = GPT2_input_torch.to(device)

list_keywords = get_keywords(text)

GPT2_input = tokenizer_GPT2.encode('<|startoftext|> ' + title + list_keywords +
                                   ' <|summarize|> ')
GPT2_input_torch = torch.tensor(GPT2_input, dtype=torch.long)

temperature = 1
greedy_search = False
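The cell above prepares the keyword prompt but stops before generation. A hedged sketch of the missing call, using only the variables defined in the snippet (the length budget and top_p value are illustrative):

generated = GPT2_generator.generate(
    input_ids=GPT2_input_torch.unsqueeze(0).to(device),
    max_length=GPT2_input_torch.shape[-1] + 120,  # illustrative summary budget
    do_sample=not greedy_search,
    temperature=temperature,
    top_p=0.9,                                    # illustrative nucleus threshold
    pad_token_id=tokenizer_GPT2.pad_token_id)
summary = tokenizer_GPT2.decode(generated[0, GPT2_input_torch.shape[-1]:],
                                skip_special_tokens=True)
print(summary)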
Example #11
def main(args):
    """
  executing the training given the arguments in CLI
  output:
    write pytorch model file, and config files
    write training and validation statistics (in .json)
  """
    train_dict = {'lm_loss': [], 'mc_loss': [], 'total_loss': []}
    val_dict = {'lm_loss': [], 'mc_loss': [], 'total_loss': []}

    if args.model_directory is None:
        model = GPT2DoubleHeadsModel.from_pretrained('distilgpt2')
        tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
        special_tokens = {
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>',
            'pad_token': '<pad>',
            'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
        }
        tokenizer.add_special_tokens(special_tokens)
        print('total length of vocab should be 50261 = ', len(tokenizer))
        model.resize_token_embeddings(len(tokenizer))
        print('resized the model embedding layer')
    else:
        model = GPT2DoubleHeadsModel.from_pretrained(args.model_directory)
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_directory)
        special_tokens = {
            'bos_token': '<|startoftext|>',
            'eos_token': '<|endoftext|>',
            'pad_token': '<pad>',
            'additional_special_tokens': ['<|keyword|>', '<|summarize|>']
        }
        print('total length of vocab should be 50261 = ', len(tokenizer))

    # The embeddings of the newly added special tokens are randomly initialised and still need to be trained.
    print(' ')

    train_dataset = torch.load(args.train_data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=1)
    print('finished loading train dataset')

    val_dataset = torch.load(args.val_data)
    val_sampler = RandomSampler(val_dataset)
    val_dataloader = DataLoader(val_dataset, sampler=val_sampler, batch_size=1)
    print('finished loading validation dataset')

    model = model.to(device)
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.eps,
                      correct_bias=True)
    total_steps = len(train_dataloader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.scheduler_warmup,
        num_training_steps=total_steps)

    for epoch in range(args.epochs):
        start = timeit.default_timer()
        start_iter = timeit.default_timer()
        for iterations, batch in enumerate(train_dataloader):
            lm_loss, mc_loss, total_loss = train(args, batch, iterations,
                                                 model, optimizer, scheduler)
            train_dict['lm_loss'].append(lm_loss)
            train_dict['mc_loss'].append(mc_loss)
            train_dict['total_loss'].append(total_loss)
            if iterations % args.print_every == 0:
                stop_iter = timeit.default_timer()
                print(
                    "Trainer Results - epoch {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f} sec"
                    .format(iterations, train_dict['lm_loss'][-1],
                            train_dict['mc_loss'][-1],
                            train_dict['total_loss'][-1],
                            stop_iter - start_iter))
                start_iter = timeit.default_timer()

        print('end-of-training-epoch')
        stop = timeit.default_timer()
        print(
            "Trainer Results - epoch {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f} sec"
            .format(epoch, train_dict['lm_loss'][-1],
                    train_dict['mc_loss'][-1], train_dict['total_loss'][-1],
                    stop - start))
        print(' ')
        for iterations, batch in enumerate(val_dataloader):
            lm_loss, mc_loss, total_loss = evaluate(args, batch, model)
            val_dict['lm_loss'].append(lm_loss)
            val_dict['mc_loss'].append(mc_loss)
            val_dict['total_loss'].append(total_loss)

        print('end-of-validation-epoch')
        stop_eval = timeit.default_timer()
        print(
            "Evaluator Results - epoch {} - LM loss: {:.2f} MC loss: {:.2f} total loss: {:.2f} report time: {:.1f} sec"
            .format(epoch, val_dict['lm_loss'][-1], val_dict['mc_loss'][-1],
                    val_dict['total_loss'][-1], stop_eval - stop))
        print(' ')
    model.config.to_json_file(args.model_name + '/config.json')
    tokenizer.save_vocabulary(args.model_name)
    model_file = args.model_name + '/pytorch_model.bin'
    torch.save(model.state_dict(), model_file)
    with open(
            args.model_name + '/training_loss_' + str(args.epochs) +
            '_epoch.json', 'w') as fp:
        json.dump(train_dict, fp)
    with open(
            args.model_name + '/validation_loss_' + str(args.epochs) +
            '_epoch.json', 'w') as fq:
        json.dump(val_dict, fq)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name",
                        type=str,
                        default="gpt2",
                        help="pretrained model name")
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--train_dataset", type=str, default="")
    parser.add_argument("--eval_dataset", type=str, default="")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=16)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lm_coef", type=float, default=0.5)
    parser.add_argument("--n_valid", type=int, default=374)

    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    try:
        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
    except Exception:  # fall back to the base pretrained model if no fine-tuned checkpoint exists
        model = GPT2DoubleHeadsModel.from_pretrained(args.model_name)
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(
        len(story[:max_length]) +
        max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
        for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (
                len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(
                train_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                args.weight_decay,
            },
            {
                "params": [
                    p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args.warmup_steps,
            num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids,
                               mc_token_ids=mc_token_ids,
                               lm_labels=lm_labels,
                               mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[
                    1]  # LM loss * coef + MC loss
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss +
                                    0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_vocabulary(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        logger.info("Saving model to %s", args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits, _ = model(input_ids,
                                                    mc_token_ids=mc_token_ids,
                                                    lm_labels=lm_labels,
                                                    mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            "eval_loss": eval_loss,
            "eval_accuracy": eval_accuracy,
            "train_loss": train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.__version__

import transformers
from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel, AdamW
print('use transformers version = ',transformers.__version__) # make sure it is 2.6.0

load_model = False
load_previous_weight = False
resize_model = False


### 1 Pretrained Model setup ###
################################

model = GPT2DoubleHeadsModel.from_pretrained('distilgpt2')
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
special_tokens = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','pad_token':'<pad>','additional_special_tokens':['<|keyword|>','<|summarize|>']}

print(len(tokenizer), 'total length of vocab') # expect 50257

special_tokens = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','pad_token':'<pad>','additional_special_tokens':['<|keyword|>','<|summarize|>']}
#special_tokens2 = {'bos_token':'<|startoftext|>','eos_token':'<|endoftext|>','keyword_token':'<|keyword|>','summary_token':'<|summarize|>'}
tokenizer.add_special_tokens(special_tokens)
model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
# The newly added tokens become the last tokens of the vocabulary
resize_model = True

print(len(tokenizer), 'total length of vocab')
print(tokenizer.bos_token_id, 'bos_token')
print(tokenizer.eos_token_id, 'eos_token')
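A quick hedged check of the setup above: four genuinely new tokens are added ('<|endoftext|>' already exists as id 50256), which is why the vocabulary grows from 50257 to 50261 and the new ids sit at the end.

print(tokenizer.convert_tokens_to_ids(['<|startoftext|>', '<pad>', '<|keyword|>', '<|summarize|>']))
# expected along the lines of [50257, 50258, 50259, 50260] (exact order may vary by tokenizer version)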
Example #14
def train():
    parser = ArgumentParser()
    parser.add_argument(
        "--dataset_path",
        type=str,
        default="",
        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="gpt2",
                        help="Path, url or short name of the model")
    parser.add_argument("--num_candidates",
                        type=int,
                        default=2,
                        help="Number of candidates for training")
    parser.add_argument("--train_batch_size",
                        type=int,
                        default=16,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=4,
                        help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=1e-4, help="Learning rate")
    parser.add_argument("--lm_coef",
                        type=float,
                        default=1.0,
                        help="LM loss coefficient")
    parser.add_argument("--mc_coef",
                        type=float,
                        default=1.0,
                        help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm",
                        type=float,
                        default=1.0,
                        help="Clipping gradient norm")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument(
        "--eval_before_start",
        action='store_true',
        help="If true start with a first evaluation before training")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--fp16",
        type=str,
        default="",
        help=
        "Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")

    parser.add_argument(
        "--init_model",
        default="model/pytorch_kogpt2_676e9bcfa7.params",
        type=str,
        help=
        "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch.",
    )

    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    logger.info("Prepare tokenizer, pretrained model and optimizer.")

    config = GPT2Config(vocab_size=50000)
    model = GPT2DoubleHeadsModel(config)
    if args.init_model:
        print("Load model from ", args.init_model)
        model.load_state_dict(torch.load(args.init_model), strict=False)

    model.to(args.device)
    add_special_tokens_(model, tokenizer)
    optimizer = AdamW(model.parameters(), lr=args.lr, correct_bias=True)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if args.fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16)
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(
        args, tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
        (lm_loss), (mc_loss), *_ = model(input_ids,
                                         token_type_ids=token_type_ids,
                                         mc_token_ids=mc_token_ids,
                                         mc_labels=mc_labels,
                                         lm_labels=lm_labels)
        loss = (lm_loss * args.lm_coef +
                mc_loss * args.mc_coef) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                           args.max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(
                input_tensor.to(args.device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, mc_logits, *_ = model(
                input_ids,
                token_type_ids=token_type_ids,
                mc_token_ids=mc_token_ids,
            )
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(
                -1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted,
                    mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr",
                                [(0, args.lr),
                                 (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {
        "nll":
        Loss(torch.nn.CrossEntropyLoss(ignore_index=-100),
             output_transform=lambda x: (x[0][0], x[1][0])),
        "accuracy":
        Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))
    }
    metrics.update({
        "average_nll":
        MetricsLambda(average_distributed_scalar, metrics["nll"], args),
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(
            Events.COMPLETED, lambda _: pbar.log_message(
                "Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.init_model)
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer,
                         log_handler=OutputHandler(tag="training",
                                                   metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer,
                         log_handler=OptimizerParamsHandler(optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(evaluator,
                         log_handler=OutputHandler(tag="validation",
                                                   metric_names=list(
                                                       metrics.keys()),
                                                   another_engine=trainer),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir,
                                             'checkpoint',
                                             save_interval=1,
                                             n_saved=3)
        trainer.add_event_handler(
            Events.EPOCH_COMPLETED, checkpoint_handler,
            {'mymodel': getattr(model, 'module', model)
             })  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module',
                model).config.to_json_file(os.path.join(log_dir, CONFIG_NAME))

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            os.path.join(log_dir, checkpoint_handler._saved[-1][1]),
            os.path.join(log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
Example #15
 def __init__(self, pretrained_model_name_or_path, config):
     super(GPT2ForMultipleChoice, self).__init__()
     self.gpt2 = GPT2DoubleHeadsModel.from_pretrained(
         pretrained_model_name_or_path, config=config)
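A hedged sketch of the forward pass such a wrapper typically adds (the signature below is illustrative, not taken from the original class): it simply routes the (batch, num_choices, seq_len) inputs to GPT2DoubleHeadsModel and returns its outputs, which include the multiple-choice loss when mc_labels is supplied.

 def forward(self, input_ids, mc_token_ids, attention_mask=None,
             token_type_ids=None, mc_labels=None):
     # pass everything straight through to the double-heads model
     return self.gpt2(input_ids=input_ids,
                      attention_mask=attention_mask,
                      token_type_ids=token_type_ids,
                      mc_token_ids=mc_token_ids,
                      mc_labels=mc_labels)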
Example #16
def train(data_folder):
    checkpoint = False  # set to True if continuing to train our model, o/w false
    # set to True to chat with the unaltered GPT-2 model (at bottom of notebook)
    baseline = False
    model_file = '/gpt-2_epoch_0'

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    csv_file = data_folder + '/processed_data_final.csv'

    genre_dict = {'comedy': '<comedy>',
                  'sport': '<sport>',
                  'biography': '<biography>',
                  'romance': '<romance>',
                  'action': '<action>',
                  'adventure': '<adventure>',
                  'drama': '<drama>',
                  'sci-fi': '<sci-fi>',
                  'family': '<family>',
                  'fantasy': '<fantasy>',
                  'musical': '<musical>',
                  'crime': '<crime>',
                  'thriller': '<thriller>',
                  'short': '<short>',
                  'western': '<western>',
                  'documentary': '<documentary>',
                  'horror': '<horror>',
                  'animation': '<animation>',
                  'film-noir': '<film-noir>',
                  'music': '<music>',
                  'war': '<war>',
                  'mystery': '<mystery>'}

    genres = genre_dict.keys()

    special_tokens = ["<speaker1>", "<speaker2>"] + \
        ["<" + genre + ">" for genre in genres]

    SPECIAL_TOKENS = {"bos_token": "<bos>", "eos_token": "<eos>",
                      "additional_special_tokens": special_tokens, "pad_token": "<pad>"}

    if not baseline:
        tokenizer.add_special_tokens(SPECIAL_TOKENS)
        model.resize_token_embeddings(len(tokenizer))

    ngpu = 0
    if not baseline:
        for param in model.parameters():
            param.requires_grad = False

        # Parameters of newly constructed modules have requires_grad=True by default
        model.lm_head = nn.Linear(model.lm_head.in_features, len(tokenizer))
        model.multiple_choice_head.summary = nn.Linear(
            model.multiple_choice_head.summary.in_features, 1, bias=True)

    # retrain final fc layer and mc layer for language modeling task
    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and ngpu > 0) else "cpu")

    model = model.to(device)

    if checkpoint:
        model.load_state_dict(torch.load(model_file))

    pkl_file = data_folder + '/dialogue_data.pkl'

    dataset = DialogueDataset(pkl_file=pkl_file)
    data_size = dataset.__len__()
    batch_size = 4
    train_size = .8
    shuffle_dataset = True
    #random_seed = random.randint(1, 10000)
    random_seed = 42

    # use indexing info from dataset for splitting groups
    gss = GroupShuffleSplit(n_splits=1, train_size=train_size,
                            random_state=random_seed)  # group stratified CV

    df = get_df_data(csv_file)
    for train_idx, val_idx in gss.split(df, df['sentence_2'], df['index']):
        train_sampler = SubsetRandomSampler(train_idx)
        valid_sampler = SubsetRandomSampler(val_idx)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=valid_sampler)

    # params
    lm_losses = []
    mc_losses = []
    total_losses = []

    lm_losses_val = []
    mc_losses_val = []
    total_losses_val = []

    iters = 0
    lm_coef = 2.0
    mc_coef = 1.0

    num_epochs = 3

    lr = 6.25e-5
    max_grad_norm = 1.0
    num_training_steps = (data_size // batch_size) * num_epochs
    warmup_proportion = 0.1
    num_warmup_steps = num_training_steps * warmup_proportion

    grad_accum_steps = 8

    # In Transformers, optimizers and schedules are split up and instantiated like this:
    # To reproduce BertAdam specific behavior set correct_bias=False
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps, num_training_steps)  # PyTorch scheduler
    #scheduler = PiecewiseLinear(optimizer, "lr", [(0, lr), (num_epochs * len(train_loader), 0.0)])

    print("Starting Training Loop...")
    min_total_loss = 4000
    # For each epoch
    for epoch in range(num_epochs):
        # checkpoints
        if epoch > 0:
            torch.save(model.state_dict(),
                       "/gpt-2_epoch_{}".format(epoch))
        # For each batch in the dataloader
        for i, data in enumerate(train_loader, 0):
            model.train()

            input_ids = data[0]
            token_type_ids = data[1]
            mc_token_ids = data[2]
            lm_labels = data[3]
            mc_labels = data[4]

            output = model(input_ids, mc_token_ids=mc_token_ids, mc_labels=mc_labels,
                           token_type_ids=token_type_ids, lm_labels=lm_labels)

            lm_loss = output[0]
            mc_loss = output[1]

            total_loss = (lm_loss * lm_coef + mc_loss * mc_coef) / grad_accum_steps

            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            if i % grad_accum_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # Output training stats
            if i % 50 == 0:
                print('[%d/%d][%d/%d]\tLoss LM: %.4f\tLoss MC: %.4f\tLoss total:%.4f'
                      % (epoch, num_epochs, i, len(train_loader),
                         lm_loss.item(), mc_loss.item(), total_loss.item()))

            # Save Losses for plotting later
            lm_losses.append(lm_loss.item())
            mc_losses.append(mc_loss.item())
            total_losses.append(total_loss.item())

            curr_total_loss = total_loss.item()
            if curr_total_loss <= min_total_loss:
                min_total_loss = curr_total_loss
                best_model_wts = copy.deepcopy(model.state_dict())

            run.log('best_min_loss', float(min_total_loss))

            iters += 1
            break
        break

    return model
Example #17
def run():
    parser = ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        default="openai-gpt",
        help="Model type (openai-gpt or gpt2)",
        choices=['openai-gpt',
                 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default="",
                        help="Path, url or short name of the model")
    parser.add_argument(
        "--max_history",
        type=int,
        default=2,
        help="Number of previous utterances to keep in history")
    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample",
                        action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length",
                        type=int,
                        default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length",
                        type=int,
                        default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument(
        "--top_k",
        type=int,
        default=0,
        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument(
        "--top_p",
        type=float,
        default=0.9,
        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError(
                "Interacting with GPT2 requires passing a finetuned model_checkpoint"
            )
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    config = GPT2Config(vocab_size=50003)
    model = GPT2DoubleHeadsModel(config)
    if args.model_checkpoint:
        print("\tLoad model from ", args.model_checkpoint)
        model.load_state_dict(torch.load(args.model_checkpoint), strict=False)

    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    history = ''

    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history = tokenizer.tokenize(raw_text)
        result_set = set()
        for _ in range(0, 10):
            with torch.no_grad():
                out_ids = sample_sequence(history, tokenizer, model, args)

            # convert_ids_to_tokens returns a list, which is not hashable; join it into a string first
            result_set.add(" ".join(tokenizer.convert_ids_to_tokens(out_ids)))
        for result in result_set:
            print(result)