Example #1
 def __init__(self, model_path: str = None) -> None:
     super().__init__()
     "Requires the BertTokenizer from pytorch_transformers"
     # pip install pytorch_transformers
     import os
     import torch
     from pytorch_transformers import BertTokenizer, cached_path
     from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
     try:
         self.device = torch.device(
             "cuda:0" if torch.cuda.is_available() else "cpu")
         self.config = torch.load(
             cached_path(os.path.join(model_path,
                                      "model_training_args.bin")))
         self.model = TransformerWithClfHeadAndAdapters(
             self.config["config"],
             self.config["config_ft"]).to(self.device)
         state_dict = torch.load(cached_path(
             os.path.join(model_path, "model_weights.pth")),
                                 map_location=self.device)
         self.model.load_state_dict(state_dict)
         self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased',
                                                        do_lower_case=False)
     except Exception as e:
         raise Exception(
             "Requires a valid transformer model file ({0}/model_weights.pth) "
             "and its config file ({0}/model_training_args.bin).".format(
                 model_path)) from e
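The snippet above is only the constructor of a wrapper class; the enclosing class name is not part of the listing. A minimal usage sketch, with the class name and model directory as assumptions:

# Hypothetical class name; model_path must contain model_weights.pth and
# model_training_args.bin, as required by the __init__ above.
classifier = TransformerClassifier(model_path="./models/transformer")
print(classifier.device)           # cuda:0 or cpu
print(type(classifier.tokenizer))  # BertTokenizer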
Example #2
def load_pretrained_model(args):
    "download pre-trained model and config"
    state_dict = torch.load(cached_path(os.path.join(args.model_checkpoint, "model_checkpoint.pth")),
                            map_location='cpu')
    config = torch.load(cached_path(os.path.join(args.model_checkpoint, "model_training_args.bin")))
    # Initialize model: Transformer base + classifier head
    model = TransformerWithClfHeadAndAdapters(config=config, fine_tuning_config=args).to(args.device)
    incompatible_keys = model.load_state_dict(state_dict, strict=False)
    print(f"Parameters discarded from the pretrained model: {incompatible_keys.unexpected_keys}")
    print(f"Parameters added in the model: {incompatible_keys.missing_keys}")

    if args.adapters_dim > 0:
        # Freeze everything except the embeddings, classification head and adapter layers
        for name, param in model.named_parameters():
            if 'embeddings' not in name and 'classification' not in name and 'adapters_1' not in name and 'adapters_2' not in name:
                param.detach_()
                param.requires_grad = False
            else:
                param.requires_grad = True
        full_parameters = sum(p.numel() for p in model.parameters())
        trained_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)

        print(f"\nWe will train {trained_parameters:,} parameters out of {full_parameters:,}"
              f" (i.e. {100 * trained_parameters/full_parameters:.1f}%) of the full parameters")

    return model, state_dict, config
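A hedged invocation sketch for load_pretrained_model; the fields on args below are inferred from how they are used inside the function and are not part of the original snippet:

from argparse import Namespace

import torch

# Hypothetical argument object; adjust paths and dimensions to your setup.
args = Namespace(
    model_checkpoint="./model_dir",  # must contain model_checkpoint.pth and model_training_args.bin
    device="cuda" if torch.cuda.is_available() else "cpu",
    adapters_dim=32,                 # > 0 freezes everything except embeddings, classifier head and adapters
)
model, state_dict, config = load_pretrained_model(args)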
Example #3
def get_dataset(tokenizer, dataset_path, dataset_cache):
    """ Get tokenized PERSONACHAT dataset from S3 or cache."""
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(
        tokenizer
    ).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        torch.save(dataset, dataset_cache)
    return dataset
def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
    """ Get personalities from PERSONACHAT """
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(
        tokenizer
    ).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
    if os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        personachat = torch.load(dataset_cache)
    else:
        logger.info("Download PERSONACHAT dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            personachat = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        personachat = tokenize(personachat)
        torch.save(personachat, dataset_cache)

    logger.info("Filter personalities")
    personalities = []
    for dataset in personachat.values():
        for dialog in dataset:
            personalities.append(dialog["personality"])

    logger.info("Gathered {} personalities".format(len(personalities)))
    return personalities
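A sketch of how the returned personalities are typically consumed (not part of the listing; chain comes from itertools):

import random
from itertools import chain

personalities = get_dataset_personalities(tokenizer, dataset_path=None, dataset_cache="./dataset_cache")
personality = random.choice(personalities)  # one personality is a list of encoded sentences
logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))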
 def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None,
                     *inputs, **kwargs):
     """
     Instantiate a PreTrainedBertModel from a pre-trained model file.
     Download and cache the pre-trained model file if needed.
     """
     if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
         vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
             pretrained_model_name_or_path]
     else:
         vocab_file = pretrained_model_name_or_path
     if os.path.isdir(vocab_file):
         vocab_file = os.path.join(vocab_file, VOCAB_NAME)
     # redirect to the cache, if necessary
     try:
         resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
     except EnvironmentError:
         return None
     if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
         # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
         # than the number of positional embeddings
         max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
             pretrained_model_name_or_path]
         kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
     # Instantiate tokenizer.
     tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
     return tokenizer
def download_pretrained_model():
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    #logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir
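The returned temp directory is usually handed straight to from_pretrained; a sketch assuming an OpenAI GPT style model (the listing does not show which model class the archive actually contains):

model_dir = download_pretrained_model()
# Model/tokenizer classes are assumptions; use the ones matching the downloaded archive.
model = OpenAIGPTLMHeadModel.from_pretrained(model_dir)
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_dir)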
Example #7
 def from_pretrained(
     cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs
 ):
     """
     Instantiate a PreTrainedBertModel from a pre-trained model file.
     Download and cache the pre-trained model file if needed.
     """
     if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
         vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
     else:
         vocab_file = pretrained_model_name_or_path
     if os.path.isdir(vocab_file):
         vocab_file = os.path.join(vocab_file, VOCAB_NAME)
     # redirect to the cache, if necessary
     try:
         resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
     except EnvironmentError:
         logger.error(
             "Model name '{}' was not found in model name list ({}). "
             "We assumed '{}' was a path or url but couldn't find any file "
             "associated to this path or url.".format(
                 pretrained_model_name_or_path,
                 ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                 vocab_file,
             )
         )
         return None
     if resolved_vocab_file == vocab_file:
         logger.info("loading vocabulary file {}".format(vocab_file))
     else:
         logger.info(
             "loading vocabulary file {} from cache at {}".format(
                 vocab_file, resolved_vocab_file
             )
         )
     if (
         pretrained_model_name_or_path
         in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP
     ):
         # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
         # than the number of positional embeddings
         max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
             pretrained_model_name_or_path
         ]
         kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
     # Instantiate tokenizer.
     tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
     return tokenizer
Example #8
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    """ Get PERSONACHAT from S3 """
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())
        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                tokenize.dict_key_calls += 1
                logger.debug(
                    'Encoding {}. obj.keys() = {}, obj.items().__len__() = {}'.format(tokenize.dict_key_calls,
                                                                                      obj.keys(),
                                                                                      obj.items().__len__()))
                return dict((n, tokenize(o)) for n, o in obj.items())
            min_samples_for_multiprocessing = 100
            if obj.__len__() > min_samples_for_multiprocessing:
                logger.debug('  Encoding VERY LONG list of obj.__len__() = {}'.format(obj.__len__()))
                logger.debug('  Encoding list with multiprocessing...')
                """functools.partial does not work because tokenizer has to be handed recursively
                together with obj to worker_tokenize again. As a workaround for not knowing how to
                handle the star operator for possible dict output and **kwargs input, list_args is used."""
                with mp.Pool(processes=mp.cpu_count() - 1) as pool:
                    results = pool.map(func=worker_tokenize,
                                       iterable=[[o, tokenizer] for o in obj])
                return results
            else:
                logger.debug('  Encoding list of obj.__len__() = {}'.format(obj.__len__()))
                return list(tokenize(o) for o in obj)

        tokenize.dict_key_calls = 0

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset
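worker_tokenize is referenced above but not shown in the listing; a minimal sketch of what it might look like, derived from how pool.map is called (each work item is packed as [obj, tokenizer]). It has to live at module level so multiprocessing can pickle it:

def worker_tokenize(list_args):
    # list_args packs [obj, tokenizer] because pool.map passes a single argument per item
    obj, tokenizer = list_args
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return {n: worker_tokenize([o, tokenizer]) for n, o in obj.items()}
    return [worker_tokenize([o, tokenizer]) for o in obj]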
Example #9
def get_dataset(tokenizer,
                dataset_path,
                dataset_cache=None,
                task=None,
                return_cachepath=False):
    dataset_cache = dataset_cache + '_' + type(
        tokenizer
    ).__name__ + '_' + task  # To avoid using GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        dataset_file = cached_path(dataset_path)
        with open(dataset_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict(
                    (n, o) if (n == 'id' or n == 'turn_id') else (n,
                                                                  tokenize(o))
                    for n, o in obj.items())
            if isinstance(obj, int):
                return obj
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    if return_cachepath:
        return dataset, dataset_cache
    else:
        return dataset
Example #10
 def download_dataset(self, dataset_path):
     # download personachat to dataset_path
     msg = "Downloading personachat from S3 into {}"
     logger.info(msg.format(dataset_path))
     return cached_path(PERSONACHAT_URL, dataset_path)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({
        'cls_token': '<CLS>',
        'sep_token': '<SEP>',
        'pad_token': '<PAD>',
        'eos_token': '<EOS>'
    })
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    special_tokens_ids = [
        tokenizer.convert_tokens_to_ids(special_token)
        for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']
    ]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps //\
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader)\
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()  # step the LR schedule after the optimizer update
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def move_cached(name, cache_dir, out_path):
    cached_vocab = pt.cached_path(name, cache_dir=cache_dir)
    logger.info("Moving cached vocab {} to {}".format(cached_vocab, out_path))
    os.rename(cached_vocab, out_path)
    os.remove(cached_vocab + '.json')
def train(
    distributed=False, local_rank=-1, lr = 6.25e-5, dataset_path='../data/personachat_self_original.json', 
    dataset_cache=cached_path('../data/personachat_self_original.json'),
    model_checkpoint='gpt2', num_candidates=2, max_history=5, train_batch_size=2, valid_batch_size=2,
    gradient_accumulation_steps=8, lm_coef=1.0, mc_coef=1.0, max_norm=1.0, n_epochs=10, 
    personality_permutations=1, eval_before_start=False, device = 'cuda' if torch.cuda.is_available() else 'cpu',
    fp16=''
    ):
    '''
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()
    
    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))
    '''
    
    args = None
    
    # Initialize distributed training if needed
    distributed = (local_rank != -1)
    if distributed:
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    #logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    print(f'{datetime.now()}: Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning')
    
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    
    # We will use 5 special tokens:
    # - <bos> to indicate the start of the sequence
    # - <eos> to indicate the end of the sequence
    # - <speaker1> to indicate the beginning and the tokens of an utterance from the user
    # - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
    # - <pad> as a padding token to build batches of sequences
    special_tokens = {
        'bos_token': '<bos>',
        'eos_token': '<eos>',
        'additional_special_tokens': ['<speaker1>', '<speaker2>'],
        'pad_token': '<pad>'
    }

    # We can add these special tokens to the vocabulary and the embeddings of the model:
    tokenizer.add_special_tokens(special_tokens)
    #model.config.num_special_tokens = len(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    
    if not distributed:  # keep the per-rank device chosen above when running distributed
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last)
    if fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16)
    if distributed:
        model = DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

    #logger.info("Prepare datasets")
    print(f'{datetime.now()}: prepare datasets')
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(device) for input_tensor in batch)
        
        lm_loss, mc_loss, _, _, _ = model(*batch)
        loss = (lm_loss * lm_coef + mc_loss * mc_coef) / gradient_accumulation_steps
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        if engine.state.iteration % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            #logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            print(f'{datetime.now()}: {tokenizer.decode(input_ids[0, -1, :].tolist())}')
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)
    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, lr), (n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics 
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    #metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
    #                "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir='../logs')
        #tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        #tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        #tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        #checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        #trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, 
        #                          {'mymodel': getattr(model, 'module', model)})  
        # "getattr" take care of distributed encapsulation

        #torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin')
        #getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        #tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint (for easy re-loading with OpenAIGPTModel.from_pretrained method)
    #if local_rank in [-1, 0] and n_epochs > 0:
    #    os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))  # TODO: PR in ignite to have better access to saved file paths (cleaner)
    #    tb_logger.close()


# In[14]:


train(dataset_path='../data/ql_dataset.json', 
    dataset_cache=cached_path('../data/ql_dataset.json'), n_epochs=100, train_batch_size=10, valid_batch_size=10)


# In[22]:


def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
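The body of top_filtering is cut off in this listing; for context, a typical call site during sampling looks roughly like this (a sketch, not the original code):

import torch
import torch.nn.functional as F

logits = top_filtering(logits, top_k=0, top_p=0.9)    # mask out unlikely tokens
probs = F.softmax(logits, dim=-1)
next_token = torch.multinomial(probs, num_samples=1)  # sample one token id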
Example #15
# start training; run until the final epoch has finished
train()

# data to predict on, from the Shopee test set
new_test_df = pd.read_csv('../input/test.csv')
new_test_df['review'] = new_test_df['review'].apply(
    removeNumbersAndPunctuations)
new_test_df['review'] = new_test_df['review'].apply(removeSpaces)
new_test_df['review'] = new_test_df['review'].apply(lowerWords)
new_test_df['review'] = new_test_df['review'].apply(removeStopWords)

# path to where you save your model .pth and .bin generated by the above function
model_path = "./tmp_data"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
config = torch.load(
    cached_path(os.path.join(model_path, "model_training_args.bin")))
model = TransformerWithClfHeadAndAdapters(config["config"],
                                          config["config_ft"]).to(device)
state_dict = torch.load(cached_path(
    os.path.join(model_path, "model_weights.pth")),
                        map_location=device)

model.load_state_dict(state_dict)  # Load model state dict
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased', do_lower_case=False)  # Load tokenizer

clf_token = tokenizer.vocab['[CLS]']  # classifier token
pad_token = tokenizer.vocab['[PAD]']  # pad token

max_length = config['config'].num_max_positions  # Max length from the trained model
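From here a single review can be encoded, truncated to max_length and terminated with the classifier token; only the input construction is sketched below, since the forward signature of TransformerWithClfHeadAndAdapters is not visible in this listing:

text = new_test_df['review'].iloc[0]
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
ids = ids[:max_length - 1] + [clf_token]  # truncate, then append the [CLS]-style classifier token
inputs = torch.tensor(ids, dtype=torch.long, device=device)
# How the batch/sequence axes must be arranged for model(...) depends on the training repo.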