def get_and_tokenize_dataset(tokenizer, dataset_dir='wikitext-103', dataset_cache=None, with_labels=False):
    """ Retrieve, tokenize, encode and cache a dataset with optional labels """
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load encoded dataset from cache at %s", dataset_cache)
        encoded_dataset = torch.load(dataset_cache)
    else:
        # If the dataset is in our list of DATASETS_URL, use this URL; otherwise look for 'train.txt' and 'valid.txt' files
        if dataset_dir in DATASETS_URL:
            dataset_dir = DATASETS_URL[dataset_dir]
        else:
            dataset_dir = {'train': os.path.join(dataset_dir, 'train.txt'),
                           'valid': os.path.join(dataset_dir, 'valid.txt')}

        logger.info("Get dataset from %s", dataset_dir)
        # Download and read the dataset and replace a few tokens for compatibility with the Bert tokenizer we are using
        dataset = {}
        for split_name in ['train', 'valid']:
            dataset_file = cached_path(dataset_dir[split_name])
            with open(dataset_file, "r", encoding="utf-8") as f:
                all_lines = f.readlines()
                dataset[split_name] = [
                    line.strip(' ').replace('\n', '[SEP]').replace('<unk>', '[UNK]') for line in tqdm(all_lines)]

        # Download and read labels if needed, convert labels names to integers
        labels = {}
        if with_labels:
            for split_name in ['train', 'valid']:
                dataset_file = cached_path(dataset_dir['labels'][split_name])
                with open(dataset_file, "r", encoding="utf-8") as f:
                    all_lines = f.readlines()
                    labels[split_name] = [dataset_dir['labels']['convert'][line.strip()] for line in tqdm(all_lines)]

        # Tokenize and encode the dataset
        logger.info("Tokenize and encode the dataset")
        logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)  # No warning on sample size
        def encode(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, encode(o)) for n, o in obj.items())
            return list(encode(o) for o in tqdm(obj))
        encoded_dataset = encode(dataset)

        # Add labels if needed; for language modeling, flatten each split into one list and record the number of words for word-level ppl
        for split_name in ['train', 'valid']:
            if with_labels:
                encoded_dataset[split_name + '_labels'] = labels[split_name]
            else:
                encoded_dataset[split_name] = [ind for line in encoded_dataset[split_name] for ind in line]
                encoded_dataset[split_name + '_num_words'] = sum(len(line.split(' ')) for line in dataset[split_name])

        # Save to cache
        if dataset_cache:
            logger.info("Save encoded dataset to cache at %s", dataset_cache)
            torch.save(encoded_dataset, dataset_cache)

    return encoded_dataset
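A minimal usage sketch for get_and_tokenize_dataset above, assuming pytorch_pretrained_bert is installed and DATASETS_URL contains the 'wikitext-103' entry used as the default; the cache filename is illustrative.

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
encoded = get_and_tokenize_dataset(tokenizer,
                                   dataset_dir='wikitext-103',
                                   dataset_cache='./wikitext103_bert_cache.bin')  # illustrative path
print(len(encoded['train']), encoded['valid_num_words'])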
def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
    """ Get personalities from PERSONACHAT """
    dataset_path = dataset_path or PERSONACHAT_URL
    # To avoid using the GPT cache for GPT-2 and vice-versa
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__
    if os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at {dataset_cache}")
        personachat = torch.load(dataset_cache)
    else:
        logger.info(f"Download PERSONACHAT dataset from {dataset_path}")
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            personachat = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        personachat = tokenize(personachat)
        torch.save(personachat, dataset_cache)

    logger.info("Filter personalities")
    personalities = []
    for dataset in personachat.values():
        for dialog in dataset:
            personalities.append(dialog["personality"])

    logger.info("Gathered {} personalities".format(len(personalities)))
    return personalities
def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
    dataset_path  = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using the GPT cache for GPT-2 and vice-versa
    
    if os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        personachat = torch.load(dataset_cache)
    else:
        logger.info("Download PERSONACHAT dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            personachat = json.loads(f.read())
            
        logger.info("Tokenize and encode the dataset")
        personachat = tokenize(tokenizer, personachat)
        torch.save(personachat, dataset_cache)
    
    logger.info("Filter personalities")
    personalities = []
    for dataset in personachat.values():
        for dialog in dataset:
            personalities.append(dialog["personality"])
    
    logger.info("Gathered {} personalities".format(len(personalities)))
    return personalities
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    """ Get PERSONACHAT from S3 """
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(
        tokenizer
    ).__name__  # To avoid using the GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info(f"Load tokenized dataset from cache at {dataset_cache}")
        dataset = torch.load(dataset_cache)
    else:
        logger.info(f"Download dataset from {dataset_path}")
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset
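A hedged usage sketch for get_dataset above, assuming PERSONACHAT_URL is defined in the same module and an OpenAI GPT tokenizer from pytorch_pretrained_bert; the cache prefix is illustrative.

from pytorch_pretrained_bert import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
dataset = get_dataset(tokenizer, dataset_path=None, dataset_cache='./personachat_cache')  # cache name is an example
print(len(dataset['train']), len(dataset['valid']))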
Example #5
    def load_pretrained_model(self):

        logger.info("Loading pretrained model")
        state_dict = torch.load(cached_path(
            "https://s3.amazonaws.com/models.huggingface.co/"
            "naacl-2019-tutorial/model_checkpoint.pth"),
                                map_location=self.device)
        self.model.load_state_dict(state_dict, strict=False)
        logger.info("Pretrained model loaded!")
def download_pretrained_model():
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    
    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    
    return tempdir
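A short follow-up sketch showing how the returned temp dir might be consumed, assuming (not stated in the snippet) that HF_FINETUNED_MODEL is an OpenAI GPT style archive containing the weights, config and vocabulary files expected by pytorch_pretrained_bert's from_pretrained.

from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

model_dir = download_pretrained_model()
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_dir)  # assumes vocab/merges files are in the archive
model = OpenAIGPTLMHeadModel.from_pretrained(model_dir)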
Example #7
def download_targz_to_folder(url):
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(url)
    tempdir = tempfile.mkdtemp()

    logger.info("extracting archive file {} to temp dir {}".format(
        resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, "r:gz") as archive:
        archive.extractall(tempdir)
    return tempdir
Example #8
def get_dataset_ms(tokenizer, dataset_path, dataset_cache=None, mode="train"):
    """ get ms marco """
    if mode == "train":
        dataset_path = dataset_path or MSMARCO_TRAIN_URL
    elif mode == "valid":
        dataset_path = dataset_path or MSMARCO_DEV_URL

    dataset_cache = dataset_cache + 'posttokenization_' + "msmarco_" + mode + type(
        tokenizer
    ).__name__  # To avoid using the GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
        print("dataset loaded")
    else:
        logger.info("Download dataset from %s", dataset_path)
        ms_marco_file = cached_path(dataset_path)

        with gzip.open(ms_marco_file, "rt", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                toks = tokenizer.tokenize(obj)
                # Keep only the last max_len tokens so sequences fit the model's position embeddings
                if len(toks) > tokenizer.max_len:
                    toks = toks[-tokenizer.max_len:]
                # Note: token strings (not ids) are cached here; convert_tokens_to_ids can be applied later
                return toks
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            if isinstance(obj, int):
                return obj
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset
Example #9
def download_pretrained_model():
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL,
                                        cache_dir='./cache/')
    tempdir = tempfile.mkdtemp()

    logger.info("extracting archive file {} to temp dir {}".format(
        resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir
def download_pretrained_model():
    """ Download and extract finetuned model from S3
    
    Returns:
        str -- tempdir: filepath (possibly cached) for loading pre-trained model
    """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()

    # logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    print("extracting archive file {} to temp dir {}".format(
        resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)

    return tempdir
def load_data_lm():
    dataset_file = cached_path("https://s3.amazonaws.com/datasets.huggingface.co/wikitext-103/"
                               "wikitext-103-train-tokenized-bert.bin")
    datasets = torch.load(dataset_file)

    # Convert our encoded dataset to torch.tensors and reshape in blocks of the transformer's input length
    for split_name in ['train', 'valid']:
        tensor = torch.tensor(datasets[split_name], dtype=torch.long)
        num_sequences = (tensor.size(0) // 256) * 256
        datasets[split_name] = tensor.narrow(0, 0, num_sequences).view(-1, 256)

    n = len(datasets['valid']) // 2
    datasets['test'] = datasets['valid'][n:]
    datasets['valid'] = datasets['valid'][:n]
    datasets['train'] = datasets['train'][:1000]
    return datasets
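A brief continuation sketch, assuming the 256-token blocks returned by load_data_lm feed a language model; the batch size is illustrative.

from torch.utils.data import DataLoader

datasets = load_data_lm()
train_loader = DataLoader(datasets['train'], batch_size=8, shuffle=True)
for batch in train_loader:
    # each batch is a LongTensor of shape [batch_size, 256]
    print(batch.shape)
    break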
Example #12
def download_data(dataset_name):
    """
    下载数据集
    :param dataset_name: 数据集名称。
    :return:
    """
    url = DATASET_ARCHIVE_MAP[dataset_name]
    try:
        resolved_archive_file = cached_path(url)
    except EnvironmentError:
        logger.error("Dataset Download failed!")
        return None

    data_dir = os.path.join('data/raw', dataset_name)
    with ZipFile(resolved_archive_file, 'r') as zipObj:
        data_file_name = list(
            filter(lambda f: f.endswith('.json'), zipObj.namelist()))[0]
        zipObj.extract(data_file_name, data_dir)
        return os.path.join(data_dir, data_file_name)
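A hedged usage sketch for download_data, assuming DATASET_ARCHIVE_MAP maps dataset names to zip archives that each contain a single JSON file; the dataset name below is purely hypothetical.

import json

json_path = download_data('my_dataset')  # hypothetical key in DATASET_ARCHIVE_MAP
if json_path is not None:
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)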
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    dataset_path  = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using the GPT cache for GPT-2 and vice-versa
    
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())
        
        logger.info("Tokenize and encode the dataset")
        dataset = tokenize(tokenizer, dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    
    return dataset
 def from_pretrained(cls,
                     pretrained_model_name_or_path,
                     cache_dir=None,
                     *inputs,
                     **kwargs):
     """
     Instantiate a PreTrainedBertModel from a pre-trained model file.
     Download and cache the pre-trained model file if needed.
     """
     if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
         vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[
             pretrained_model_name_or_path]
     else:
         vocab_file = pretrained_model_name_or_path
     if os.path.isdir(vocab_file):
         vocab_file = os.path.join(vocab_file, VOCAB_NAME)
     # redirect to the cache, if necessary
     try:
         resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
     except EnvironmentError:
         logger.error(
             "Model name '{}' was not found in model name list ({}). "
             "We assumed '{}' was a path or url but couldn't find any file "
             "associated to this path or url.".format(
                 pretrained_model_name_or_path,
                 ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                 vocab_file))
         return None
     if resolved_vocab_file == vocab_file:
         logger.info("loading vocabulary file {}".format(vocab_file))
     else:
         logger.info("loading vocabulary file {} from cache at {}".format(
             vocab_file, resolved_vocab_file))
     if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
         # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
         # than the number of positional embeddings
         max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[
             pretrained_model_name_or_path]
         kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
     # Instantiate tokenizer.
     tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
     return tokenizer
    def __init__(self, batch_size: int):
        dataset_file = cached_path("https://s3.amazonaws.com/datasets.huggingface.co/trec/"
                                   "trec-tokenized-bert.bin")
        datasets = torch.load(dataset_file)
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

        for split_name in ['train', 'test']:
            # Trim the samples to the transformer's input length minus 1 & add a classification token
            datasets[split_name] = [x[:256 - 1] + [tokenizer.vocab['[CLS]']]
                                    for x in datasets[split_name]]

            # Pad the dataset to max length
            padding_length = max(len(x) for x in datasets[split_name])
            datasets[split_name] = [np.array(x + [tokenizer.vocab['[PAD]']] * (padding_length - len(x)))
                                    for x in datasets[split_name]]

        valid_size = int(0.1 * len(datasets['train']))
        c = list(zip(datasets['train'], datasets['train_labels']))
        random.shuffle(c)
        datasets['train'], datasets['train_labels'] = zip(*c)
        datasets['train'], datasets['train_labels'] = list(datasets['train']), list(datasets['train_labels'])

        datasets['valid'], datasets['valid_labels'] = datasets['train'][:valid_size], datasets['train_labels'][:valid_size]
        datasets['train'], datasets['train_labels'] = datasets['train'][valid_size:], datasets['train_labels'][valid_size:]

        train_df = pd.DataFrame(data={
            "x": datasets['train'],
            "y_target": datasets['train_labels']
        })
        val_df = pd.DataFrame(data={
            "x": datasets['valid'],
            "y_target": datasets['valid_labels']
        })
        test_df = pd.DataFrame(data={
            "x": datasets['test'],
            "y_target": datasets['test_labels']
        })

        super().__init__(train_set=DataFrameDataset(train_df), train_batch_size=batch_size, val_set=DataFrameDataset(val_df), val_batch_size=batch_size,
                         test_set=DataFrameDataset(test_df), test_batch_size=batch_size)
Example #16
def download_data(dataset_name):
    """
    下载数据集

    :param dataset_name: 数据集名称。
    :return:
    """
    url = DATASET_ARCHIVE_MAP[dataset_name]
    try:
        resolved_archive_file = cached_path(url)
    except EnvironmentError:
        logger.error("Dataset Download failed!")
        return None

    # data_dir = 'data/raw'
    # os.makedirs(data_dir, exist_ok=True)
    data_dir = os.path.join('data/raw', dataset_name)
    with ZipFile(resolved_archive_file, 'r') as zipObj:
        # Extract all the contents of zip file in current directory
        data_file_name = list(filter(lambda f: f.endswith('.json'), zipObj.namelist()))[0]
        zipObj.extract(data_file_name, data_dir)
        return os.path.join(data_dir, data_file_name)
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='gpt2',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('{} is in use...'.format(device))
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_name,
                                              special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    # model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model = GPT2DoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    #     GPT2DoubleHeadsModel.set_num_special_tokens(model, len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        num_train_optimization_steps = len(
            train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels,
                                   mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
def get_and_tokenize_dataset(tokenizer,
                             dataset_dir='wikitext-103',
                             dataset_cache=None,
                             with_labels=False):
    """ Retrieve, tokenize, encode and cache the dataset """
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load encoded dataset from cache at %s", dataset_cache)
        encoded_dataset = torch.load(dataset_cache)
    else:
        if dataset_dir in DATASETS_URL:
            dataset_dir = DATASETS_URL[dataset_dir]
        else:
            dataset_dir = {
                'train': os.path.join(dataset_dir, 'train.txt'),
                'valid': os.path.join(dataset_dir, 'valid.txt')
            }
        logger.info("Download dataset from %s", dataset_dir)
        dataset = {}
        for split_name in ['train', 'valid']:
            dataset_file = cached_path(dataset_dir[split_name])
            with open(dataset_file, "r", encoding="utf-8") as f:
                all_lines = f.readlines()
                dataset[split_name] = [idx for line in tqdm(all_lines) \
                    for idx in line.strip(' ').replace('\n', '[SEP]').replace('<unk>', '[UNK]').split(' ')]
        labels = {}
        if with_labels:
            for split_name in ['train', 'valid']:
                dataset_file = cached_path(dataset_dir['labels'][split_name])
                with open(dataset_file, "r", encoding="utf-8") as f:
                    all_lines = f.readlines()
                    labels[split_name] = [
                        dataset_dir['labels']['convert'][line.strip()]
                        for line in tqdm(all_lines)
                    ]

        logger.info("Tokenize and encode the dataset")

        def encode(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, encode(o)) for n, o in obj.items())
            return list(encode(o) for o in tqdm(obj))

        encoded_dataset = encode(dataset)

        # Add the number of words and gather in one list
        for split_name in ['train', 'valid']:
            encoded_dataset[split_name] = [
                ind for line in encoded_dataset[split_name] for ind in line
            ]
            encoded_dataset[split_name + '_num_words'] = len(
                dataset[split_name])
            if with_labels:
                encoded_dataset[split_name + '_labels'] = labels[split_name]

        if dataset_cache:
            logger.info("Save encoded dataset to cache at %s", dataset_cache)
            torch.save(encoded_dataset, dataset_cache)

    return encoded_dataset
Example #19
import json
from pytorch_pretrained_bert import cached_path
from pytorch_pretrained_bert import OpenAIGPTTokenizer
from keras_gpt_2 import load_trained_model_from_checkpoint, get_bpe_from_files, generate

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

# with open('dataset.json', "w", encoding="utf-8") as f:
#     f.write(json.dumps(dataset))
dataset = dataset['train']
dataset = dataset[:1]
print('\n')
print(dataset[0]['utterances'][1])
print('\n')
print(dataset[0]['utterances'][2])


# Tokenize and encode the dataset using our loaded GPT tokenizer
def tokenize(obj):
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, tokenize(o)) for n, o in obj.items())
    return list(tokenize(o) for o in obj)
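A short continuation sketch applying the recursive tokenize helper above to the truncated PERSONACHAT sample, mirroring how the other snippets in this listing encode the nested dialog structure.

dataset = tokenize(dataset)
# each utterance's 'history' and 'candidates' strings are now lists of token ids
print(dataset[0]['utterances'][1]['history'][-1][:10])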
                "gradient_accumulation_steps, log_dir, dataset_cache",
    defaults   =[410      , 2100      , 256              , 50000         , 10        , 16         ,
                 0.1    , 0.02             , 16         , 2.5e-4, 0.25, 200     , 1000    , "cuda",
                 4                          , "./"   , "./dataset_cache_small_gist"])

# Load a pre-defined tokenizer (BERT), create config and model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
args = Config(num_embeddings=len(tokenizer.vocab), device="cuda" if torch.cuda.is_available() else "cpu")
model = TransformerWithLMHead(args).to(args.device)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

# Download and tokenize wikitext-103 training dataset
if os.path.isfile(args.dataset_cache):
    dataset = torch.load(args.dataset_cache)
else:
    dataset_file = cached_path("https://s3.amazonaws.com/datasets.huggingface.co/wikitext-103/wiki.train.tokens")
    with open(dataset_file, "r", encoding="utf-8") as f:
        dataset = f.readlines()
    dataset = list(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(
                    line.strip(' ').replace('\n', '[SEP]').replace('<unk>', '[UNK]'))) for line in tqdm(dataset))
    dataset = torch.tensor([index for line in dataset for index in line], dtype=torch.long)
    torch.save(dataset, args.dataset_cache)

# Organize the dataset in blocs of num_max_positions tokens for the transformer
num_sequences = (dataset.size(0) // args.num_max_positions) * args.num_max_positions
dataset = dataset.narrow(0, 0, num_sequences).view(-1, args.num_max_positions)
dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True)

# Define training function
def update(engine, batch):
    model.train()
Example #21
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    """ Get PERSONACHAT from S3 """
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(
        tokenizer
    ).__name__  # To avoid using the GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path, cache_dir='./cache/')
        # personachat_file = cached_path(dataset_path, cache_dir='../../.pytorch_pretrained_bert')
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    # FIXME: only for testing, delete later:
    '''
    dataset['train'] = dataset['train'][:1]
    while len(dataset['train'][0]['utterances']) != 1:
        dataset['train'][0]['utterances'].pop()
    # dataset['valid'] = dataset['valid'][:1]
    dataset['valid'] = dataset['train']
    '''
    dataset['train'] = dataset['train'][:int(len(dataset['train']) * 0.9)]
    # dataset['train'] = dataset['train'][:int(len(dataset['train']) * 0.1)]

    # train_len = int(len(dataset['train']) * 0.9)
    # dataset['train'] = dataset['train'][int(len(dataset['train']) * 0.9):]

    # dataset['train'] = dataset['train'][: 1]
    dataset['valid'] = dataset['valid'][:1]  # do not change this carelessly!!!
    # dataset['train'] = dataset['train'][:int(len(dataset['train'])*0.9)]
    # dataset['dev'] = dataset['train'][int(len(dataset['train']) * 0.9):]

    personachat_file = cached_path(dataset_path, cache_dir='./cache/')
    # personachat_file = cached_path(dataset_path, cache_dir='../../.pytorch_pretrained_bert')
    with open(personachat_file, "r", encoding="utf-8") as f:
        org_dataset = json.loads(f.read())
    # org_dataset_tmp = org_dataset['train'][train_len:]
    # personas = defaultdict(list)
    for dataset_name in org_dataset:
        for i, dialogue in enumerate(org_dataset[dataset_name]):
            if i >= len(dataset[dataset_name]):
                break
            dataset[dataset_name][i]['persona_org'] = dialogue[
                'personality'].copy()
            '''
            for _ in range(len(dialogue['utterances'])):
                personas[dataset_name].append(dialogue['personality'])
                '''
    return dataset
Example #22
def train():
    parser = ArgumentParser()
    parser.add_argument("--model_checkpoint",
                        type=str,
                        default=PRETRAINED_MODEL_URL,
                        help="Path to the pretrained model checkpoint")
    parser.add_argument("--dataset_path",
                        type=str,
                        default='trec',
                        help="'imdb', 'trec' or a dict of splits paths.")
    parser.add_argument("--dataset_cache",
                        type=str,
                        default='./dataset_cache_fine_tune_trec',
                        help="Path or url of the dataset cache")

    parser.add_argument("--finetuning_model_class",
                        type=str,
                        default="TransformerWithClfHead",
                        help="Fine-tuning model class for the target task")
    parser.add_argument(
        "--num_classes",
        type=int,
        default=2,
        help="Number of classes for the target classification task")
    parser.add_argument(
        "--adapters_dim",
        type=int,
        default=-1,
        help="If >0 add adapters to the model wtih adapters_dim dimension")

    parser.add_argument("--clf_loss_coef",
                        type=float,
                        default=1,
                        help="If >0 add a classification loss")
    parser.add_argument("--lm_loss_coef",
                        type=float,
                        default=-1,
                        help="If >0 add a language modeling loss")

    parser.add_argument("--train_batch_size",
                        type=int,
                        default=16,
                        help="Batch size for training")
    parser.add_argument("--valid_batch_size",
                        type=int,
                        default=32,
                        help="Batch size for validation")
    parser.add_argument("--lr", type=float, default=6e-5, help="Learning rate")
    parser.add_argument("--n_warmup",
                        type=int,
                        default=500,
                        help="Number of warmup iterations")
    parser.add_argument("--max_norm",
                        type=float,
                        default=0.25,
                        help="Clipping gradient norm")
    parser.add_argument("--weight_decay",
                        type=float,
                        default=0.0,
                        help="Weight decay")
    parser.add_argument("--n_epochs",
                        type=int,
                        default=3,
                        help="Number of training epochs")
    parser.add_argument("--eval_every",
                        type=int,
                        default=100,
                        help="Evaluate every X steps (-1 => end of epoch)")
    parser.add_argument("--gradient_accumulation_steps",
                        type=int,
                        default=1,
                        help="Accumulate gradient")
    parser.add_argument("--initializer_range",
                        type=float,
                        default=0.02,
                        help="Normal initialization standard deviation")

    parser.add_argument("--device",
                        type=str,
                        default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log on main process only, logger.warning => log on all processes
    logging.basicConfig(
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(
        "Running process %d", args.local_rank
    )  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(
        args))  # This is a logger.info: only printed on the first process

    # Initialize distributed training if needed
    args.distributed = (args.local_rank != -1)
    if args.distributed:
        torch.cuda.set_device(args.local_rank)
        args.device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend='nccl',
                                             init_method='env://')

    # Loading tokenizer, pretrained model and optimizer
    logger.info("Prepare tokenizer, model and optimizer")
    tokenizer = BertTokenizer.from_pretrained(
        'bert-base-cased',
        do_lower_case=False)  # Let's use a pre-defined tokenizer

    logger.info("Create model from class %s and configuration %s",
                args.finetuning_model_class,
                os.path.join(args.model_checkpoint, CONFIG_NAME))
    ModelClass = getattr(importlib.import_module("finetuning_model"),
                         args.finetuning_model_class)
    pretraining_args = torch.load(
        cached_path(os.path.join(args.model_checkpoint, CONFIG_NAME)))
    model = ModelClass(config=pretraining_args,
                       fine_tuning_config=args).to(args.device)

    logger.info("Load pretrained weigths from %s",
                os.path.join(args.model_checkpoint, WEIGHTS_NAME))
    state_dict = torch.load(cached_path(
        os.path.join(args.model_checkpoint, WEIGHTS_NAME)),
                            map_location='cpu')
    incompatible_keys = model.load_state_dict(state_dict, strict=False)
    logger.info("Parameters discarded from the pretrained model: %s",
                incompatible_keys.unexpected_keys)
    logger.info("Parameters added in the adaptation model: %s",
                incompatible_keys.missing_keys)
    model.tie_weights()

    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)
    logger.info("Model has %s parameters",
                sum(p.numel() for p in model.parameters() if p.requires_grad))

    # Prepare model for distributed training if needed
    if args.distributed:
        model = DistributedDataParallel(model,
                                        device_ids=[args.local_rank],
                                        output_device=args.local_rank)

    logger.info("Prepare datasets")
    loaders = get_data_loaders(args,
                               tokenizer,
                               pretraining_args.num_max_positions,
                               clf_token=tokenizer.vocab['[CLS]'])
    train_loader, val_loader, train_sampler, valid_sampler = loaders

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch, labels = (t.to(args.device) for t in batch)
        inputs = batch.transpose(
            0, 1).contiguous()  # to shape [seq length, batch]
        _, (clf_loss, lm_loss) = model(
            inputs,
            clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']),
            clf_labels=labels,
            lm_labels=inputs,
            padding_mask=(batch == tokenizer.vocab['[PAD]']))
        loss = (max(0, args.clf_loss_coef) * clf_loss + max(
            0, args.lm_loss_coef) * lm_loss) / args.gradient_accumulation_steps
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch, labels = (t.to(args.device) for t in batch)
            inputs = batch.transpose(
                0, 1).contiguous()  # to shape [seq length, batch]
            _, clf_logits = model(
                inputs,
                clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']),
                padding_mask=(batch == tokenizer.vocab['[PAD]']))
        return clf_logits, labels

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed
    trainer.add_event_handler(Events.EPOCH_COMPLETED,
                              lambda _: evaluator.run(val_loader))
    if args.eval_every > 0:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda engine: evaluator.run(val_loader)
            if engine.state.iteration % args.eval_every == 0 else None)
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED,
                                  lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(
            Events.EPOCH_STARTED,
            lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Learning rate schedule: linear warm-up to lr, then linear decay to zero
    scheduler = PiecewiseLinear(optimizer, 'lr',
                                [(0, 0.0), (args.n_warmup, args.lr),
                                 (len(train_loader) * args.n_epochs, 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we average distributed metrics using average_distributed_scalar
    metrics = {"accuracy": Accuracy()}
    metrics.update({
        "average_accuracy":
        MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)
    })
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train
    if args.local_rank in [-1, 0]:
        checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving(
            trainer,
            evaluator,
            metrics,
            model,
            optimizer,
            args,
            prefix="finetune_")

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

    # On the main process: close tensorboard logger and rename the last checkpoint for easy re-loading
    if args.local_rank in [-1, 0] and args.n_epochs > 0:
        os.rename(
            checkpoint_handler._saved[-1][1][-1],
            os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME)
        )  # TODO: PR in ignite to have better access to saved file paths (cleaner)
        tb_logger.close()
    def from_pretrained(cls,
                        pretrained_model_name_or_path,
                        num_special_tokens=None,
                        state_dict=None,
                        cache_dir=None,
                        from_tf=False,
                        *inputs,
                        **kwargs):
        """
        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
        Download and cache the pre-trained model file if needed.

        Params:
            pretrained_model_name_or_path: either:
                - a str with the name of a pre-trained model to load selected in the list of:
                    . `openai-gpt`
                - a path or url to a pretrained model archive containing:
                    . `openai_gpt_config.json` a configuration file for the model
                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
                - a path or url to a pretrained model archive containing:
                    . `config.json` a configuration file for the model
                    . a series of NumPy files containing OpenAI TensorFlow trained weights
            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
            *inputs, **kwargs: additional input for the specific Bert class
                (ex: num_labels for BertForSequenceClassification)
        """
        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[
                pretrained_model_name_or_path]
            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[
                pretrained_model_name_or_path]
        else:
            archive_file = os.path.join(pretrained_model_name_or_path,
                                        WEIGHTS_NAME)
            config_file = os.path.join(pretrained_model_name_or_path,
                                       CONFIG_NAME)
        # redirect to the cache, if necessary
        try:
            resolved_archive_file = cached_path(archive_file,
                                                cache_dir=cache_dir)
            resolved_config_file = cached_path(config_file,
                                               cache_dir=cache_dir)
        except EnvironmentError:
            logger.error(
                "Model name '{}' was not found in model name list ({}). "
                "We assumed '{}' was a path or url but couldn't find files {} and {} "
                "at this path or url.".format(
                    pretrained_model_name_or_path,
                    ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
                    pretrained_model_name_or_path, archive_file, config_file))
            return None
        if resolved_archive_file == archive_file and resolved_config_file == config_file:
            logger.info("loading weights file {}".format(archive_file))
            logger.info("loading configuration file {}".format(config_file))
        else:
            logger.info("loading weights file {} from cache at {}".format(
                archive_file, resolved_archive_file))
            logger.info(
                "loading configuration file {} from cache at {}".format(
                    config_file, resolved_config_file))
        # Load config
        config = OpenAIGPTConfig.from_json_file(resolved_config_file)
        logger.info("Model config {}".format(config))
        # Instantiate model.
        model = cls(config, *inputs, **kwargs)
        if state_dict is None and not from_tf:
            state_dict = torch.load(resolved_archive_file, map_location='cpu')
        if from_tf:
            # Directly load from a TensorFlow checkpoint (stored as NumPy array)
            return load_tf_weights_in_openai_gpt(model, resolved_archive_file)

        old_keys = []
        new_keys = []
        for key in state_dict.keys():
            new_key = None
            if key.endswith(".g"):
                new_key = key[:-2] + ".weight"
            elif key.endswith(".b"):
                new_key = key[:-2] + ".bias"
            elif key.endswith(".w"):
                new_key = key[:-2] + ".weight"
            if new_key:
                old_keys.append(key)
                new_keys.append(new_key)
        for old_key, new_key in zip(old_keys, new_keys):
            state_dict[new_key] = state_dict.pop(old_key)

        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, "_metadata", None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=""):
            local_metadata = {} if metadata is None else metadata.get(
                prefix[:-1], {})
            module._load_from_state_dict(state_dict, prefix, local_metadata,
                                         True, missing_keys, unexpected_keys,
                                         error_msgs)
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + ".")

        start_model = model
        if hasattr(model, "transformer") and all(
                not s.startswith('transformer.') for s in state_dict.keys()):
            start_model = model.transformer
        load(start_model, prefix="")

        if len(missing_keys) > 0:
            logger.info(
                "Weights of {} not initialized from pretrained model: {}".
                format(model.__class__.__name__, missing_keys))
        if len(unexpected_keys) > 0:
            logger.info(
                "Weights from pretrained model not used in {}: {}".format(
                    model.__class__.__name__, unexpected_keys))
        if len(error_msgs) > 0:
            raise RuntimeError(
                "Error(s) in loading state_dict for {}:\n\t{}".format(
                    model.__class__.__name__, "\n\t".join(error_msgs)))

        # Add additional embeddings for special tokens if needed
        # This step also makes sure we are still sharing the output and input embeddings after loading weights
        model.set_num_special_tokens(num_special_tokens if num_special_tokens
                                     is not None else config.n_special)
        return model
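A hedged usage sketch for the classmethod above, assuming it belongs to an OpenAIGPTPreTrainedModel subclass such as OpenAIGPTDoubleHeadsModel as in pytorch_pretrained_bert.

model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', num_special_tokens=3)
model.eval()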
Example #24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the CSQA dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
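    # Note: the ROCSTORIES_URL download above appears to be a leftover from the original
    # ROCStories script; the splits actually used below come from load_csqa_dataset.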

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
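    # For illustration (assuming the CSQA rows used below look like
    # ("question", "answer1", "answer2", "answer3", label)): strings become lists of token ids
    # and integer labels pass through unchanged.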

    logger.info("Encoding dataset...")
    train_dataset = load_csqa_dataset(args.train_dataset)

    print("Splitting train 90-10 into train-dev.")
    dev_dataset = train_dataset[int(len(train_dataset) * 0.9):]
    train_dataset = train_dataset[:int(len(train_dataset) * 0.9)]
    test_dataset = load_csqa_dataset(args.eval_dataset)
    datasets = (train_dataset, dev_dataset, test_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
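    # (Assumed encoding per answer choice: [_start_] question [_delimiter_] answer [_classify_],
    # which is where the "+ 3" below comes from.)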
    input_length = max(
        len(question[:max_length]) +
        max(len(answer1[:max_length]), len(answer2[:max_length]),
            len(answer3[:max_length])) + 3 for dataset in encoded_datasets
        for question, answer1, answer2, answer3, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    dev_tensor_dataset = tensor_datasets[1]
    test_tensor_dataset = tensor_datasets[2]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    dev_data = TensorDataset(*dev_tensor_dataset)
    dev_sampler = RandomSampler(dev_data)
    dev_dataloader = DataLoader(dev_data,
                                sampler=dev_sampler,
                                batch_size=args.train_batch_size)

    test_data = TensorDataset(*test_tensor_dataset)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data,
                                 sampler=test_sampler,
                                 batch_size=args.eval_batch_size)
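
    # Train and dev batches are shuffled via RandomSampler; the test set is read in order
    # with SequentialSampler.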

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
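    # Parameters whose names contain any of the no_decay substrings (biases, LayerNorm) go into
    # the second group below and are excluded from weight decay.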
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)
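    # Note (assumption about pytorch_pretrained_bert's OpenAIAdam): warmup scheduling and
    # max_grad_norm clipping are handled inside optimizer.step(), so the training loop below
    # only needs step() and zero_grad().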

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        best_dev_accuracy = 0
        test_acc_best_dev = 0
        best_dev_epoch = 0
        no_up = 0
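        # no_up counts consecutive epochs without a new best dev accuracy; training stops early
        # once it reaches 10 (see the check at the end of the epoch loop).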
        tqdm_epoch = tqdm(range(args.num_train_epochs), desc="Epoch")
        for epoch in tqdm_epoch:
            model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
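                # For OpenAIGPTDoubleHeadsModel, losses[0] is the language-modeling loss and
                # losses[1] the multiple-choice loss; lm_coef weights the auxiliary LM term.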
                loss = args.lm_coef * losses[0] + losses[1]

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None else
                                    0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            # train_loss, train_accuracy = evaluate(model, device, train_dataloader, desc="Evaluate Train")
            dev_loss, dev_accuracy = evaluate(model,
                                              device,
                                              dev_dataloader,
                                              desc="Evaluate Dev")
            test_loss, test_accuracy = evaluate(model,
                                                device,
                                                test_dataloader,
                                                desc="Evaluate Test")

            train_loss = tr_loss / nb_tr_steps if args.do_train else None

            if dev_accuracy >= best_dev_accuracy:
                # New best model.
                best_dev_accuracy = dev_accuracy
                test_acc_best_dev = test_accuracy
                best_dev_epoch = epoch + 1
                no_up = 0

                # Save the new best model.
                model_to_save = model.module if hasattr(
                    model, 'module') else model  # Only save the model itself, not a DataParallel wrapper
                output_model_file = os.path.join(args.output_dir,
                                                 "pytorch_model.bin")
                torch.save(model_to_save.state_dict(), output_model_file)
            else:
                no_up += 1

            tqdm.write("\t ***** Eval results (Epoch %s) *****" %
                       str(epoch + 1))
            # tqdm.write("\t train_accuracy = %s" % str(train_accuracy))
            tqdm.write("\t dev_accuracy = %s" % str(dev_accuracy))
            tqdm.write("")
            tqdm.write("\t best_dev_accuracy = %s" % str(best_dev_accuracy))
            tqdm.write("\t test_acc_best_dev = %s" % str(test_acc_best_dev))
            tqdm.write("\t best_dev_epoch = %s" % str(best_dev_epoch))
            tqdm.write("\t no_up = %s" % str(no_up))
            tqdm.write("")

            if no_up >= 10:
                tqdm_epoch.close()
                break