import os

import torch

from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
                                                     OpenAIGPTConfig, OpenAIGPTModel,
                                                     load_tf_weights_in_openai_gpt)


def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
    # Construct model
    if openai_config_file == "":
        config = OpenAIGPTConfig()
    else:
        config = OpenAIGPTConfig(openai_config_file)
    model = OpenAIGPTModel(config)

    # Load weights from numpy
    load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)

    # Save pytorch-model
    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
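
# A minimal usage sketch (added; the paths below are hypothetical placeholders):
if __name__ == "__main__":
    dump_dir = "./gpt_output"
    os.makedirs(dump_dir, exist_ok=True)
    convert_openai_checkpoint_to_pytorch(
        openai_checkpoint_folder_path="./openai_checkpoint",  # folder holding the TF/numpy weights
        openai_config_file="",  # "" falls back to the default OpenAIGPTConfig
        pytorch_dump_folder_path=dump_dir)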

Code example #2

import math
import os

import torch

from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
from pytorch_pretrained_bert.modeling import BertConfig
from pytorch_pretrained_bert.modeling_openai import OpenAIGPTConfig, WEIGHTS_NAME, CONFIG_NAME

model_path = 'openai-gpt'
output_dir = './language-quality-subreward/gpt_output'
WEIGHTS_NAME = 'pytorch_model.bin'  # redundant: same value as the imported WEIGHTS_NAME
special_tokens = ['_start_', '_delimiter_', '_classify_']
# Load pre-trained model (weights)
with torch.no_grad():
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    config = OpenAIGPTConfig(output_config_file)

    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    model_state_dict = torch.load(output_model_file, map_location='cpu')
    model = OpenAIGPTLMHeadModel(config)
    model.load_state_dict(model_state_dict)

    # model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
    # model.load_state_dict(torch.load(output_model_file, map_location='cpu'))
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path, cache_dir='./tmp/', special_tokens=special_tokens)
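
# Hedged sketch (added): score a sentence with the loaded LM; the sentence is a
# hypothetical example. With lm_labels set, the model returns the LM loss.
sentence = "preheat the oven to 350 degrees ."
token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
input_ids = torch.tensor([token_ids])
with torch.no_grad():
    loss = model(input_ids, lm_labels=input_ids)
print("perplexity: {:.2f}".format(math.exp(loss.item())))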

Code example #3
    def __init__(self, args, tokenizer):

        self.args = args

        self.nli_tokenizer = BertTokenizer.from_pretrained(
            args.bert_model,
            do_lower_case=args.do_lower_case,
            cache_dir='.pytorch_pretrained_bert')
        self.output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        self.output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        self.nli_config = BertConfig(self.output_config_file)
        self.nli_model = BertForSequenceClassification(self.nli_config,
                                                       num_labels=3)
        self.nli_model.load_state_dict(
            torch.load(self.output_model_file,
                       map_location=torch.device('cpu')))
        self.nli_model.to(args.device)
        self.nli_model.eval()

        if args.nli_uu_reward or args.nli_allres_reward:
            uu_output_config_file = os.path.join(args.uu_output_dir,
                                                 CONFIG_NAME)
            uu_output_model_file = os.path.join(args.uu_output_dir,
                                                WEIGHTS_NAME)
            self.uu_nli_config = BertConfig(uu_output_config_file)
            self.uu_nli_model = BertForSequenceClassification(
                self.uu_nli_config, num_labels=3)
            self.uu_nli_model.load_state_dict(
                torch.load(uu_output_model_file,
                           map_location=torch.device('cpu')))
            self.uu_nli_model.to(args.device)
            self.uu_nli_model.eval()

        bert_emb_modelpath = "bert-base-uncased"
        self.bert_emb_tokenizer = BertTokenizer.from_pretrained(
            bert_emb_modelpath, cache_dir='.pytorch_pretrained_bert')
        self.bert_emb_model = BertModel.from_pretrained(
            bert_emb_modelpath,
            cache_dir='.pytorch_pretrained_bert').to(args.device)
        self.bert_emb_model.eval()

        self.tokenizer = tokenizer

        if args.lm_reward:
            lm_model_path = 'openai-gpt'
            lm_output_dir = 'language-quality-subreward/gpt_output'
            lm_special_tokens = ['_start_', '_delimiter_', '_classify_']
            # Load pre-trained model (weights)
            with torch.no_grad():
                lm_output_config_file = os.path.join(lm_output_dir,
                                                     CONFIG_NAME)
                lm_config = OpenAIGPTConfig(lm_output_config_file)

                lm_output_model_file = os.path.join(lm_output_dir,
                                                    WEIGHTS_NAME)
                #lm_model_state_dict = torch.load(lm_output_model_file)
                lm_model_state_dict = torch.load(lm_output_model_file,
                                                 map_location='cpu')
                self.lm_model = OpenAIGPTLMHeadModel(lm_config)
                self.lm_model.load_state_dict(lm_model_state_dict)

                # Load pre-trained model tokenizer (vocabulary)
                self.lm_tokenizer = OpenAIGPTTokenizer.from_pretrained(
                    lm_model_path,
                    special_tokens=lm_special_tokens,
                    cache_dir='.pytorch_pretrained_bert')

            self.special_tokens_ids = list(
                self.lm_tokenizer.convert_tokens_to_ids(token)
                for token in lm_special_tokens)
            self.lm_model.to(args.device)
            self.lm_model.eval()
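
    # Hedged sketch (not part of the original class): how the LM loaded above
    # could score a response; the method name `lm_score` is hypothetical.
    def lm_score(self, text):
        ids = self.lm_tokenizer.convert_tokens_to_ids(
            self.lm_tokenizer.tokenize(text))
        input_ids = torch.tensor([ids], device=self.args.device)
        with torch.no_grad():
            loss = self.lm_model(input_ids, lm_labels=input_ids)
        return -loss.item()  # lower LM loss -> higher reward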
Code example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions "
                             "and checkpoints will be written.")
    parser.add_argument('--train_dataset',
                        type=str,
                        default='./train_recipes.json')
    parser.add_argument('--eval_dataset',
                        type=str,
                        default='./val_recipes.json')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=10)
    parser.add_argument('--train_batch_size', type=int, default=2)
    parser.add_argument('--eval_batch_size', type=int, default=2)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-6)
    parser.add_argument('--warmup_proportion', type=float, default=0.1)
    parser.add_argument('--lr_schedule', type=str, default='warmup_cosine')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # Loading the tokenizer this way also adds the new `special tokens` and their embeddings
    # These new embeddings will be fine-tuned on the recipe dataset below
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    config = OpenAIGPTConfig()
    #model = OpenAIGPTLMHeadModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    # Note: this builds a freshly initialized GPT from the default config; the
    # commented line above would load the pretrained weights instead
    model = OpenAIGPTLMHeadModel(config)
    model.set_num_special_tokens(len(special_tokens))  # resize embeddings for the added tokens
    model.to(device)

    # Load and encode the datasets
    '''
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)
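
    # For example (illustrative): tokenize_and_encode(("two words", 3)) returns
    # [[<id of "two">, <id of "words">], 3], recursing through nested sequences.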

    logger.info("Encoding dataset...")
    train_dataset = load_recipes_dataset(args.train_dataset)
    train_dataset = train_dataset

    #remove extra length train data

    print(train_dataset[0])
    eval_dataset = load_recipes_dataset(args.eval_dataset)
    print(len(eval_dataset))
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    selected_train_data = []
    print(len(encoded_datasets[0]))
    for ins in encoded_datasets[0]:
        if len(ins) <= 510:
            selected_train_data.append(ins)

    encoded_datasets[0] = selected_train_data

    print(len(encoded_datasets[0]))

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions - 2
    print(max_length)
    print(encoded_datasets[0][0])
    input_length = max(
        len(story[:max_length]) + 2 for dataset in encoded_datasets
        for story in dataset)
    input_length = min(input_length,
                       model.config.n_positions)  # max input size for the pre-trained model
    print(input_length)
    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset = tensor_datasets[0]
    eval_tensor_dataset = tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    # Apply weight decay to all parameters except biases and LayerNorm weights
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    # debug: number of warmup steps at a 0.002 warmup proportion
    print(.002 * num_train_optimization_steps)

    total_loss = 0
    total_length = 0

    # debug: inspect the transformer blocks
    print(model.transformer.h)
    '''
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
        
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(train_dataloader, desc="Pre LM training train data ppl")
        for step, batch in enumerate(tqdm_bar):
            #print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels = lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            #print(np.sum(lengths))
            total_loss+=loss.item()*np.sum(lengths)
            total_length+=np.sum(lengths)

    print(total_loss/total_length)

    total_loss = 0
    total_length = 0
    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.eval()
    
        tr_loss = 0
        nb_tr_steps = 0
        tqdm_bar = tqdm(eval_dataloader, desc="Pre LM training val data ppl")
        for step, batch in enumerate(tqdm_bar):
            #print(batch)
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels = batch
            loss = model(input_ids, lm_labels = lm_labels)
            lengths = mc_token_ids.to('cpu').numpy()
            #print(np.sum(lengths))
            total_loss+=loss.item()*np.sum(lengths)
            total_length+=np.sum(lengths)

    print(total_loss/total_length)
    '''
    if args.do_train:
        print("=" * 80 + '\n')
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            model.train()  # re-enter train mode (the ppl passes below put the model in eval)
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                #print(batch)
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)

                loss.backward()
                optimizer.step()
                optimizer.zero_grad()  # clear gradients; without this they accumulate across steps
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

            total_loss = 0
            total_length = 0
            model.eval()

            tqdm_bar = tqdm(train_dataloader,
                            desc="Post LM training train data ppl")
            with torch.no_grad():  # no gradients needed for the ppl pass
                for step, batch in enumerate(tqdm_bar):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, mc_token_ids, lm_labels = batch
                    loss = model(input_ids, lm_labels=lm_labels)
                    lengths = mc_token_ids.to('cpu').numpy()
                    total_loss += loss.item() * np.sum(lengths)
                    total_length += np.sum(lengths)

            print(total_loss / total_length)  # length-weighted average LM loss on training data

            total_loss = 0
            total_length = 0
            model.eval()

            tqdm_bar = tqdm(eval_dataloader,
                            desc="Post LM training val data ppl")
            with torch.no_grad():
                for step, batch in enumerate(tqdm_bar):
                    batch = tuple(t.to(device) for t in batch)
                    input_ids, mc_token_ids, lm_labels = batch
                    loss = model(input_ids, lm_labels=lm_labels)
                    lengths = mc_token_ids.to('cpu').numpy()
                    total_loss += loss.item() * np.sum(lengths)
                    total_length += np.sum(lengths)

            print(total_loss / total_length)  # length-weighted average LM loss on validation data

            print("=" * 80 + '\n')
    # Save a trained model
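    # Hedged sketch: the original example is truncated here. Saving would mirror
    # the pattern used in the next example (weights as a state_dict, config as JSON).
    if args.do_train:
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(),
                   os.path.join(args.output_dir, WEIGHTS_NAME))
        with open(os.path.join(args.output_dir, CONFIG_NAME), 'w') as f:
            f.write(model_to_save.config.to_json_string())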

Code example #5
def main():
    # Pre-train model: eval_ppl = 104.29582476475977
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name',
                        type=str,
                        default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions "
                             "and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    # args = parser.parse_args()
    # Note: the second --dataset below overrides the first (argparse keeps the last value)
    args = parser.parse_args([  #'--do_train',
        '--do_eval', '--dataset=../data/convai2/train_both_original.txt',
        '--dataset=data/convai2/convai2_data.models',
        '--output_dir=./language-quality-subreward/gpt_output/'
    ])
    print(args)

    # This commented code was used for parsing and pickling data from the original data file.
    '''
    data = Parser(persona_limit=None, set_relation=None)
    print('Parsing...')
    data.parse(args.dataset)
    file_utils.save_model('data/convai2', data, '.models', 'convai2_data')
    '''
    data = file_utils.read_model('', args.dataset, '')
    data = list(chain(*data.conversation))
    #data = data[: 10]
    train_data_org = data[:int(0.9 * len(data))]
    eval_data_org = data[int(0.9 * len(data)):]
    del data
    print('')

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # Loading the tokenizer this way also adds the new `special tokens` and their embeddings
    # These new embeddings will be fine-tuned on the ConvAI2 data below
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, cache_dir="./cache/", special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTLMHeadModel.from_pretrained(
        args.model_name,
        cache_dir="./cache/",
        num_special_tokens=len(special_tokens))
    model.to(device)
    '''
    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    '''
    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = train_data_org
    eval_dataset = eval_data_org

    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(sent[:max_length]) + 2
                       for dataset in encoded_datasets for sent in dataset)
    input_length = min(input_length,
                       model.config.n_positions)  # max input size for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]
    num_train_optimization_steps = len(
        train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = (loss.item() if exp_average_loss is None
                                    else 0.7 * exp_average_loss + 0.3 * loss.item())
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Only save the model itself, not a (Distributed)DataParallel wrapper
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)

        # Yue: save the config:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model that you have fine-tuned
        '''
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)
        '''

    if args.do_eval:
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        config = OpenAIGPTConfig(output_config_file)

        # Load a trained model that you have fine-tuned
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        model_state_dict = torch.load(output_model_file, map_location=device)
        model = OpenAIGPTLMHeadModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

        model.eval()

        eval_ppl = 0
        nb_eval_steps, nb_eval_examples = 0, 0
        with torch.no_grad():  # inference only
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, lm_labels = batch
                loss = model(input_ids, lm_labels=lm_labels)
                eval_ppl += math.exp(loss.item())
                nb_eval_examples += input_ids.size(0)
                nb_eval_steps += 1
        # note: this is the mean of per-batch exp(loss), not exp(mean loss)
        eval_ppl = eval_ppl / nb_eval_steps
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_ppl': eval_ppl, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
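
        # Hedged sketch (not in the original): greedy decoding with the fine-tuned
        # LM; `prompt` is a hypothetical example. Without lm_labels the model
        # returns logits instead of a loss.
        prompt = "hello , how are you ?"
        generated = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(prompt))
        with torch.no_grad():
            for _ in range(20):  # generate up to 20 tokens
                inp = torch.tensor([generated]).to(device)
                logits = model(inp)
                next_id = int(torch.argmax(logits[0, -1]))
                generated.append(next_id)
        print(tokenizer.convert_ids_to_tokens(generated))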