def __init__(self, model_path, generation_type, use_finetuned=True):
        self.model_path = model_path
        self.batch_size = int(args["--batch-size"])

        self.tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

        self.MAX_LEN = {
            GENERATION_TYPE_SMALL: 20,
            GENERATION_TYPE_LARGE: 500
        }[generation_type]
        logger.info(
            f"Using {generation_type} for decoding, MAX_LEN={self.MAX_LEN}")
        if use_finetuned:
            logger.info("Using a finetuned model")
            self.config = GPT2Config.from_pretrained(self.model_path)
            model = GPT2LMHeadModel.from_pretrained(self.model_path)
            with open(f"{self.model_path}/special_tokens_map.json", "r") as f:
                special_tokens = json.load(f)
            self.tokenizer.add_special_tokens(special_tokens)
        else:
            logger.info("NOT using a finetuned model")
            model = GPT2LMHeadModel(config=GPT2Config.from_pretrained(
                pretrained_model_name_or_path=self.model_path))
        self.model = model.cuda()
        self.model.eval()
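A minimal usage sketch for a generator initialized like this; the class name `Gpt2Generator`, the model path, and the prompt are illustrative assumptions, and the call relies on the standard `model.generate` API:

import torch

generator = Gpt2Generator("path/to/finetuned-model", GENERATION_TYPE_SMALL)  # class name assumed
input_ids = generator.tokenizer.encode("Once upon a time", return_tensors="pt").cuda()
with torch.no_grad():
    output_ids = generator.model.generate(input_ids, max_length=generator.MAX_LEN)
print(generator.tokenizer.decode(output_ids[0], skip_special_tokens=True))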
Example 2
    def __init__(self, config, dataset):
        super(GPT2Seq, self).__init__(config, dataset)
        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = GPT2TokenizerFast.from_pretrained(
            self.pretrained_model_path, pad_token='[PAD]')

        self.configuration = GPT2Config.from_pretrained(
            self.pretrained_model_path, pad_token_id=self.padding_token_idx)

        self.model = GPT2LMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.model.resize_token_embeddings(len(self.tokenizer))

        if config['task_type'] == "summarization":
            self.task_text = "TL;DR:"
        elif config['task_type'] == "translation":
            self.task_text = "story:"
        elif config['task_type'] == "multi_dialog":
            self.task_text = "question:"
        else:
            raise NotImplementedError(
                "Only summarization, translation and multi_dialog are supported.")

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
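A rough sketch of how the task prefix and padded tokenizer above might be turned into model inputs; the variable names, prompt text, and maximum length are assumptions, not code from this project:

# Hypothetical encoding sketch using the tokenizer/model built in __init__
src_text = "A long article to be summarized ..."
text = src_text + " TL;DR:"                      # task_text prefix for summarization
batch = tokenizer(text, padding="max_length", truncation=True,
                  max_length=512, return_tensors="pt")
logits = model(input_ids=batch["input_ids"],
               attention_mask=batch["attention_mask"])[0]   # (batch, seq_len, vocab)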
Example 3
    def __init__(self, hparams):
        super().__init__()

        self.hparams = hparams
        self.d = None
        self.tokenizer = None

        # hotfixes
        if 'unfreeze' not in hparams:
            self.hparams.unfreeze = False
        if 'lang' not in hparams:
            self.hparams.lang = 'nld'

        autofix_paths(self.hparams)

        # GPT with LM head and correct embedding size
        with open(Path('data') / self.hparams.lang / 'config.json') as f:
            cfg = json.load(f)

        if self.hparams.unfreeze:
            self.n_unfreeze = 0
            if self.hparams.resume_from_checkpoint is not None:
                print('Resuming from checkpoint: unfreezing all layers')
                self.n_unfreeze = None

        config = GPT2Config.from_pretrained(self.hparams.pretrained_path,
                                            **cfg)
        if self.hparams.unfreeze and self.n_unfreeze is not None:
            config.torchscript = True
        self.m = GPT2LMHeadModel.from_pretrained(self.hparams.pretrained_path,
                                                 config=config)

        # Resize vocab
        self.m.resize_token_embeddings(self.hparams.vocab_size)
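The constructor tracks `n_unfreeze` but does not apply it here; a minimal sketch of how partial freezing could be layered on top (an assumption about intent, using the fact that `GPT2LMHeadModel` exposes its blocks as `transformer.h`):

def apply_unfreeze(model, n_unfreeze):
    # Freeze all weights, then unfreeze the last n transformer blocks.
    # e.g. apply_unfreeze(self.m, self.n_unfreeze)
    if n_unfreeze is None:          # None is treated above as "train everything"
        return
    for param in model.parameters():
        param.requires_grad = False
    if n_unfreeze > 0:
        for block in model.transformer.h[-n_unfreeze:]:
            for param in block.parameters():
                param.requires_grad = True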
Example 4
def load_model(train_steps, num_warmup_steps):
    try:  # try to load finetuned model at local.
        tokenizer = load_tokenizer()
        config = GPT2Config.from_pretrained(configs.model_path,
                                            return_dict=False)
        model = TFGPT2LMHeadModel.from_pretrained(configs.model_path,
                                                  return_dict=False)
        print("model loaded from local!")
    except Exception as e:
        tokenizer = BertTokenizer.from_pretrained(
            "mymusise/gpt2-medium-chinese")
        model = TFGPT2LMHeadModel.from_pretrained(
            "mymusise/gpt2-medium-chinese", return_dict=False)
        print("model loaded from remote!")

    loss = model.compute_loss
    optimizer = nlp.optimization.create_optimizer(
        5e-5, num_train_steps=train_steps, num_warmup_steps=num_warmup_steps)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

    model.compile(
        optimizer=optimizer,
        loss=[loss, *[None] * model.config.n_layer],
        # metrics=[metric]
    )
    return model
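A hedged usage sketch of the compiled Keras model returned above; `train_dataset`, the step counts, and the epoch count are placeholders, and the dataset format is an assumption:

model = load_model(train_steps=10000, num_warmup_steps=1000)
# train_dataset is assumed to be a tf.data.Dataset yielding (input_ids, labels) batches
model.fit(train_dataset, epochs=1)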
Example 5
 def __init__(self, model_path):
     config = GPT2Config.from_pretrained(model_path)
     config.output_hidden_states=True
     config.output_attentions = True
     self.model = GPT2LMHeadModel.from_pretrained(model_path, config=config)
     self.model.eval()
     self.context = ''
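With `output_hidden_states` and `output_attentions` switched on as above, the extra tensors appear in the forward outputs; a self-contained sketch, assuming a transformers version that supports `return_dict` (the model path is a placeholder):

import torch
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer

model_path = "gpt2"                                   # placeholder
config = GPT2Config.from_pretrained(model_path)
config.output_hidden_states = True
config.output_attentions = True
model = GPT2LMHeadModel.from_pretrained(model_path, config=config).eval()
tokenizer = GPT2Tokenizer.from_pretrained(model_path)

input_ids = tokenizer.encode("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(input_ids, return_dict=True)
hidden_states = outputs.hidden_states   # tuple: embedding output + one tensor per layer
attentions = outputs.attentions         # tuple: one attention map per layer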
Example 6
    def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
        """
        Load a pretrained model by supplying
        * the name of a remote model on s3 ("gpt2" ...)
        * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
        * OR a local path of a model trained via FARM ("some_dir/farm_model")
        :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
        :type pretrained_model_name_or_path: str
        """

        gpt2 = cls()
        if "farm_lm_name" in kwargs:
            gpt2.name = kwargs["farm_lm_name"]
        else:
            gpt2.name = pretrained_model_name_or_path
        # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
        farm_lm_config = Path(
            pretrained_model_name_or_path) / "language_model_config.json"
        if os.path.exists(farm_lm_config):
            # FARM style
            gpt2_config = GPT2Config.from_pretrained(farm_lm_config)
            farm_lm_model = Path(
                pretrained_model_name_or_path) / "language_model.bin"
            gpt2.model = GPT2Model.from_pretrained(farm_lm_model,
                                                   config=gpt2_config,
                                                   **kwargs)
            gpt2.language = gpt2.model.config.language
        else:
            # Pytorch-transformer Style
            gpt2.model = GPT2Model.from_pretrained(
                str(pretrained_model_name_or_path), **kwargs)
            gpt2.language = cls._get_or_infer_language_from_name(
                language, pretrained_model_name_or_path)
        return gpt2
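Typical call sites for this loader (the class name `GPT2` stands for whichever FARM language-model class defines `load`; paths are placeholders):

gpt2_lm = GPT2.load("gpt2")                                          # remote model name
gpt2_lm = GPT2.load("some_dir/farm_model", farm_lm_name="my_gpt2")   # local FARM directory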
Example 7
    def __init__(self, train_dataloader, val_dataloader=None):
        """
        Initialises Trainer by defining model and GPU

        Args:
        train_dataloader: torch.utils.data.DataLoader
            Dataloader to train the model on, obtained from the DataLoader class
        val_dataloader: Optional torch.utils.data.DataLoader
            Dataloader to validate the model on, obtained from the DataLoader class;
            not required if the Trainer is only used for final training
        """

        # Create GPT2 Config
        config = GPT2Config.from_pretrained("gpt2")

        # Load language head model and input default config
        model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)

        # Recreate tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2',
                                                  bos_token='<|startoftext|>',
                                                  eos_token='<|endoftext|>',
                                                  pad_token='<|pad|>')

        # Tell model we have added bos, eos, pad token
        model.resize_token_embeddings(len(tokenizer))

        # Tell pytorch to run this model on the GPU.
        device = torch.device("cuda")
        model.cuda()

        self.model = model
        self.device = device
        self.train_dataloader = train_dataloader
        self.val_dataloader = val_dataloader
Example 8
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--config_path",
                        default="../../models/gpt2/gpt2-config.json",
                        type=str,
                        required=False)
    parser.add_argument("--model_path",
                        default="../../models/gpt2/gpt2-pytorch_model.bin",
                        type=str,
                        required=False)
    parser.add_argument("--vocab_path",
                        default="../../models/gpt2/gpt2-vocab.json",
                        type=str,
                        required=False)
    parser.add_argument("--merges_path",
                        default="../../models/gpt2/gpt2-merges.txt",
                        type=str,
                        required=False)
    parser.add_argument(
        "--sentence",
        default="In this article, I am excited to take you through",
        type=str,
        required=False)
    args = parser.parse_args()

    config = GPT2Config.from_pretrained(args.config_path)
    model = GPT2LMHeadModel.from_pretrained(args.model_path, config=config)
    tokenizer = GPT2Tokenizer(args.vocab_path, args.merges_path)
    # logging.basicConfig(filename="default.txt", level=logging.DEBUG, filemode='w')
    # gpt2_generate_greedy(model, tokenizer, sentence=sys.argv[1])
    gpt2_generate_beam_search(model, tokenizer, sentence=args.sentence)
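`gpt2_generate_beam_search` is defined elsewhere in that project; for comparison, a rough equivalent using the built-in `generate` API, which could stand in for the last line of main() (beam size and maximum length are placeholders):

input_ids = tokenizer.encode(args.sentence, return_tensors="pt")
output_ids = model.generate(input_ids, num_beams=5, max_length=64,
                            early_stopping=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))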
Example 9
    def __init__(self, config, dataset):
        super(GPT2, self).__init__(config, dataset)

        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            self.pretrained_model_path,
            bos_token=dataset.sos_token,
            eos_token=dataset.eos_token,
            pad_token=dataset.padding_token)

        self.sos_token = self.tokenizer.bos_token
        self.eos_token = self.tokenizer.eos_token
        self.sos_token_idx = self.tokenizer.bos_token_id
        self.eos_token_idx = self.tokenizer.eos_token_id
        self.padding_token_idx = self.tokenizer.pad_token_id
        self.max_seq_length = config['max_seq_length']

        self.configuration = GPT2Config.from_pretrained(
            self.pretrained_model_path,
            bos_token_id=self.sos_token_idx,
            eos_token_id=self.eos_token_idx,
            pad_token_id=self.padding_token_idx)

        self.decoder = GPT2LMHeadModel.from_pretrained(
            self.pretrained_model_path, config=self.configuration)
        self.decoder.resize_token_embeddings(len(self.tokenizer))

        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
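To illustrate how a `CrossEntropyLoss` built with `ignore_index` and `reduction='none'` is typically applied to decoder logits, a small self-contained sketch (shapes and names are illustrative only):

import torch
import torch.nn as nn

pad_idx = 0
loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx, reduction='none')

logits = torch.randn(2, 7, 100)          # (batch, seq_len, vocab_size)
targets = torch.randint(1, 100, (2, 7))  # (batch, seq_len)
targets[0, 5:] = pad_idx                 # pretend the first sequence is padded

# Shift so that position t predicts token t+1, as in language modelling
shift_logits = logits[:, :-1, :].contiguous()
shift_targets = targets[:, 1:].contiguous()

token_loss = loss_fn(shift_logits.view(-1, shift_logits.size(-1)),
                     shift_targets.view(-1)).view(shift_targets.size())
# Average over the non-padding tokens of each sequence
seq_loss = token_loss.sum(dim=1) / (shift_targets != pad_idx).sum(dim=1)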
Example 10
def load_model(checkpoint_dir):
    ckpt_dir = Path(checkpoint_dir)
    config = GPT2Config.from_pretrained(ckpt_dir / "config.json")
    tokenizer = GPT2Tokenizer.from_pretrained(str(ckpt_dir))
    model = GPT2TANDAModel(config)

    model.load_state_dict(torch.load(ckpt_dir / "pytorch_model.bin"))
    model.eval()
    return model, tokenizer
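A typical call site for this helper; the checkpoint directory and prompt are placeholders:

model, tokenizer = load_model("checkpoints/tanda_gpt2")   # directory is a placeholder
input_ids = tokenizer.encode("question ... answer ...", return_tensors="pt")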
Example 11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hidden_layer_num', type=int,
                        help="Number 0..48 of the layer to get hidden states from")
    parser.add_argument('--batch_size', type=int, default=32)

    args = parser.parse_args()

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    config = GPT2Config.from_pretrained('gpt2-medium',
                                        output_hidden_states=True)
    gpt2 = GPT2Model.from_pretrained('gpt2-medium', config=config).cuda()
    logging.getLogger("transformers.tokenization_utils").setLevel(
        logging.ERROR)

    for subsample in ["train", "test"]:
        if not os.path.isdir(subsample):
            os.mkdir(subsample)

        df = pd.read_csv('{}.csv'.format(subsample))
        if os.path.isfile(f'{subsample}_gpt2.pkl'):
            print("Loading token ids...", file=sys.stderr)
            tokens = joblib.load(f'{subsample}_gpt2.pkl')
        else:
            print("Transforming texts to token ids...", file=sys.stderr)
            tokens = [tokenizer.encode(x) for x in tqdm(df.texts)]
            joblib.dump(tokens, f'{subsample}_gpt2.pkl')
        dataset = DiscourseDataset(tokens, pad_token_id=0, max_len=config.n_positions)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size)
        gpt2.eval()
        mean_results, max_results = list(),  list()
        with torch.no_grad():
            for num, (token_ids, attention_ids) in enumerate(tqdm(dataloader), 1):
                # Inputs must live on the same device as the model (moved to GPU above)
                token_ids = token_ids.cuda()
                attention_ids = attention_ids.cuda()
                _, _, hidden_states = gpt2(token_ids,
                                           attention_mask=attention_ids)
                hidden_states_cpu = [x.cpu().numpy() for x in hidden_states]
                del hidden_states
                gc.collect()

                output = hidden_states_cpu[args.hidden_layer_num]
                del hidden_states_cpu

                sentence_lens = attention_ids.sum(1).cpu().numpy()

                output_zero_padding = output.transpose([2, 0, 1]) * attention_ids.cpu().numpy()
                output_zero_padding = output_zero_padding.transpose([1, 2, 0])

                mean_result = (output_zero_padding.sum(1).T / sentence_lens).T
                max_result = np.array([matrix[:length].max(0) for matrix, length in zip(output_zero_padding, sentence_lens)])

                mean_results.append(mean_result)
                max_results.append(max_result)

                torch.cuda.empty_cache()

        np.save(f'{subsample}/gpt2_mean_embeddings_layer_{args.hidden_layer_num}', np.vstack(mean_results))
        np.save(f'{subsample}/gpt2_max_embeddings_layer_{args.hidden_layer_num}', np.vstack(max_results))
Example 12
    def __init__(self):
        self.batch_size = int(args["--batch-size"])

        self.config = GPT2Config.from_pretrained("gpt2-medium")

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

        model = GPT2LMHeadModel.from_pretrained('gpt2-medium')
        self.model = model.cuda()
        self.model.eval()
Example 13
    def __init__(self,
                 max_output_length=25,
                 max_input_length=300,
                 device='cpu',
                 tokenizer_type='gpt2',
                 bpe_model="",
                 starter_model=None):
        if tokenizer_type == "gpt2":
            self.tokenizer = utils_tokenizer.GPT2Tokenizer()
            config = GPT2Config.from_pretrained("gpt2")

        elif tokenizer_type == "bpecap":
            self.tokenizer = utils_tokenizer.BPETokenizer(bpe_model)
            config = GPT2Config.from_dict({
                "finetuning_task": None,
                "initializer_range": 0.02,
                "layer_norm_epsilon": 1e-05,
                "n_ctx": 1024,
                "n_embd": 768,
                "n_head": 12,
                "n_layer": 12,
                "n_positions": 1024,
                "num_labels": 1,
                "resid_pdrop": 0.1,
                "use_bfloat16": False,
                "vocab_size": self.tokenizer.vocab_size
            })
        else:
            print("Tokenizer unrecognized. Should be gpt2 or bpecap.")
            exit()

        self.model = GPT2LMHeadModel(config)

        self.model.to(device)
        self.device = device
        if starter_model is not None:
            self.reload(starter_model)

        self.max_output_length = max_output_length
        self.max_input_length = max_input_length

        self.model.train()
        self.mode = "train"
Example 14
 def __init__(self, model_name: str) -> None:
     super().__init__()
     config = GPT2Config.from_pretrained(model_name)
     self.input_dim = config.hidden_size
     self.output_dim = config.vocab_size
     # TODO(mattg): It's possible that we could use some kind of cache like we have in
     # allennlp.modules.token_embedders.bert_token_embedder.PretrainedBertModel.  That way, we
     # would only load the GPT2 weights once.  Though, it's not clear how to do that here, as we
     # need to load `GPT2LMHeadModel`, not just `GPT2Model`...
     gpt2_model = GPT2LMHeadModel.from_pretrained(model_name)
     self.gpt2_lm_head = gpt2_model.lm_head
Example 15
 def test_train_with_configs(self):
     MODEL_ID = "sshleifer/tiny-gpt2"
     config = GPT2Config.from_pretrained(MODEL_ID)
     benchmark_args = PyTorchBenchmarkArguments(models=[MODEL_ID],
                                                training=True,
                                                no_inference=True,
                                                sequence_lengths=[8],
                                                batch_sizes=[1])
     benchmark = PyTorchBenchmark(benchmark_args, configs=[config])
     results = benchmark.run()
     self.check_results_dict_not_empty(results.time_train_result)
     self.check_results_dict_not_empty(results.memory_train_result)
Example 16
    def load(self, checkpoint_path):
        """
        Load the model, etc
        """
        logging.info("Loading model")
        config = GPT2Config.from_pretrained(checkpoint_path)
        model = GPT2SegmentedModel.from_pretrained(
            checkpoint_path, config=config, cache_dir=self.args.cache_dir)

        if torch.cuda.is_available():
            model = model.cuda()

        self.model = StaticDataParallel(model)
Example 17
 def __init__(self,
              vilbert,
              gpt2_tokenizer,
              gpt2_embed_dim=768,
              config=None):
     nn.Module.__init__(self)
     self.gpt2_tokenizer = gpt2_tokenizer
     self.gpt2_embed_dim = gpt2_embed_dim
     self.embed = torch.nn.Linear(config.bi_hidden_size,
                                  self.gpt2_embed_dim)
     self.gpt2_config = GPT2Config.from_pretrained('gpt2')
     self.gpt2_model = GPT2LMHeadModel.from_pretrained(
         'gpt2', from_tf=False, config=self.gpt2_config)
     self.vilbert_model = vilbert
Example 18
    def __init__(self, model_path):
        self.model_path = model_path
        self.batch_size = int(args["--batch-size"])
        self.config = GPT2Config.from_pretrained(self.model_path)
        with open(f"{self.model_path}/special_tokens_map.json", "r") as f:
            special_tokens = json.load(f)
            #  special_tokens["pad_token"] = Gpt2Generator.PAD_TOKEN

        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.tokenizer.add_special_tokens(special_tokens)

        model = GPT2LMHeadModel.from_pretrained(self.model_path)
        self.model = model.cuda()
        self.model.eval()
Example 19
    def __init__(self, tokenizer, gpt2_config, segment=True):
        config = GPT2Config.from_pretrained(gpt2_config)
        super(GPT2Summ, self).__init__(config)
        self.transformer = GPT2Model.from_pretrained(gpt2_config)
        self.transformer.resize_token_embeddings(len(tokenizer))
        self.user_id = [
            tokenizer.convert_tokens_to_ids('<user1>'),
            tokenizer.convert_tokens_to_ids('<user2>')
        ]
        self.know_id = tokenizer.convert_tokens_to_ids('<knowledge>')
        self.segment = segment

        self.lm_head = nn.Linear(config.n_embd, len(tokenizer), bias=False)
        self.config.vocab_size = len(tokenizer)
        self.tie_weights()
Example 20
def build_model(args):
    if args.pretrained_path == '':
        config = GPT2Config.from_json_file(args.model_config)
        model = GPT2LMHeadModel(config)
        tokenizer = BertTokenizerFast(args.vocab)
        # XXX: required; otherwise special tokens inside a string are not tokenized as single tokens
        tokenizer.sanitize_special_tokens()
        info = None
    else:
        config = GPT2Config.from_pretrained(args.pretrained_path)
        model, info = GPT2LMHeadModel.from_pretrained(args.pretrained_path,
                                                      config=config,
                                                      output_loading_info=True)
        tokenizer = BertTokenizerFast.from_pretrained(args.pretrained_path)
    return model, tokenizer, info
Example 21
def test_language_generate_greedy(max_len=64):
    config = GPT2Config.from_pretrained("./models/gpt2/config.json")
    model = GPT2LMHeadModel.from_pretrained("./models/gpt2/pytorch_model.bin",
                                            config=config)
    token_list = [3, 5, 2]
    input = torch.tensor(token_list).unsqueeze(0)
    model.eval()
    with torch.no_grad():
        for i in range(max_len):
            output = model(input)[0]
            output_id = int(output.max(2)[1][0, -1])
            token_list.append(output_id)
            input = torch.tensor(token_list[-24:]).unsqueeze(0)
            if output_id == 0:
                break
    print(token_list)
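The manual loop above amounts to greedy decoding; a shorter, roughly equivalent sketch with the built-in API, assuming token id 0 plays the role of the end-of-sequence id as in the loop:

import torch

input_ids = torch.tensor([[3, 5, 2]])
with torch.no_grad():
    output_ids = model.generate(input_ids, max_length=64,
                                do_sample=False, eos_token_id=0)
print(output_ids[0].tolist())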
Example 22
    def _initialize(self):
        """
        Load the dataset, model, etc
        """
        cache_dir = self.args.cache_dir
        model_name = self.args.model.model_name

        logging.info("Loading dataset")
        self.dataset = StoriumDataset("train", "gpt2", cache_dir=cache_dir)
        self.dataset.load(self.args.data_dir)

        # By default the config outputs "past", but that makes our chunked
        # scattering (needed when batching based on tokens, rather than
        # examples) fail since the huggingface/transformers package stacks the
        # outputs on dim 0, which is normally the batch dimension. This leads
        # to errors like:
        #
        # RuntimeError: Gather got an input of invalid size: got [2, 5, 12,
        #   411, 64], but expected [2, 4, 12, 411, 64] (gather at
        #   /pytorch/torch/csrc/cuda/comm.cpp:226)
        #
        # During training we only care about the loss, so just disable all
        # additional outputs.
        config = GPT2Config.from_pretrained(model_name, cache_dir=cache_dir)
        config.output_hidden_states = False
        config.output_attentions = False
        config.output_past = False

        model = GPT2SegmentedModel.from_pretrained(model_name,
                                                   config=config,
                                                   cache_dir=cache_dir)

        tokenizer = self.dataset.get_tokenizer()
        model.resize_token_embeddings(len(tokenizer))

        max_steps = self.args.optim.max_steps
        optimizer = AdamW(model.parameters(), lr=self.args.optim.lr)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_training_steps=max_steps,
            num_warmup_steps=self.args.optim.warmup_steps,
        )

        # Track the modules
        self.modules["model"] = model
        self.modules["optimizer"] = optimizer
        self.modules["scheduler"] = scheduler
Example 23
def main():
    device = torch.device("cuda")
    args = parser.parse_args()
    args.n_gpu = torch.cuda.device_count()

    args.device = device

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    config = GPT2Config.from_pretrained(args.model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(args.model_path,
                                              do_lower_case=False)
    if args.block_size <= 0:
        args.block_size = tokenizer.max_len_single_sentence
    args.block_size = min(args.block_size, tokenizer.max_len_single_sentence)
    model = GPT2LMHeadModel.from_pretrained(args.model_path, config=config)
    model.to(args.device)

    if args.local_rank == 0:
        torch.distributed.barrier()

    logger.info("Training/evaluation parameters %s", args)

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()

    train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False)

    if args.local_rank == 0:
        torch.distributed.barrier()

    run_train(args, train_dataset, model, tokenizer)

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
Example 24
    def __init__(self):
        # tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(GPT2_TYPE)

        # special tokens
        self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.tokenizer.add_special_tokens(
            {'additional_special_tokens': dataset_tokens})

        # chess tokens
        self.tokenizer.add_tokens(get_chess_tokens())

        # model
        self.configuration = GPT2Config.from_pretrained(GPT2_TYPE)
        self.model = GPT2LMHeadModel.from_pretrained(
            GPT2_TYPE, config=self.configuration).cuda()

        self.model.resize_token_embeddings(len(self.tokenizer))
Example 25
def get_bert_config(bert_model_type, output_hidden_states=False):
    if bert_model_type in [
            'bert-base-uncased', 'prod-bert-base-uncased', 'bert-base-cased',
            'bert-large-uncased', 'tune_bert-base-uncased_nsp',
            'bert-large-uncased-whole-word-masking',
            'bert-large-uncased-whole-word-masking-finetuned-squad'
    ]:
        bert_config = BertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'roberta-base', 'prod-roberta-base-cased', 'roberta-large',
            'roberta-large-mnli', 'distilroberta-base'
    ]:
        bert_config = RobertaConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['xlnet-base-cased']:
        bert_config = XLNetConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'albert-base-v1', 'albert-large-v1', 'albert-xlarge-v1',
            'albert-xxlarge-v1'
    ]:
        bert_config = AlbertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['gpt2', 'gpt2-medium']:
        bert_config = GPT2Config.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in ['transfo-xl']:
        bert_config = TransfoXLConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    elif bert_model_type in [
            'distilbert-base-uncased',
            'distilbert-base-uncased-distilled-squad'
    ]:
        bert_config = DistilBertConfig.from_pretrained(
            BERT_CONFIG_FILE[bert_model_type])
    else:
        raise ValueError(
            f'`bert_model_type` not understood: {bert_model_type}')

    bert_config.output_hidden_states = output_hidden_states
    return bert_config
Example 26
def get_model(extra_args):
    if extra_args.use_english_weights:
        model = GPT2LMHeadModel.from_pretrained(extra_args.identifier)
    else:
        model = GPT2LMHeadModel(
            GPT2Config.from_pretrained(extra_args.identifier))

    wte = model.transformer.wte
    if extra_args.wte_path is not None:
        wte.weight = nn.Parameter(torch.load(extra_args.wte_path))
    else:
        mean, std = wte.weight.mean().item(), wte.weight.std().item()
        wte.weight = nn.Parameter(
            torch.normal(mean, std, size=wte.weight.size()))

    # tie input and output embeddings
    model.lm_head.weight = model.transformer.wte.weight
    model.tie_weights()

    return model
Example 27
def load_finetuned_gpt2(weight_path: str, zero_shot: bool = False):
    # Load the model
    if zero_shot:
        model = GPT2LMHeadModel.from_pretrained('gpt2')
    else:
        config = GPT2Config.from_pretrained('gpt2')
        model = GPT2LMHeadModel(config)
        print('{} defined'.format(model.__class__.__name__, ))

        model_weights = torch.load(
            weight_path,
            map_location=lambda storage, loc: storage)['state_dict']
        corrected_model_weights = {}
        for k, v in model_weights.items():
            corrected_model_weights[k.replace('model.', '')] = v
        print('Loaded model weights from {}'.format(weight_path))

        model.load_state_dict(corrected_model_weights, strict=True)
        print('{} loaded with checkpoint weights and sent to GPU!'.format(
            model.__class__.__name__, ))

    return model
Example 28
def load_dialogpt_zeroshot(weight_path: str):
    # Create object
    config = GPT2Config.from_pretrained('gpt2')
    model = GPT2LMHeadModel(config)
    print('{} defined'.format(model.__class__.__name__, ))

    # Obtain weights from file
    model_weights = torch.load(weight_path,
                               map_location=lambda storage, loc: storage)

    # Load model weights
    model_weights = {
        k.replace('module.', ''): v
        for k, v in model_weights.items()
    }
    if 'lm_head.decoder.weight' in model_weights:
        model_weights['lm_head.weight'] = model_weights.pop(
            'lm_head.decoder.weight'
        )  # Compatibility with newer versions of `transformers` package

    model.load_state_dict(model_weights, strict=True)
    print('Model loaded from {}'.format(weight_path))

    return model
Example 29
epochs = 1
learning_rate = 1e-5
# warmup_steps = 1e2
epsilon = 1e-8

# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

SAVE_PATH = "/mnt/nfs/work1/llcao/zonghaiyao/LM/"

# I'm not really doing anything with the config, but here it is
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained(
    'gpt2', pad_token='<|endoftext|>')  #gpt2-medium

# instantiate the model
# model = rerankGPT2LMHeadModel_stage1_all_tokens_stage2_all_tokens.from_pretrained("/mnt/nfs/work1/llcao/zonghaiyao/LM/results/stage1_all_tokens_start_after_finetuning/stage1_all_tokens_stage2_all_tokens/lr5e4_add_scrach_bs24/200000",
#                                                                              config=configuration,
#                                                                              MAX_LEN = MAX_LEN,
#                                                                              CAN_NUM = CAN_NUM,
#                                                                              num_of_rerank = num_of_rerank)
model = rerankGPT2LMHeadModel_stage1_all_tokens_stage2_all_tokens.from_pretrained(
    "gpt2",
    config=configuration,
    MAX_LEN=MAX_LEN,
    CAN_NUM=CAN_NUM,
    num_of_rerank=num_of_rerank)
Example 30
def fine_tune_gpt2():
  # I'm not really doing anything with the config, but here it is
  configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

  # instantiate the model
  model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

  # this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
  # otherwise the tokenizer and model tensors won't match up
  model.resize_token_embeddings(len(tokenizer))

  # Tell pytorch to run this model on the GPU.
  device = torch.device("cuda")
  model.cuda()

  # Set the seed value all over the place to make this reproducible.
  seed_val = 42

  random.seed(seed_val)
  np.random.seed(seed_val)
  torch.manual_seed(seed_val)
  torch.cuda.manual_seed_all(seed_val)



  # some parameters I cooked up that work reasonably well

  epochs = 5
  learning_rate = 5e-4
  warmup_steps = 1e2
  epsilon = 1e-8

  # this produces sample output every 100 steps
  sample_every = 100

  # Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
  optimizer = AdamW(model.parameters(),
                    lr = learning_rate,
                    eps = epsilon
                  )

  # Total number of training steps is [number of batches] x [number of epochs]. 
  # (Note that this is not the same as the number of training samples).
  total_steps = len(train_dataloader) * epochs

  # Create the learning rate scheduler.
  # This changes the learning rate as the training loop progresses
  scheduler = get_linear_schedule_with_warmup(optimizer, 
                                              num_warmup_steps = warmup_steps, 
                                              num_training_steps = total_steps)


  def format_time(elapsed):
      return str(datetime.timedelta(seconds=int(round((elapsed)))))



  total_t0 = time.time()

  training_stats = []

  model = model.to(device)

  for epoch_i in range(0, epochs):

      # ========================================
      #               Training
      # ========================================

      print("")
      print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
      print('Training...')

      t0 = time.time()

      total_train_loss = 0

      model.train()

      for step, batch in enumerate(train_dataloader):

          b_input_ids = batch[0].to(device)
          b_labels = batch[0].to(device)
          b_masks = batch[1].to(device)

          model.zero_grad()        

          outputs = model(  b_input_ids,
                            labels=b_labels, 
                            attention_mask = b_masks,
                            token_type_ids=None
                          )

          loss = outputs[0]  

          batch_loss = loss.item()
          total_train_loss += batch_loss

          # Get sample every x batches.
          if step % sample_every == 0 and not step == 0:

              elapsed = format_time(time.time() - t0)
              print('  Batch {:>5,}  of  {:>5,}. Loss: {:>5,}.   Elapsed: {:}.'.format(step, len(train_dataloader), batch_loss, elapsed))

              model.eval()

              sample_outputs = model.generate(
                                      bos_token_id=random.randint(1,30000),
                                      do_sample=True,   
                                      top_k=50, 
                                      max_length = 200,
                                      top_p=0.95, 
                                      num_return_sequences=1
                                  )
              for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))
              
              model.train()

          loss.backward()

          optimizer.step()

          scheduler.step()

      # Calculate the average loss over all of the batches.
      avg_train_loss = total_train_loss / len(train_dataloader)       
      
      # Measure how long this epoch took.
      training_time = format_time(time.time() - t0)

      print("")
      print("  Average training loss: {0:.2f}".format(avg_train_loss))
      print("  Training epoch took: {:}".format(training_time))
          
      # ========================================
      #               Validation
      # ========================================

      print("")
      print("Running Validation...")

      t0 = time.time()

      model.eval()

      total_eval_loss = 0
      nb_eval_steps = 0

      # Evaluate data for one epoch
      for batch in validation_dataloader:
          
          b_input_ids = batch[0].to(device)
          b_labels = batch[0].to(device)
          b_masks = batch[1].to(device)
          
          with torch.no_grad():        

              outputs  = model(b_input_ids, 
  #                            token_type_ids=None, 
                              attention_mask = b_masks,
                              labels=b_labels)
            
              loss = outputs[0]  
              
          batch_loss = loss.item()
          total_eval_loss += batch_loss        

      avg_val_loss = total_eval_loss / len(validation_dataloader)
      
      validation_time = format_time(time.time() - t0)    

      print("  Validation Loss: {0:.2f}".format(avg_val_loss))
      print("  Validation took: {:}".format(validation_time))

      # Record all statistics from this epoch.
      training_stats.append(
          {
              'epoch': epoch_i + 1,
              'Training Loss': avg_train_loss,
              'Valid. Loss': avg_val_loss,
              'Training Time': training_time,
              'Validation Time': validation_time
          }
      )

  print("")
  print("Training complete!")
  print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))