def __init__(self, state_dim, act_dim, hidden_size, max_length=None,
             max_ep_len=4096, action_tanh=True, **kwargs):
    super().__init__(state_dim, act_dim, max_length=max_length)

    self.hidden_size = hidden_size
    config = transformers.GPT2Config(
        vocab_size=1,  # doesn't matter -- we don't use the vocab
        n_embd=hidden_size,
        **kwargs)

    # note: the only difference between this GPT2Model and the default Huggingface version
    # is that the positional embeddings are removed (since we'll add those ourselves)
    self.transformer = GPT2Model(config)

    self.embed_timestep = nn.Embedding(max_ep_len, hidden_size)
    self.embed_return = torch.nn.Linear(1, hidden_size)
    self.embed_state = torch.nn.Linear(self.state_dim, hidden_size)
    self.embed_action = torch.nn.Linear(self.act_dim, hidden_size)

    self.embed_ln = nn.LayerNorm(hidden_size)

    # note: we don't predict states or returns for the paper
    self.predict_state = torch.nn.Linear(hidden_size, self.state_dim)
    self.predict_action = nn.Sequential(
        *([nn.Linear(hidden_size, self.act_dim)] +
          ([nn.Tanh()] if action_tanh else [])))
    self.predict_return = torch.nn.Linear(hidden_size, 1)
def get_model(tokenizer, resume=False):
    if cfg('random_init'):
        # load randomly initialized model instead of pretrained
        model_config = transformers.GPT2Config()
        model = transformers.GPT2LMHeadModel(model_config)
    elif resume:
        # resume from previous best
        model = AutoModelForCausalLM.from_pretrained(
            cfg('out_path') + cfg('name'))
    else:
        # load pretrained model
        model = AutoModelForCausalLM.from_pretrained(cfg('model'))
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(cfg('device'))
    return model
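get_model resizes the embedding matrix because the tokenizer may have grown (for example, extra special tokens). A minimal, self-contained sketch of that add-tokens-then-resize pattern, assuming a stock GPT-2 tokenizer plus one added pad token (none of this is part of the snippet above):

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "<|pad|>"})  # grows the vocab by one

model = transformers.GPT2LMHeadModel(transformers.GPT2Config())
model.resize_token_embeddings(len(tokenizer))  # embedding rows now match len(tokenizer)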
def load_igpt(model_size, model_path, cluster_path, n_px):
    """Load pretrained model and clusters."""
    if model_size == "l":
        n_embd, n_head, n_layer = 1536, 16, 48
    elif model_size == "m":
        n_embd, n_head, n_layer = 1024, 8, 36
    elif model_size == "s":
        n_embd, n_head, n_layer = 512, 8, 24

    clusters = np.load(cluster_path)  # get color clusters
    vocab_size = len(clusters) + 1  # add one for start of sentence token
    config = transformers.GPT2Config(vocab_size=vocab_size,
                                     n_ctx=n_px * n_px,
                                     n_positions=n_px * n_px,
                                     n_embd=n_embd,
                                     n_layer=n_layer,
                                     n_head=n_head)
    model = ImageGPT2LMHeadModel.from_pretrained(model_path,
                                                 from_tf=True,
                                                 config=config)
    return model, torch.from_numpy(clusters)
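The color clusters returned here are typically used to map RGB pixels onto iGPT's discrete vocabulary by nearest-cluster assignment. A hedged sketch of that step, assuming the image and the clusters share the [-1, 1] pixel scaling used in the public iGPT release; quantize_pixels is an illustrative helper, not part of the code above:

import numpy as np
import torch

def quantize_pixels(image, clusters):
    """image: (n_px, n_px, 3) array scaled to [-1, 1]; clusters: (n_clusters, 3)."""
    pixels = image.reshape(-1, 3)                                   # (n_px * n_px, 3)
    dists = np.square(pixels[:, None, :] - clusters[None, :, :]).sum(axis=-1)
    return torch.from_numpy(np.argmin(dists, axis=1)).long()        # one token id per pixel

# Example with random data standing in for a real image and the real cluster file.
tokens = quantize_pixels(np.random.uniform(-1, 1, (32, 32, 3)),
                         np.random.uniform(-1, 1, (512, 3)))
print(tokens.shape)  # torch.Size([1024])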
def __init__(self, model_name, model_size, models_dir, color_clusters_dir,
             n_px, **parent_params):
    """
    Parameters
    ----------
    model_name : str
        A name for this model, used for caching.
    model_size : str
        The size of iGPT used - "s" for small, "m" for medium, or "l" for large.
        The exact parameters are stored in `GPTExtractor.MODELS`.
    models_dir : str
        Path to directory with downloaded model. Make sure the params match
        the downloaded model.
    color_clusters_dir : str
        Path to directory with the downloaded color clusters.
    n_px : int
        The number of pixels used. All publicly available versions of iGPT
        are 32x32.
    parent_params
    """
    super().__init__(model_name, **parent_params)
    self.n_px = n_px
    self.model_size = model_size

    color_clusters_file = "%s/kmeans_centers.npy" % color_clusters_dir
    self.clusters = np.load(color_clusters_file)  # get color clusters

    n_embd, n_head, n_layer = GPTExtractor.MODELS[model_size]  # set model hyperparameters
    self.vocab_size = len(self.clusters) + 1  # add one for start of sentence token
    self.config = transformers.GPT2Config(vocab_size=self.vocab_size,
                                          n_ctx=self.n_px * self.n_px,
                                          n_positions=self.n_px * self.n_px,
                                          n_embd=n_embd,
                                          n_layer=n_layer,
                                          n_head=n_head)
    self.model_path = "%s/%s/model.ckpt-1000000.index" % (models_dir, model_size)
def __init__(self,
             vocab: nnlp.Vocab,
             n_embd: int = 256,
             n_layer: int = 4,
             n_head: int = 4,
             n_position: int = 128,
             n_ctx: int = 128):
    super(GPT2Wrap, self).__init__()
    config = transformers.GPT2Config(vocab_size=len(vocab),
                                     n_embd=n_embd,
                                     n_layer=n_layer,
                                     n_head=n_head,
                                     n_positions=n_position,
                                     n_ctx=n_ctx,
                                     output_hidden_states=True)
    self.gpt2_model = transformers.GPT2LMHeadModel(config)
    self.vocab = vocab
    self.n_vocab = len(vocab)
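Because the config above sets output_hidden_states=True, every forward pass also returns the per-layer activations. A minimal sketch of reading them from a freshly initialized GPT2LMHeadModel with a comparable config (the sizes below are placeholders, not the wrapper's real vocabulary):

import torch
import transformers

config = transformers.GPT2Config(vocab_size=1000, n_embd=256, n_layer=4,
                                 n_head=4, n_positions=128,
                                 output_hidden_states=True)
model = transformers.GPT2LMHeadModel(config)
model.eval()

input_ids = torch.randint(0, config.vocab_size, (1, 16))
with torch.no_grad():
    outputs = model(input_ids)

# hidden_states is a tuple of n_layer + 1 tensors (embedding output plus one
# per transformer block), each of shape (batch, seq_len, n_embd).
print(len(outputs.hidden_states), outputs.hidden_states[-1].shape)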
def build_clf_model(vocab_size, params):
    # Create GPT2 language model configuration
    clf_config = tm.GPT2Config(vocab_size,
                               params["seqlen"],
                               params["n_ctx"],
                               params["embed"],
                               params["layers"],
                               params["heads"],
                               resid_pdrop=params["drop"],
                               embd_pdrop=params["drop"],
                               attn_pdrop=params["drop"])
    # Load pre-trained GPT2 without language model head
    clf_gpt2 = GPT2Classifier(clf_config)
    if params["finetune"]:
        ckpt = tf.train.Checkpoint(net=clf_gpt2)
        ckpt.restore(tf.train.latest_checkpoint(
            params["pretr"])).expect_partial()
    return clf_gpt2
def test(loadtype: LoadType, use_cuda: bool):
    cfg = transformers.GPT2Config()
    model = transformers.GPT2Model(cfg)
    model.eval()
    torch.set_grad_enabled(False)

    test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')
    cfg = model.config

    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = torch.tensor(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=torch.long)

    start_time = time.time()
    for _ in range(10):
        torch_res = model(input_ids)
    end_time = time.time()
    print("\ntorch time consumed: {}".format(end_time - start_time))

    # there are three ways to load a pretrained model.
    if loadtype is LoadType.PYTORCH:
        # 1, from a PyTorch model, which has loaded a pretrained model
        tt_model = turbo_transformers.GPT2Model.from_torch(model, test_device)
    else:
        raise ValueError("LoadType is not supported")

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids)  # sequence_output, pooled_output
    end_time = time.time()
    print("\nturbo time consumed: {}".format(end_time - start_time))

    assert (numpy.max(
        numpy.abs(res[0].cpu().numpy() - torch_res[0].cpu().numpy())) < 0.1)
def __init__(self,
             vocab: nnlp.Vocab,
             n_embd: int = 256,
             n_layer: int = 2,
             n_head: int = 2,
             n_position: int = 128,
             n_ctx: int = 128,
             unk_hard_loss: float = -1.0):
    super(BiGPT2LM, self).__init__()
    config = transformers.GPT2Config(vocab_size=len(vocab),
                                     n_embd=n_embd,
                                     n_layer=n_layer,
                                     n_head=n_head,
                                     n_positions=n_position,
                                     n_ctx=n_ctx,
                                     output_hidden_states=True)
    self.gpt2model_fwd = transformers.GPT2LMHeadModel(config)
    self.gpt2model_rev = transformers.GPT2LMHeadModel(config)
    self.vocab = vocab
    self.unk_hard_loss = unk_hard_loss
def create_model(hparams, dictionary):
    # Config docs: https://huggingface.co/transformers/model_doc/gpt2.html#gpt2config
    model = transformers.GPT2LMHeadModel(
        transformers.GPT2Config(vocab_size=len(dictionary),
                                n_embd=hparams["embedding_dim"],
                                n_layer=hparams["n_layer"],
                                n_head=hparams["n_head"],
                                n_positions=hparams['max_seq_length'],
                                n_ctx=hparams['max_seq_length']))
    if hparams["load_checkpoint"]:
        model.load_state_dict(
            torch.load(hparams["load_checkpoint"],
                       map_location=lambda storage, location: storage))
    if hparams["use_multi_gpu"]:
        assert torch.cuda.device_count() > 1
        print("Using %d GPUs" % torch.cuda.device_count())
        model = torch.nn.DataParallel(model)
    optim = torch.optim.Adam(model.parameters(), lr=hparams["lr"])
    return model, optim
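A hedged sketch of driving the returned (model, optim) pair through one optimization step, using GPT2LMHeadModel's built-in shifted language-modeling loss. The hparams values and stand-in dictionary below are placeholders for illustration, not the project's defaults:

import torch
import transformers

hparams = {"embedding_dim": 128, "n_layer": 2, "n_head": 2,
           "max_seq_length": 64, "lr": 1e-4,
           "load_checkpoint": None, "use_multi_gpu": False}
dictionary = ["<pad>"] + ["tok%d" % i for i in range(255)]  # stand-in vocab

model, optim = create_model(hparams, dictionary)

batch = torch.randint(0, len(dictionary), (4, 32))  # (batch, seq_len) token ids
loss = model(batch, labels=batch).loss              # shifted LM loss computed internally
optim.zero_grad()
loss.backward()
optim.step()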
def __init__(
    self,
    tokenizer_model,
    train_file,
    valid_file,
    test_file,
    from_pretrained=None,
    block_size=1024,
    # [Model config]
    # for small
    n_layer=12,
    n_head=12,
    n_embd=768,
    # for medium -> n_layer=24, n_head=16, n_embd=1024
    # for large  -> n_layer=36, n_head=20, n_embd=1280
    # for XL     -> n_layer=48, n_head=25, n_embd=1600
    # [DataLoader options]
    batch_size=2,
    prefetch_factor=10,
    num_workers=1,
    shuffle_buffer_size=1000,
    lr=1e-4,
    num_warmup_steps=0,
    num_training_steps=None,
):
    super().__init__()

    # Load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_model)
    self._tokenizer = tokenizer

    # Load or initialize model
    if from_pretrained:
        config = transformers.GPT2Config.from_pretrained(from_pretrained)
        model = transformers.GPT2LMHeadModel.from_pretrained(from_pretrained)
    else:
        # Prepare model
        config = transformers.GPT2Config(
            vocab_size=len(tokenizer),
            tokenizer_class=tokenizer.__class__.__name__,
            bos_token_id=tokenizer.bos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            sep_token_id=tokenizer.sep_token_id,
            cls_token_id=tokenizer.cls_token_id,
            unk_token_id=tokenizer.unk_token_id,
            #
            n_layer=n_layer,
            n_head=n_head,
            n_embd=n_embd)
        model = transformers.GPT2LMHeadModel(config)
    self.model = model
    self._config = config
    self._train_file = train_file
    self._valid_file = valid_file
    self._test_file = test_file
    self._batch_size = batch_size
    self._prefetch_factor = prefetch_factor
    self._num_workers = num_workers
    self._shuffle_buffer_size = shuffle_buffer_size
    self._lr = lr
    self._num_warmup_steps = num_warmup_steps
    self._num_training_steps = num_training_steps
    merges_path=MERGES_PATH,
    data_path=DATA_PATH,
    seq_len=seq_len)

split_ratio = [0.9, 0.1]
split_lens = [int(len(dataset) * split_ratio[0]), None]
split_lens[1] = len(dataset) - split_lens[0]
train_set, valid_set = torch.utils.data.random_split(dataset, split_lens)

print("Loading Model...")
config = transformers.GPT2Config(
    vocab_size=261,
    n_positions=seq_len,
    n_ctx=seq_len,
    n_embd=30,
    n_layer=3,
    n_head=3
)
model = transformers.GPT2LMHeadModel(config=config)

print("Training Model...")
writer = SummaryWriter()
training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",
    do_train=True,
    do_eval=True,
    evaluate_during_training=True,
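The TrainingArguments call above is cut off mid-way. As a separate, hedged illustration (not a reconstruction of the missing lines), a minimal Trainer setup around a similarly sized GPT-2 looks roughly like the sketch below; note that evaluate_during_training comes from older transformers releases, and recent versions express the same intent with evaluation_strategy="steps". ToyDataset is an illustrative stand-in, not the project's dataset class:

import torch
import transformers

class ToyDataset(torch.utils.data.Dataset):
    """Tiny stand-in dataset of fixed-length token blocks."""
    def __init__(self, n_items=64, seq_len=32, vocab_size=261):
        self.data = torch.randint(0, vocab_size, (n_items, seq_len))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, idx):
        ids = self.data[idx]
        return {"input_ids": ids, "labels": ids}

config = transformers.GPT2Config(vocab_size=261, n_positions=32,
                                 n_embd=30, n_layer=3, n_head=3)
model = transformers.GPT2LMHeadModel(config)

training_args = transformers.TrainingArguments(
    output_dir="models/gpt2/",           # same output dir as above
    per_device_train_batch_size=8,
    num_train_epochs=1,
    evaluation_strategy="steps",         # newer spelling of evaluate_during_training
    eval_steps=50,
    logging_steps=50,
)
trainer = transformers.Trainer(model=model, args=training_args,
                               train_dataset=ToyDataset(),
                               eval_dataset=ToyDataset(16))
trainer.train()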
def main(
    tokenizer_model,
    save_model_dir,
    train_file,
    valid_file,
    seed=None,
    block_size=1024,
    # [Model config]
    # for small
    n_layer=12,
    n_head=12,
    n_embd=768,
    # for medium -> n_layer=24, n_head=16, n_embd=1024
    # for large  -> n_layer=36, n_head=20, n_embd=1280
    # for XL     -> n_layer=48, n_head=25, n_embd=1600
    # [DataLoader options]
    batch_size=2,
    prefetch_factor=10,
    num_workers=1,
    shuffle_buffer_size=1000,
    lr=1e-4,
    num_warmup_steps=0,
    num_training_steps=None,
    # options for trainer
    ds_config: str = None,
    **train_options):
    # Set seed
    if seed:
        pl.seed_everything(seed)

    # Load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_model)

    # Prepare model
    config = transformers.GPT2Config(
        vocab_size=len(tokenizer),
        tokenizer_class=tokenizer.__class__.__name__,
        bos_token_id=tokenizer.bos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        sep_token_id=tokenizer.sep_token_id,
        cls_token_id=tokenizer.cls_token_id,
        unk_token_id=tokenizer.unk_token_id,
        #
        n_layer=n_layer,
        n_head=n_head,
        n_embd=n_embd)
    print(config)

    # Load data
    train_dataset = BlockDataset.from_file(
        block_size=config.n_ctx,
        tokenizer=tokenizer,
        filename=train_file,
    )
    valid_dataset = BlockDataset.from_file(
        block_size=config.n_ctx,
        tokenizer=tokenizer,
        filename=valid_file,
    )
    shuffled_train_dataset = torch.utils.data.BufferedShuffleDataset(
        train_dataset,
        buffer_size=shuffle_buffer_size,
    )

    # Build DataLoader
    train_loader = torch.utils.data.DataLoader(
        dataset=shuffled_train_dataset,
        batch_size=batch_size,
        collate_fn=BlockDataset.collate_fn,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )
    valid_loader = torch.utils.data.DataLoader(
        dataset=valid_dataset,
        batch_size=batch_size,
        collate_fn=BlockDataset.collate_fn,
        prefetch_factor=prefetch_factor,
        num_workers=num_workers,
    )

    # Trainer
    print("Training options:", train_options)
    pl_model = PLModel(config=config,
                       lr=lr,
                       num_warmup_steps=num_warmup_steps,
                       num_training_steps=num_training_steps)

    # Setup callbacks
    callbacks = [
        pl.callbacks.LearningRateMonitor(),
    ]
    if "gpus" in train_options:
        callbacks.append(pl.callbacks.GPUStatsMonitor())

    # Setup plugins
    plugins = []
    if ds_config:
        plugins.append(DeepSpeedPlugin(config=ds_config))

    # Trainer
    trainer = pl.Trainer(
        **train_options,
        deterministic=True if seed else False,
        callbacks=callbacks,
        plugins=plugins,  # pass the DeepSpeed plugin when configured
    )
    trainer.fit(model=pl_model,
                train_dataloader=train_loader,
                val_dataloaders=valid_loader)

    pl_model.model.save_pretrained(save_model_dir)
    tokenizer.save_pretrained(save_model_dir)
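After main finishes, save_model_dir holds both the model weights and the tokenizer, so text generation is just a reload away. A small hedged sketch of that round trip; the directory name and prompt below are illustrative only:

import transformers

save_model_dir = "output/gpt2-small"  # whatever directory main() saved into
tokenizer = transformers.AutoTokenizer.from_pretrained(save_model_dir)
model = transformers.AutoModelForCausalLM.from_pretrained(save_model_dir)
model.eval()

input_ids = tokenizer("Once upon a time", return_tensors="pt").input_ids
output_ids = model.generate(input_ids, max_length=40, do_sample=True, top_p=0.95)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))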