def load(cls, pretrained_model_name_or_path, language=None, **kwargs):
    """
    Load a pretrained model by supplying

    * the name of a remote model on s3 ("gpt2" ...)
    * OR a local path of a model trained via transformers ("some_dir/huggingface_model")
    * OR a local path of a model trained via FARM ("some_dir/farm_model")

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name.
    :type pretrained_model_name_or_path: str
    """
    gpt2 = cls()
    if "farm_lm_name" in kwargs:
        gpt2.name = kwargs["farm_lm_name"]
    else:
        gpt2.name = pretrained_model_name_or_path
    # We need to differentiate between loading model using FARM format and Pytorch-Transformers format
    farm_lm_config = Path(pretrained_model_name_or_path) / "language_model_config.json"
    if os.path.exists(farm_lm_config):
        # FARM style
        gpt2_config = GPT2Config.from_pretrained(farm_lm_config)
        farm_lm_model = Path(pretrained_model_name_or_path) / "language_model.bin"
        gpt2.model = GPT2Model.from_pretrained(farm_lm_model, config=gpt2_config, **kwargs)
        gpt2.language = gpt2.model.config.language
    else:
        # Pytorch-transformer Style
        gpt2.model = GPT2Model.from_pretrained(str(pretrained_model_name_or_path), **kwargs)
        gpt2.language = cls._get_or_infer_language_from_name(language, pretrained_model_name_or_path)
    return gpt2

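# A minimal, self-contained sketch (assumption: standard transformers API; the local
# directory below is purely illustrative) of the two loading branches the method above
# dispatches between:
import os
from pathlib import Path
from transformers import GPT2Config, GPT2Model

model_dir = Path("some_dir/farm_model")  # hypothetical FARM-style checkpoint directory
if os.path.exists(model_dir / "language_model_config.json"):
    # FARM layout: separate config and weights files
    farm_config = GPT2Config.from_pretrained(model_dir / "language_model_config.json")
    lm = GPT2Model.from_pretrained(model_dir / "language_model.bin", config=farm_config)
else:
    # transformers layout or a remote model name
    lm = GPT2Model.from_pretrained("gpt2")
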
def __init__(
    self,
    config,
    class_labels,
    pretrained_model_path,
    dropout=0.1,
    freeze_pretrained_part=True,
    reinitialize=False,
    n_layers=6,
):
    super().__init__(config, class_labels)

    if reinitialize:
        logger.info('resetting model weights')
        config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
        config = config.to_dict()
        config['n_layer'] = n_layers
        config = GPT2Config.from_dict(config)
        self.gpt2 = GPT2Model(config)
    else:
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)

    self.dropout = torch.nn.Dropout(dropout)
    self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)

    if freeze_pretrained_part:
        for param in self.gpt2.parameters():
            param.requires_grad = False

def __init__(self, config):  # NPI added functionality
    super(GPT2WithNPI, self).__init__(config)  # NPI added functionality
    # self.npi = npi  # NPI added functionality
    # self.prediction_indices = prediction_indices  # NPI added functionality
    GPT2Model.__init__(self, config)  # NPI added functionality
    pass

def build_models(text_encoder_type):
    # build model ############################################################
    text_encoder_type = text_encoder_type.casefold()
    if text_encoder_type not in ('rnn', 'transformer'):
        raise ValueError('Unsupported text_encoder_type')

    if text_encoder_type == 'rnn':
        text_encoder = RNN_ENCODER(dataset.n_words, nhidden=cfg.TEXT.EMBEDDING_DIM)
    image_encoder = CNN_ENCODER(cfg.TEXT.EMBEDDING_DIM)
    labels = Variable(torch.LongTensor(range(batch_size)))
    start_epoch = 0

    if cfg.TRAIN.NET_E:
        if text_encoder_type == 'rnn':
            state_dict = torch.load(cfg.TRAIN.NET_E)
            text_encoder.load_state_dict(state_dict)
        elif text_encoder_type == 'transformer':
            text_encoder = GPT2Model.from_pretrained(cfg.TRAIN.NET_E)
            # output_hidden_states = True )
        print('Load ', cfg.TRAIN.NET_E)
        #
        name = cfg.TRAIN.NET_E.replace('text_encoder', 'image_encoder')
        state_dict = torch.load(name)
        image_encoder.load_state_dict(state_dict)
        print('Load ', name)

        istart = cfg.TRAIN.NET_E.rfind('_') + 8
        iend = cfg.TRAIN.NET_E.rfind('.')
        start_epoch = cfg.TRAIN.NET_E[istart:iend]
        start_epoch = int(start_epoch) + 1
    else:
        if text_encoder_type == 'rnn':
            print('Training RNN from scratch')
        elif text_encoder_type == 'transformer':
            # don't initialize the weights of these huge models from scratch...
            print('Training Transformer starting from pretrained model')
            text_encoder = GPT2Model.from_pretrained(TRANSFORMER_ENCODER)
            # output_hidden_states = True )
        print('Training CNN starting from ImageNet pretrained Inception-v3')

    print('start_epoch', start_epoch)
    if cfg.CUDA:
        text_encoder = text_encoder.cuda()
        image_encoder = image_encoder.cuda()
        labels = labels.cuda()
    return text_encoder, image_encoder, labels, start_epoch

def __init__(self, config, num_output_labels=4):
    config.output_attentions = True
    super(GPT2ClassificationModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.CNN_Max = nn.Sequential(
        # Defining a 2D convolution layer
        nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2),
        # Defining another 2D convolution layer
        nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )
    self.CNN_Avg = nn.Sequential(
        # Defining a 2D convolution layer
        nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.AvgPool2d(kernel_size=2, stride=2),
        # Defining another 2D convolution layer
        nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.AvgPool2d(kernel_size=2, stride=2),
    )
    self.ff_layers = nn.Sequential(
        nn.Linear(256, 10),
        nn.Linear(10, num_output_labels))
    self.final_softmax = nn.Softmax(dim=1)
    self.init_weights()

def __init__(self, hidden_size: int, num_classes: int, max_seq_len: int, gpt_model_name: str, cache_dir: str):
    super(SimpleGPT2SequenceClassifier, self).__init__()
    self.gpt2model = GPT2Model.from_pretrained(gpt_model_name, cache_dir=cache_dir)
    self.fc1 = nn.Linear(hidden_size, num_classes)

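# Hedged usage sketch (not the original class's forward pass): how a GPT-2 backbone plus
# a single linear head is commonly wired for sequence classification. The last-token
# pooling and the 3-class head below are illustrative assumptions only.
import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Tokenizer

clf_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
clf_backbone = GPT2Model.from_pretrained("gpt2")
clf_head = nn.Linear(clf_backbone.config.n_embd, 3)  # hypothetical 3-way classifier

batch = clf_tokenizer("an example sentence to classify", return_tensors="pt")
with torch.no_grad():
    hidden = clf_backbone(**batch).last_hidden_state  # [batch, seq_len, n_embd]
logits = clf_head(hidden[:, -1, :])  # pool the final token's hidden state
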
def create_and_check_gpt2_weight_initialization(self, config, *args):
    model = GPT2Model(config)
    model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
    for key in model.state_dict().keys():
        if "c_proj" in key and "weight" in key:
            self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
            self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)

def test_openai_gpt2():
    from transformers import GPT2Model, GPT2Tokenizer
    input_text = "Here is some text to encode"
    pt_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    pt_model = GPT2Model.from_pretrained("gpt2", return_dict=True)
    pt_outputs = pt_model(**pt_tokenizer([input_text], return_tensors="pt"))
    task = build_task({
        "class": "lm",
        "params": {
            "data_pipeline.class": "GPT2DataPipeline",
            "max_len": 50,
            "begin_of_sentence": "eos"
        }
    })
    model_cfgs = get_hyper_parameters("gpt2_117m")
    model = task.build_model(model_cfgs)
    restore_checkpoint_if_possible_v2(model, "117M", model_name="OpenAIGPT2")
    input_ids = task._data_pipeline.process(input_text)
    tf_inputs = {
        "trg_input": tf.convert_to_tensor([input_ids], tf.int64),
        "trg_length": tf.convert_to_tensor([len(input_ids)], tf.int64)
    }
    _, gen_init = model.get_symbols_to_logits_fn(tf_inputs, is_training=False, is_inference=False)
    tf_outputs = model.get_decoder_output(gen_init["decoder_input"],
                                          cache=gen_init["decoder_internal_cache"],
                                          is_training=False)
    assert_equal_numpy(pt_outputs.last_hidden_state.detach().numpy(),
                       tf_outputs[:, :-1].numpy(), 5e-4)

def from_pretrained(self, args):
    # loading from pre-trained
    encoder_path = args.output_dir + "/encoder/"
    decoder_path = args.output_dir + "/decoder/"
    vae_path = args.output_dir + "/vae/vae.weights"
    tokenizer_path = args.output_dir + "/tokenizer/"

    logger.info("gpt2_config: " + str(self.gpt2_config))
    self.gpt2_config.vocab_size = self.gpt2_config.vocab_size + 2

    self.encoder = GPT2Model.from_pretrained(
        encoder_path,
        from_tf=bool('.ckpt' in encoder_path),
        config=self.gpt2_config)
    self.decoder = GPT2LMHeadModel.from_pretrained(
        decoder_path,
        from_tf=bool('.ckpt' in decoder_path),
        config=self.gpt2_config)
    self.tokenizer = GPT2Tokenizer.from_pretrained(
        tokenizer_path, do_lower_case=args.do_lower_case)
    self.vae.load_state_dict(torch.load(vae_path))

    # set up for evaluating
    self.encoder.eval()
    self.decoder.eval()
    self.vae.eval()

    # load training args
    training_args = torch.load(os.path.join(args.output_dir, 'training_args.bin'))
    logger.info("training_args: " + str(training_args))
    return

def __init__(self, config, **kwargs):
    super().__init__(config)
    self.args = kwargs['args']
    self.config = config

    # core gpt2 and lm head
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    # mention detection output index
    self.mc_cl2idx = {'<N>': 0, '<M>': 1, '</M>': 2}
    self.mc_idx2cl = {v: k for k, v in self.mc_cl2idx.items()}
    self.cl_head = nn.Linear(config.n_embd, 3)  # head for 3 classes in mention detection

    # attention parameters in coref2qr mechanism
    if self.args.coref_attn_share_between_layer:
        self.c_attn = Conv1D(3 * config.n_embd, config.n_embd)
    else:
        self.c_attn = nn.ModuleList([
            Conv1D(3 * config.n_embd, config.n_embd)
            for _ in range(self.config.n_layer + 1)
        ])

    # binary classification for rewriting or not
    if self.args.use_binary_cls:
        self.binary_cls1 = nn.Linear(config.n_embd, config.n_embd)
        self.binary_cls2 = nn.Linear(config.n_embd, 2, bias=False)  # output layer for rewrite or not

    self.init_weights()

def initialize_model(self, args):
    # load pretrained model and tokenizer for GPT2 encoder and decoder
    encoder_path = args.gpt2_model_name_or_path
    decoder_path = args.gpt2_model_name_or_path
    tokenizer_path = args.gpt2_model_name_or_path

    self.encoder = GPT2Model.from_pretrained(
        encoder_path,
        from_tf=bool('.ckpt' in encoder_path),
        config=self.gpt2_config)
    self.decoder = GPT2LMHeadModel.from_pretrained(
        decoder_path,
        from_tf=bool('.ckpt' in decoder_path),
        config=self.gpt2_config)
    self.tokenizer = GPT2Tokenizer.from_pretrained(
        tokenizer_path, do_lower_case=args.do_lower_case)

    # add [SOS] and [PAD] to tokenizer
    self.tokenizer.add_special_tokens(
        {"additional_special_tokens": ["[PAD]", "[SOS]"]})
    self.encoder.resize_token_embeddings(len(self.tokenizer))
    self.decoder.resize_token_embeddings(len(self.tokenizer))
    logger.info("tokenizer size: " + str(self.tokenizer.__len__()))
    logger.info("tokenizer.decode [50256, 50257, 50258]: " +
                str(self.tokenizer.decode([50256, 50257, 50258])))

    # No controlled initialization for VAE
    logger.info("cautions: no init VAE")
    return

def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.transformers = GPT2Model(config)
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()

def create_and_check_gpt2_model_past_large_inputs(
    self, config, input_ids, input_mask, head_mask, token_type_ids, *args
):
    model = GPT2Model(config=config)
    model.to(torch_device)
    model.eval()

    # first forward pass
    outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
    output, past = outputs.to_tuple()

    # create hypothetical next tokens and extend to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)

    # append to next input_ids and token_type_ids
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)

    output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
    output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
    self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

def get_attentions():
    model_name = request.args.get('model')
    source = request.args.get('source')
    target = request.args.get('target')

    if model_name == 'XLM':
        model_version = 'xlm-mlm-ende-1024'
        model = XLMModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = XLMTokenizer.from_pretrained(model_version)
    elif model_name == 'GPT-2':
        model_version = 'gpt2'
        model = GPT2Model.from_pretrained(model_version, output_attentions=True)
        tokenizer = GPT2Tokenizer.from_pretrained(model_version)
    else:
        # BERT
        model_version = 'bert-base-uncased'
        model = BertModel.from_pretrained(model_version, output_attentions=True)
        tokenizer = BertTokenizer.from_pretrained(model_version, do_lower_case=True)

    inputs = tokenizer.encode_plus(source, target, return_tensors='pt', add_special_tokens=True)
    token_type_ids = inputs['token_type_ids']
    input_ids = inputs['input_ids']
    attention = model(input_ids, token_type_ids=token_type_ids)[-1]
    input_id_list = input_ids[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(input_id_list)
    return {'attention': format_attention(attention)[0].tolist(),
            'source': tokens,
            'target': tokens}

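# Self-contained sketch (assumptions: standard transformers GPT-2 API; the Flask request
# handling and the format_attention helper above are out of scope) of how the per-layer
# attention tensors consumed by the endpoint can be obtained directly:
import torch
from transformers import GPT2Model, GPT2Tokenizer

viz_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
viz_model = GPT2Model.from_pretrained('gpt2', output_attentions=True)

viz_inputs = viz_tokenizer("The cat sat on the mat", return_tensors='pt')
with torch.no_grad():
    viz_outputs = viz_model(**viz_inputs)
# viz_outputs.attentions is a tuple with one tensor per layer,
# each of shape [batch, num_heads, seq_len, seq_len]
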
def __init__(self, config):
    super(HFGpt, self).__init__(config)
    args = self.args
    self.hidden_dim = args["hidden_dim"]
    self.num_layers = 1  # Needed for initalize_h()
    self.batch_size = config["processor"]["params"]["batch_size"]
    self.encoders = config["processor"]["params"]["label_encoder"]
    self.num_classes = config["processor"]["params"]["num_classes"]
    self.num_outputs = len(self.num_classes)
    self.teacher_enforced = args["teacher_enforced"]
    self.in_seq_len = args["inp_seq_len"]
    self.out_seq_len = args["out_seq_len"]
    self.vocab_size = args["vocab_size"]
    self.model_name_or_path = args["model_name_or_path"]
    self.initializer_range = args["initializer_range"]
    self.logger.debug(self.args)

    # Shared for all input
    self.encoder_decoder = GPT2Model.from_pretrained(self.model_name_or_path)

    # For each output
    self.out_decoder = torch.nn.ModuleList()
    for i in range(self.num_outputs):
        clss = torch.nn.Linear(self.hidden_dim, self.num_classes[i])
        # Common init way in most sota models
        clss.weight.data.normal_(mean=0.0, std=self.initializer_range)
        self.out_decoder.append(clss)

    # Print statistics
    self.initialize()

def __init__(self, cfg, clf_token, task_head_type, vocab=40990, n_ctx=512):
    super(DoubleHeadModel, self).__init__()
    # self.transformer = TransformerModel(cfg, vocab=vocab, n_ctx=n_ctx)
    self.transformer = GPT2Model.from_pretrained('gpt2')
    self.lm_head = LMHead(self.transformer, cfg)
    if isinstance(task_head_type, str):
        if task_head_type == 'multiple_choice':
            self.task_head = MultipleChoiceHead(clf_token, cfg)
        elif task_head_type == 'similarity':
            self.task_head = SimilarityHead(clf_token, cfg)
        elif task_head_type == 'inference':
            # the three classes correspond to entailment, contradiction and neutral.
            self.task_head = ClfHead(clf_token, cfg, 3)
        else:
            raise ValueError(
                "task_head_type is expected to be 'multiple_choice' "
                "'similarity', 'inference' or ('classification', n_class) "
                f"got {task_head_type}.")
    elif isinstance(task_head_type, collections.abc.Sequence) and len(task_head_type) == 2 and \
            task_head_type[0] == 'classification':
        n_class = task_head_type[1]
        self.task_head = ClfHead(clf_token, cfg, n_class)
    else:
        raise ValueError(
            "task_head_type is expected to be 'multiple_choice' "
            "'similarity', 'inference' or ('classification', n_class) "
            f"got {task_head_type}.")

def __init__(self, freeze_bert, tokenizer, device, bidirectional):
    super(GPT2LSTMLogRegCRF, self).__init__()
    # Instantiating the pretrained GPT-2 model object
    self.gpt2_layer = GPT2Model.from_pretrained('gpt2',
                                                output_hidden_states=True,
                                                output_attentions=False)

    # Freeze the pretrained layers: if freeze_bert is True, the GPT-2 weights are frozen
    if freeze_bert:
        for p in self.gpt2_layer.parameters():
            p.requires_grad = False

    self.tokenizer = tokenizer
    self.device = device
    self.bidirectional = bidirectional
    self.dropout = nn.Dropout(0.5)

    # lstm layer
    self.lstm_layer = nn.LSTM(input_size=768, hidden_size=512, num_layers=1,
                              bidirectional=bidirectional, batch_first=True)

    # log reg
    if bidirectional == True:
        self.hidden2tag = nn.Linear(1024, clf_P_num_labels)
        self.hidden2tag_fine = nn.Linear(1024, clf_P_fine_num_labels)
    else:
        self.hidden2tag = nn.Linear(512, clf_P_num_labels)
        self.hidden2tag_fine = nn.Linear(512, clf_P_fine_num_labels)

    # crf (coarse)
    self.crf_layer = CRF(clf_P_num_labels, batch_first=True)

    # crf (fine)
    self.crf_layer_fine = CRF(clf_P_fine_num_labels, batch_first=True)

def main():
    options = parse_args()
    input_path = Path(options.input)
    if input_path.is_dir():
        input_path = input_path / "model.pt"
    checkpoint = torch.load(input_path, map_location="cpu")
    converted_state = {
        rename_key(key): reshape_weight(key, value)
        for key, value in checkpoint["state_dict"].items()
    }
    gpt2 = GPT2Model.from_pretrained("gpt2-medium")
    # The vocab is smaller than the actual gpt2 one, therefore it is padded with zeros
    # to match it. The zeros will be unused.
    gpt2_vocab_size = gpt2.wte.weight.size(0)
    vocab_size = converted_state["wte.weight"].size(0)
    pad_size = gpt2_vocab_size - vocab_size
    converted_state["wte.weight"] = F.pad(
        converted_state["wte.weight"], [0, 0, 0, pad_size], mode="constant", value=0.0
    )
    # There are some weights that are not in the pre-trained model, which will be
    # trained in the downstream task. As long as every key from the pre-trained model
    # matched one of the converted keys, it should be fine.
    incompatible_keys = gpt2.load_state_dict(converted_state, strict=False)
    assert (
        len(incompatible_keys.unexpected_keys) == 0
    ), "Unexpected keys in the model: {}".format(incompatible_keys.unexpected_keys)
    gpt2.save_pretrained(options.output)

def __init__(self, config):
    print("************ THIS MODEL COMES FROM CS224N PROJECT ************")
    super().__init__(config)
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.init_weights()

def __init__(self, config: Munch):
    r"""Init a new GPT2 synapse module.

    Args:
        config (:obj:`munch.Munch`, `required`):
            munched config class.
    """
    super(GPT2LMSynapse, self).__init__(config=config)
    if config is None:
        config = GPT2LMSynapse.build_config()

    # Build hugging face config.
    huggingface_config = GPT2Config(
        vocab_size=bittensor.__vocab_size__,
        n_embd=bittensor.__network_dim__,
        n_layer=config.synapse.n_layer,
        n_head=config.synapse.n_head,
        n_inner=config.synapse.n_inner,
        activation_function=config.synapse.activation_function,
        resid_pdrop=config.synapse.resid_pdrop,
        embd_pdrop=config.synapse.embd_pdrop,
        attn_pdrop=config.synapse.attn_pdrop,
        layer_norm_epsilon=config.synapse.layer_norm_epsilon,
        initializer_range=config.synapse.initializer_range,
        summary_type=config.synapse.summary_type,
        summary_use_proj=config.synapse.summary_use_proj,
        summary_activation=config.synapse.summary_activation,
        summary_proj_to_labels=config.synapse.summary_proj_to_labels,
        summary_first_dropout=config.synapse.summary_first_dropout,
    )

    # encoder_layer: encodes tokenized sequences to network dim.
    # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
    self.transformer = GPT2Model(huggingface_config)

    # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
    # [batch_size, bittensor.__network_dim__, sequence_len] -> [batch_size, bittensor.__network_dim__]
    self.pooler = GPT2Pooler(huggingface_config)

    # router: (PKM layer) queries network using pooled embeddings as context.
    # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
    self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

    # hidden_layer: transforms context and encoding to network_dim hidden units.
    # [batch_size, sequence_dim, 2 * bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
    self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

    # target_layer: maps from hidden layer to vocab dimension for each token. Used by MLM loss.
    # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
    self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False)

    # Loss function: MLM cross-entropy loss.
    # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
    self.loss_fct = nn.CrossEntropyLoss()

    self.to(self.device)

def create_and_check_gpt2_model_attention_mask_past(
        self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
    model = GPT2Model(config=config)
    model.to(torch_device)
    model.eval()

    # create attention mask
    attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
    half_seq_length = self.seq_length // 2
    attn_mask[:, half_seq_length:] = 0

    # first forward pass
    output, past = model(input_ids, attention_mask=attn_mask).to_tuple()

    # create hypothetical next token and extend to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

    # change a random masked slice from input_ids
    random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
    random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
    input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

    # append to next input_ids and attn_mask
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    attn_mask = torch.cat(
        [
            attn_mask,
            torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)
        ],
        dim=1,
    )

    # get two different outputs
    output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
    output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

    # test that outputs are equal for slice
    self.parent.assertTrue(
        torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))

def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.v_head = ValueHead(config)
    self.init_weights()

def from_pretrained(model_id_or_path: str,
                    device: Optional[torch.device] = None,
                    backend: Optional[str] = None):
    torch_model = TorchGPT2Model.from_pretrained(model_id_or_path)
    model = GPT2Model.from_torch(torch_model, device, backend)
    model.config = torch_model.config
    model._torch_model = torch_model  # keep a reference to prevent the torch model from being destroyed
    return model

def __init__(self, config):
    super().__init__(config)
    # config.num_labels = 1
    config.num_labels = le.classes_.shape[0]
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.multiple_choice_head = SequenceSummary(config)
    self.init_weights()

def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.cls_head = SequenceSummary(config)
    self.init_weights()

def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, full, gpt2_config_file, pytorch_dump_folder_path):
    # putting requirements here so users can see usage info before it errors out on missing modules
    from io import open
    from shutil import copyfile
    import logging
    logging.basicConfig(level=logging.INFO)
    from pathlib import Path
    import torch

    # WEIGHTS_NAME = "pytorch_model.bin"
    # CONFIG_NAME = "config.json"
    from transformers import (
        CONFIG_NAME,
        WEIGHTS_NAME,
        GPT2Config,
        GPT2Model,
        load_tf_weights_in_gpt2,
    )

    gpt2_checkpoint_path = Path(gpt2_checkpoint_path)
    print(gpt2_checkpoint_path.name)

    if pytorch_dump_folder_path == '':
        prefix = '32BIT-' if full else '16BIT-'
        pytorch_dump_folder_path = 'pytorch-' + prefix + gpt2_checkpoint_path.name
    pytorch_dump_folder_path = Path(pytorch_dump_folder_path)
    pytorch_dump_folder_path.mkdir(exist_ok=True)

    # Construct model
    if gpt2_config_file == "":
        # This doesn't seem to work. We will use the hparams.json file that seems to be included in the checkpoint directory instead.
        # config = GPT2Config()
        gpt2_config_file = gpt2_checkpoint_path / 'hparams.json'
    config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
    if not full:
        model.half()

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path / WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path / CONFIG_NAME
    print("Save PyTorch model to {}".format(str(pytorch_weights_dump_path)))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to: " + str(pytorch_config_dump_path))
    with pytorch_config_dump_path.open("w", encoding="utf-8") as f:
        f.write(config.to_json_string())
    copyfile(gpt2_checkpoint_path / 'vocab.bpe', pytorch_dump_folder_path / 'merges.txt')
    copyfile(gpt2_checkpoint_path / 'encoder.json', pytorch_dump_folder_path / 'vocab.json')

def __init__(self):
    super(GPT24QUAC, self).__init__()
    '''Load the pre-trained GPT-2 base model (GPT2Model)'''
    self.gpt2 = GPT2Model.from_pretrained("gpt2")
    self.config = self.gpt2.config
    self.head = nn.Linear(self.config.n_embd, 2, bias=True)
    self.loss_func = nn.CrossEntropyLoss()

def __init__(self, hidden_size: int, num_classes: int):
    super(GPT2ForTokenClassification, self).__init__()
    self.gpt2model = GPT2Model.from_pretrained('gpt2')  # GPT2ClassificationHead()
    self.num_labels = num_classes
    self.fc1 = nn.Linear(hidden_size, num_classes)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--hidden_layer_num', type=int,
                        help="Number 0..48 of the layer to get hidden states from")
    parser.add_argument('--batch_size', type=int, default=32)
    args = parser.parse_args()

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    config = GPT2Config.from_pretrained('gpt2-medium', output_hidden_states=True)
    gpt2 = GPT2Model.from_pretrained('gpt2-medium', config=config).cuda()

    logging.getLogger("transformers.tokenization_utils").setLevel(logging.ERROR)

    for subsample in ["train", "test"]:
        if not os.path.isdir(subsample):
            os.mkdir(subsample)
        df = pd.read_csv('{}.csv'.format(subsample))

        # reuse the cached token ids if they exist (cache file name matches the dump below)
        if os.path.isfile(f'{subsample}_gpt2.pkl'):
            print("Loading token ids...", file=sys.stderr)
            tokens = joblib.load(f'{subsample}_gpt2.pkl')
        else:
            print("Transforming texts to token ids...", file=sys.stderr)
            tokens = [tokenizer.encode(x) for x in tqdm(df.texts)]
            joblib.dump(tokens, f'{subsample}_gpt2.pkl')

        dataset = DiscourseDataset(tokens, pad_token_id=0, max_len=config.n_positions)
        dataloader = torch.utils.data.DataLoader(dataset, batch_size=args.batch_size)

        gpt2.eval()
        mean_results, max_results = list(), list()
        with torch.no_grad():
            for num, (token_ids, attention_ids) in enumerate(tqdm(dataloader), 1):
                _, _, hidden_states = gpt2(token_ids, attention_mask=attention_ids)
                hidden_states_cpu = [x.cpu().numpy() for x in hidden_states]
                del hidden_states
                gc.collect()
                output = hidden_states_cpu[args.hidden_layer_num]
                del hidden_states_cpu

                sentence_lens = attention_ids.sum(1).cpu().numpy()
                output_zero_padding = output.transpose([2, 0, 1]) * attention_ids.cpu().numpy()
                output_zero_padding = output_zero_padding.transpose([1, 2, 0])

                mean_result = (output_zero_padding.sum(1).T / sentence_lens).T
                max_result = np.array([matrix[:length].max(0)
                                       for matrix, length in zip(output_zero_padding, sentence_lens)])
                mean_results.append(mean_result)
                max_results.append(max_result)
                torch.cuda.empty_cache()

        np.save(f'{subsample}/gpt2_mean_embeddings_layer_{args.hidden_layer_num}', np.vstack(mean_results))
        np.save(f'{subsample}/gpt2_max_embeddings_layer_{args.hidden_layer_num}', np.vstack(max_results))

def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.debias_head = nn.functional.linear
    self.multiple_choice_head = SequenceSummary(config)
    self.init_weights()