def __init__(self, config):
    print("************ THIS MODEL COMES FROM CS224N PROJECT ************")
    super().__init__(config)
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.init_weights()
def __init__(
    self,
    config,
    class_labels,
    pretrained_model_path,
    dropout=0.1,
    freeze_pretrained_part=True,
    reinitialize=False,
    n_layers=6,
):
    super().__init__(config, class_labels)

    if reinitialize:
        logger.info('resetting model weights')
        config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
        config = config.to_dict()
        config['n_layer'] = n_layers
        config = GPT2Config.from_dict(config)
        self.gpt2 = GPT2Model(config)
    else:
        self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)

    self.dropout = torch.nn.Dropout(dropout)
    self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)

    if freeze_pretrained_part:
        for param in self.gpt2.parameters():
            param.requires_grad = False
def create_and_check_gpt2_weight_initialization(self, config, *args):
    model = GPT2Model(config)
    model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
    for key in model.state_dict().keys():
        if "c_proj" in key and "weight" in key:
            self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
            self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
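# For reference, the value checked above corresponds to GPT-2's scaled residual
# initialization: c_proj weights are drawn with std = initializer_range / sqrt(2 * n_layer).
# A minimal sketch of that expected value, assuming the default GPT2Config:
import math
from transformers import GPT2Config

cfg = GPT2Config()  # defaults: initializer_range=0.02, n_layer=12
expected_std = cfg.initializer_range / math.sqrt(2 * cfg.n_layer)
print(expected_std)  # ~0.00408 for the default 12-layer configuration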
def __init__(self, config, num_output_labels=4):
    config.output_attentions = True
    super(GPT2ClassificationModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.CNN_Max = nn.Sequential(
        # Defining a 2D convolution layer
        nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2),
        # Defining another 2D convolution layer
        nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.MaxPool2d(kernel_size=2, stride=2),
    )
    self.CNN_Avg = nn.Sequential(
        # Defining a 2D convolution layer
        nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.AvgPool2d(kernel_size=2, stride=2),
        # Defining another 2D convolution layer
        nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
        nn.BatchNorm2d(4),
        nn.ReLU(inplace=True),
        nn.AvgPool2d(kernel_size=2, stride=2),
    )
    self.ff_layers = nn.Sequential(nn.Linear(256, 10), nn.Linear(10, num_output_labels))
    self.final_softmax = nn.Softmax(dim=1)
    self.init_weights()
def create_and_check_gpt2_model_past_large_inputs(
    self, config, input_ids, input_mask, head_mask, token_type_ids, *args
):
    model = GPT2Model(config=config)
    model.to(torch_device)
    model.eval()

    # first forward pass
    outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
    output, past = outputs.to_tuple()

    # create hypothetical next tokens and extend to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
    next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)

    # append to next input_ids and token_type_ids
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)

    output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
    output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
    self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def __init__(self, config, **kwargs):
    super().__init__(config)
    self.args = kwargs['args']
    self.config = config

    # core gpt2 and lm head
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

    # mention detection output index
    self.mc_cl2idx = {'<N>': 0, '<M>': 1, '</M>': 2}
    self.mc_idx2cl = {v: k for k, v in self.mc_cl2idx.items()}
    self.cl_head = nn.Linear(config.n_embd, 3)  # head for the 3 classes in mention detection

    # attention parameters in coref2qr mechanism
    if self.args.coref_attn_share_between_layer:
        self.c_attn = Conv1D(3 * config.n_embd, config.n_embd)
    else:
        self.c_attn = nn.ModuleList([
            Conv1D(3 * config.n_embd, config.n_embd) for _ in range(self.config.n_layer + 1)
        ])

    # binary classification for rewriting or not
    if self.args.use_binary_cls:
        self.binary_cls1 = nn.Linear(config.n_embd, config.n_embd)
        self.binary_cls2 = nn.Linear(config.n_embd, 2, bias=False)  # output layer for rewrite or not

    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.transformers = GPT2Model(config)
    self.dropout = nn.Dropout(0.1)
    self.classifier = nn.Linear(config.hidden_size, config.num_labels)
    self.init_weights()
def __init__(self, config: Munch):
    r"""Init a new GPT2 synapse module.

    Args:
        config (:obj:`munch.Munch`, `required`):
            munched config class.
    """
    super(GPT2LMSynapse, self).__init__(config=config)
    if config is None:
        config = GPT2LMSynapse.build_config()

    # Build hugging face config.
    huggingface_config = GPT2Config(
        vocab_size=bittensor.__vocab_size__,
        n_embd=bittensor.__network_dim__,
        n_layer=config.synapse.n_layer,
        n_head=config.synapse.n_head,
        n_inner=config.synapse.n_inner,
        activation_function=config.synapse.activation_function,
        resid_pdrop=config.synapse.resid_pdrop,
        embd_pdrop=config.synapse.embd_pdrop,
        attn_pdrop=config.synapse.attn_pdrop,
        layer_norm_epsilon=config.synapse.layer_norm_epsilon,
        initializer_range=config.synapse.initializer_range,
        summary_type=config.synapse.summary_type,
        summary_use_proj=config.synapse.summary_use_proj,
        summary_activation=config.synapse.summary_activation,
        summary_proj_to_labels=config.synapse.summary_proj_to_labels,
        summary_first_dropout=config.synapse.summary_first_dropout,
    )

    # encoder_layer: encodes tokenized sequences to network dim.
    # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
    self.transformer = GPT2Model(huggingface_config)

    # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
    # [batch_size, bittensor.__network_dim__, sequence_len] -> [batch_size, bittensor.__network_dim__]
    self.pooler = GPT2Pooler(huggingface_config)

    # router: (PKM layer) queries network using pooled embeddings as context.
    # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
    self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

    # hidden_layer: transforms context and encoding to network_dim hidden units.
    # [batch_size, sequence_dim, 2 * bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
    self.hidden_layer = nn.Linear(bittensor.__network_dim__, bittensor.__network_dim__)

    # target_layer: maps from hidden layer to vocab dimension for each token. Used by MLM loss.
    # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
    self.target_layer = nn.Linear(bittensor.__network_dim__, bittensor.__vocab_size__, bias=False)

    # Loss function: MLM cross-entropy loss.
    # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
    self.loss_fct = nn.CrossEntropyLoss()

    self.to(self.device)
def create_and_check_gpt2_model_attention_mask_past(
    self, config, input_ids, input_mask, head_mask, token_type_ids, *args
):
    model = GPT2Model(config=config)
    model.to(torch_device)
    model.eval()

    # create attention mask
    attn_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)
    half_seq_length = self.seq_length // 2
    attn_mask[:, half_seq_length:] = 0

    # first forward pass
    output, past = model(input_ids, attention_mask=attn_mask).to_tuple()

    # create hypothetical next token and extend to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

    # change a random masked slice from input_ids
    random_seq_idx_to_change = ids_tensor((1,), half_seq_length).item() + 1
    random_other_next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size).squeeze(-1)
    input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

    # append to next input_ids and attn_mask
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    attn_mask = torch.cat(
        [attn_mask, torch.ones((attn_mask.shape[0], 1), dtype=torch.long, device=torch_device)],
        dim=1,
    )

    # get two different outputs
    output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
    output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def __init__(self, config):
    super().__init__(config)
    # config.num_labels = 1
    config.num_labels = le.classes_.shape[0]
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.multiple_choice_head = SequenceSummary(config)
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.v_head = ValueHead(config)
    self.init_weights()
def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.cls_head = SequenceSummary(config)
    self.init_weights()
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, full, gpt2_config_file, pytorch_dump_folder_path):
    # putting requirements here so users can see usage info before it errors out on missing modules
    from io import open
    from shutil import copyfile
    import logging
    logging.basicConfig(level=logging.INFO)
    from pathlib import Path
    import torch

    # WEIGHTS_NAME = "pytorch_model.bin"
    # CONFIG_NAME = "config.json"
    from transformers import (
        CONFIG_NAME,
        WEIGHTS_NAME,
        GPT2Config,
        GPT2Model,
        load_tf_weights_in_gpt2,
    )

    gpt2_checkpoint_path = Path(gpt2_checkpoint_path)
    print(gpt2_checkpoint_path.name)

    if pytorch_dump_folder_path == '':
        prefix = '32BIT-' if full else '16BIT-'
        pytorch_dump_folder_path = 'pytorch-' + prefix + gpt2_checkpoint_path.name
    pytorch_dump_folder_path = Path(pytorch_dump_folder_path)
    pytorch_dump_folder_path.mkdir(exist_ok=True)

    # Construct model
    if gpt2_config_file == "":
        # This doesn't seem to work. We will use the hparams.json file that seems to be included in
        # the checkpoint directory instead.
        # config = GPT2Config()
        gpt2_config_file = gpt2_checkpoint_path / 'hparams.json'

    config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)

    if not full:
        model.half()

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path / WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path / CONFIG_NAME

    print("Save PyTorch model to {}".format(str(pytorch_weights_dump_path)))
    torch.save(model.state_dict(), pytorch_weights_dump_path)

    print("Save configuration file to: " + str(pytorch_config_dump_path))
    with pytorch_config_dump_path.open("w", encoding="utf-8") as f:
        f.write(config.to_json_string())

    copyfile(gpt2_checkpoint_path / 'vocab.bpe', pytorch_dump_folder_path / 'merges.txt')
    copyfile(gpt2_checkpoint_path / 'encoder.json', pytorch_dump_folder_path / 'vocab.json')
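# A minimal sketch of how the conversion helper above might be invoked; the checkpoint
# path is hypothetical and should point at a directory containing the TensorFlow GPT-2
# files (hparams.json, encoder.json, vocab.bpe, model.ckpt.*).
convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path="./models/117M",  # hypothetical local checkpoint directory
    full=False,                            # save fp16 weights ('16BIT-' output prefix)
    gpt2_config_file="",                   # fall back to hparams.json in the checkpoint
    pytorch_dump_folder_path="",           # auto-named output folder
)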
def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.debias_head = nn.functional.linear
    self.multiple_choice_head = SequenceSummary(config)
    self.init_weights()
def __init__(self, config, pad_id, bos_id, **kwargs):
    super().__init__()
    self.config = config
    self.pad_token_id = pad_id
    self.bos_token_id = bos_id
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.value_head = nn.Linear(config.n_embd, 1, bias=False)
def __init__(self, config): """ 初始化函数 Args: config: 配置参数 """ super().__init__(config) self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.init_weights()
def __init__(self, config):
    super(GPT2ClassHeadsModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.classifier = nn.Linear(config.n_embd, 2)
    # self.classifier = nn.Sequential(nn.Linear(config.n_embd, 768), nn.ReLU(), nn.Dropout(p=0.2),
    #                                 nn.Linear(768, 2))
    # self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.init_weights()
def __init__(self, config, num_classes):
    """Constructor.

    Args:
        config (GPT2Config): Configurations of GPT2 model.
        num_classes (int): The number of objects for classification.
    """
    super().__init__()
    self._bert = GPT2Model(config)
    self._linear = torch.nn.Linear(config.hidden_size, num_classes)
def __init__(self, config):
    super(GPT2ForSequenceRanking, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    config.summary_type = 'mean'
    self.good_head = SequenceSummary(config)
    self.size = config.n_embd
    self.init_weights()
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
    model = GPT2Model(config=config)
    model.to(torch_device)
    model.eval()

    result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
    result = model(input_ids, token_type_ids=token_type_ids)
    result = model(input_ids)

    self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
    self.parent.assertEqual(len(result.past_key_values), config.n_layer)
def __init__(self, config):
    super().__init__(config)
    config.num_labels = 1
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.as2_head = AS2HeadModel(config)
    self.init_weights()
    self.loss_fct = CrossEntropyLoss()
    # self.loss_fct_as2 = CrossEntropyLoss(weight=torch.tensor(config.class_weights))
    self.loss_fct_as2 = MSELoss()
def __init__(self, config, quantization=None):
    super().__init__(config)
    self.num_labels = config.num_labels
    self.transformer = GPT2Model(config, quantization=quantization)
    self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
    self.init_weights()

    # Model parallel
    self.model_parallel = False
    self.device_map = None
def __init__(self, config, MAX_LEN, CAN_NUM, num_of_rerank):
    super().__init__(config)
    self.MAX_LEN = MAX_LEN
    self.CAN_NUM = CAN_NUM
    self.num_of_rerank = num_of_rerank
    self.VOCAB_SIZE = config.vocab_size

    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.init_weights()
def __init__(self, config):
    super().__init__()
    self.config = config
    self.gpt = GPT2Model(config)
    self.policy_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.value_head = nn.Linear(config.n_embd, 1)
    self.n_params = sum(
        dict((p.data_ptr(), p.numel()) for p in self.parameters()).values())
def __init__(self, config):
    super(GPT2LMHeadModel, self).__init__(config)
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(
        config.n_embd, config.vocab_size, bias=False)  # GPT2LMHead(self.transformer.wte.weight, config)
    self.position_num_labels = 2
    self.lambda_position = 0.1
    self.position_classifier = GPT2ClassificationHead(
        num_labels=self.position_num_labels)  # GPT2LMHead(self.transformer.wte.weight, config)
    self.init_weights()
def create_and_check_gpt2_model_past(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
    model = GPT2Model(config=config)
    model.to(torch_device)
    model.eval()

    # first forward pass
    outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)
    outputs_use_cache_conf = model(input_ids, token_type_ids=token_type_ids)
    outputs_no_past = model(input_ids, token_type_ids=token_type_ids, use_cache=False)

    self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
    self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)

    output, past = outputs

    # create hypothetical next token and extend to next_input_ids
    next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
    next_token_types = ids_tensor([self.batch_size, 1], self.type_vocab_size)

    # append to next input_ids and token_type_ids
    next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
    next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)

    output_from_no_past, _ = model(next_input_ids, token_type_ids=next_token_type_ids)
    output_from_past, _ = model(next_tokens, token_type_ids=next_token_types, past=past)

    # select random slice
    random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
    output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
    output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

    # test that outputs are equal for slice
    self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
def dummy_gpt2():
    set_seed(RANDOM_SEED)
    config = {
        "vocab_size": 9906,
        "n_positions": 128,
        "n_ctx": 128,
        "n_embd": 512,
        "n_layer": 6,
        "n_head": 8,
    }
    config = GPT2Config(**config)
    model = GPT2Model(config)
    return model
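# A minimal sketch of exercising dummy_gpt2() above; the batch size and sequence length
# are arbitrary assumptions, and token ids must stay below the configured vocab_size (9906).
import torch

model = dummy_gpt2()
model.eval()
input_ids = torch.randint(0, 9906, (2, 16))  # batch of 2 sequences, 16 tokens each
with torch.no_grad():
    outputs = model(input_ids)
print(outputs.last_hidden_state.shape)  # expected: torch.Size([2, 16, 512])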
def init_data(self, use_cuda) -> None:
    torch.set_grad_enabled(False)
    torch.set_num_threads(4)
    turbo_transformers.set_num_threads(4)

    self.test_device = torch.device('cuda:0') if use_cuda else \
        torch.device('cpu:0')

    self.cfg = GPT2Config()
    self.torch_model = GPT2Model(self.cfg)
    self.torch_model.eval()
    if torch.cuda.is_available():
        self.torch_model.to(self.test_device)

    self.turbo_model = turbo_transformers.GPT2Model.from_torch(
        self.torch_model, self.test_device)
def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
    model = GPT2Model(config=config)
    model.eval()

    model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
    model(input_ids, token_type_ids=token_type_ids)
    sequence_output, presents = model(input_ids)

    result = {
        "sequence_output": sequence_output,
        "presents": presents,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
    self.parent.assertEqual(len(result["presents"]), config.n_layer)
def __init__(self, config):
    super().__init__(config)
    self.transformer = GPT2Model(config)
    self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
    self.init_weights()

    self.adapter_blocks = nn.ModuleList(
        [MixAdapter(config) for _ in range(config.n_layer)])

    self.trs_head = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(d_model=config.n_embd, nhead=2), num_layers=1)
    self.task_classification_head = nn.Sequential(
        nn.Linear(config.n_embd, config.n_embd),
        nn.ReLU(),
        nn.Linear(config.n_embd, config.n_embd),
        nn.ReLU(),
        nn.Linear(config.n_embd, 13),
    )