Example #1
    def __init__(self, config):
        print("************ THIS MODEL COMES FROM CS224N PROJECT ************")
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()
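As a quick orientation (not from the CS224N project itself), the pattern above recurs throughout these examples: GPT2Model produces per-token hidden states and a separate nn.Linear head projects them to vocabulary logits. A minimal standalone sketch, assuming a recent Hugging Face transformers API and a deliberately tiny config:

import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2Model

config = GPT2Config(n_layer=2, n_head=2, n_embd=64, vocab_size=1000)  # tiny config for illustration
transformer = GPT2Model(config)
lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

input_ids = torch.randint(0, config.vocab_size, (1, 8))
hidden_states = transformer(input_ids).last_hidden_state  # [1, 8, n_embd]
logits = lm_head(hidden_states)                           # [1, 8, vocab_size]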
Example #2
    def __init__(
            self,
            config,
            class_labels,
            pretrained_model_path,
            dropout=0.1,
            freeze_pretrained_part=True,
            reinitialize=False,
            n_layers=6,
    ):
        super().__init__(config, class_labels)

        if reinitialize:
            logger.info('resetting model weights')
            config = GPT2Config.from_json_file(pretrained_model_path + '/config.json')
            config = config.to_dict()
            config['n_layer'] = n_layers
            config = GPT2Config.from_dict(config)
            self.gpt2 = GPT2Model(config)
        else:
            self.gpt2 = GPT2Model.from_pretrained(pretrained_model_path)

        self.dropout = torch.nn.Dropout(dropout)
        self.fc = torch.nn.Linear(self.gpt2.config.n_embd, self.output_dim)
        if freeze_pretrained_part:
            for param in self.gpt2.parameters():
                param.requires_grad = False
Example #3
    def create_and_check_gpt2_weight_initialization(self, config, *args):
        model = GPT2Model(config)
        model_std = model.config.initializer_range / math.sqrt(2 * model.config.n_layer)
        for key in model.state_dict().keys():
            if "c_proj" in key and "weight" in key:
                self.parent.assertLessEqual(abs(torch.std(model.state_dict()[key]) - model_std), 0.001)
                self.parent.assertLessEqual(abs(torch.mean(model.state_dict()[key]) - 0.0), 0.01)
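For a sense of the numbers involved (not part of the original test): with the transformers default initializer_range of 0.02 and n_layer = 12, the expected standard deviation of the c_proj weights is 0.02 / sqrt(2 * 12) ≈ 0.0041, and the assertion above allows the measured value to deviate from it by at most 0.001.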
Example #4
    def __init__(self, config, num_output_labels=4):
        config.output_attentions = True
        super(GPT2ClassificationModel, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.CNN_Max = nn.Sequential(
            # Defining a 2D convolution layer
            nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # Defining another 2D convolution layer
            nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.CNN_Avg = nn.Sequential(
            # Defining a 2D convolution layer
            nn.Conv2d(1, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=2, stride=2),
            # Defining another 2D convolution layer
            nn.Conv2d(4, 4, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(4),
            nn.ReLU(inplace=True),
            nn.AvgPool2d(kernel_size=2, stride=2),
        )
        self.ff_layers = nn.Sequential(nn.Linear(256, 10),
                                       nn.Linear(10, num_output_labels))
        self.final_softmax = nn.Softmax(dim=1)
        self.init_weights()
Example #5
    def create_and_check_gpt2_model_past_large_inputs(
        self, config, input_ids, input_mask, head_mask, token_type_ids, *args
    ):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        # first forward pass
        outputs = model(input_ids, token_type_ids=token_type_ids, use_cache=True)

        output, past = outputs.to_tuple()

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
        next_token_types = ids_tensor([self.batch_size, 3], self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)

        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
        output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
        self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
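Several of the test examples here call an ids_tensor helper from the transformers test suite. For running a snippet standalone, a minimal stand-in could look like the following (an assumption-based sketch, not the library's actual test helper):

import torch

def ids_tensor(shape, vocab_size):
    # Random batch of token ids in [0, vocab_size); shape may be a tuple or a list.
    return torch.randint(low=0, high=vocab_size, size=tuple(shape), dtype=torch.long)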
Example #6
    def __init__(self, config, **kwargs):
        super().__init__(config)
        self.args = kwargs['args']
        self.config = config

        # core gpt2 and lm head
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        # mention detection output index
        self.mc_cl2idx = {'<N>': 0, '<M>': 1, '</M>': 2}
        self.mc_idx2cl = {v: k for k, v in self.mc_cl2idx.items()}
        self.cl_head = nn.Linear(config.n_embd, 3)  # head for 3 classes in mention detection

        # attention parameters in coref2qr mechanism
        if self.args.coref_attn_share_between_layer:
            self.c_attn = Conv1D(3 * config.n_embd, config.n_embd)
        else:
            self.c_attn = nn.ModuleList([
                Conv1D(3 * config.n_embd, config.n_embd)
                for _ in range(self.config.n_layer + 1)
            ])

        # binary classification for rewriting or not
        if self.args.use_binary_cls:
            self.binary_cls1 = nn.Linear(config.n_embd, config.n_embd)
            self.binary_cls2 = nn.Linear(
                config.n_embd, 2,
                bias=False)  # output layer for rewrite or not

        self.init_weights()
Example #7
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformers = GPT2Model(config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()
Example #8
    def __init__(self, config: Munch):
        r""" Init a new GPT2 synapse module.

            Args:
                config (:obj:`munch.Munch`, `required`): 
                    munched config class.
        """
        super(GPT2LMSynapse, self).__init__(config=config)
        if config is None:
            config = GPT2LMSynapse.build_config()

        # Build hugging face config.
        huggingface_config = GPT2Config(
            vocab_size=bittensor.__vocab_size__,
            n_embd=bittensor.__network_dim__,
            n_layer=config.synapse.n_layer,
            n_head=config.synapse.n_head,
            n_inner=config.synapse.n_inner,
            activation_function=config.synapse.activation_function,
            resid_pdrop=config.synapse.resid_pdrop,
            embd_pdrop=config.synapse.embd_pdrop,
            attn_pdrop=config.synapse.attn_pdrop,
            layer_norm_epsilon=config.synapse.layer_norm_epsilon,
            initializer_range=config.synapse.initializer_range,
            summary_type=config.synapse.summary_type,
            summary_use_proj=config.synapse.summary_use_proj,
            summary_activation=config.synapse.summary_activation,
            summary_proj_to_labels=config.synapse.summary_proj_to_labels,
            summary_first_dropout=config.synapse.summary_first_dropout,
        )

        # encoder_layer: encodes tokenized sequences to network dim.
        # [batch_size, sequence_len] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.transformer = GPT2Model(huggingface_config)

        # pooler_layer: pools the hidden units for use by the pkm dendrite rpc query.
        # [batch_size, bittensor.__network_dim__, sequence_len] -> [batch_size, bittensor.__network_dim__]
        self.pooler = GPT2Pooler(huggingface_config)

        # router: (PKM layer) queries network using pooled embeddings as context.
        # [batch_size, bittensor.__network_dim__] -> topk * [batch_size, bittensor.__network_dim__]
        self.router = PKMRouter(config, query_dim=bittensor.__network_dim__)

        # hidden_layer: transforms context and encoding to network_dim hidden units.
        # [batch_size, sequence_dim, 2 * bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__network_dim__]
        self.hidden_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__network_dim__)

        # target_layer: maps from hidden layer to vocab dimension for each token. Used by MLM loss.
        # [batch_size, sequence_len, bittensor.__network_dim__] -> [batch_size, sequence_len, bittensor.__vocab_size__]
        self.target_layer = nn.Linear(bittensor.__network_dim__,
                                      bittensor.__vocab_size__,
                                      bias=False)

        # Loss function: MLM cross-entropy loss.
        # predicted: [batch_size, sequence_len, 1], targets: [batch_size, sequence_len, 1] -> [1]
        self.loss_fct = nn.CrossEntropyLoss()

        self.to(self.device)
Example #9
    def create_and_check_gpt2_model_attention_mask_past(
            self, config, input_ids, input_mask, head_mask, token_type_ids,
            *args):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        # create attention mask
        attn_mask = torch.ones(input_ids.shape,
                               dtype=torch.long,
                               device=torch_device)
        half_seq_length = self.seq_length // 2
        attn_mask[:, half_seq_length:] = 0

        # first forward pass
        output, past = model(input_ids, attention_mask=attn_mask).to_tuple()

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)

        # change a random masked slice from input_ids
        random_seq_idx_to_change = ids_tensor(
            (1, ), half_seq_length).item() + 1
        random_other_next_tokens = ids_tensor((self.batch_size, 1),
                                              config.vocab_size).squeeze(-1)
        input_ids[:, -random_seq_idx_to_change] = random_other_next_tokens

        # append to next input_ids and attn_mask
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        attn_mask = torch.cat(
            [
                attn_mask,
                torch.ones((attn_mask.shape[0], 1),
                           dtype=torch.long,
                           device=torch_device)
            ],
            dim=1,
        )

        # get two different outputs
        output_from_no_past = model(
            next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
        output_from_past = model(next_tokens,
                                 past_key_values=past,
                                 attention_mask=attn_mask)["last_hidden_state"]

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(
            torch.allclose(output_from_past_slice,
                           output_from_no_past_slice,
                           atol=1e-3))
Example #10
    def __init__(self, config):
        super().__init__(config)
        # config.num_labels = 1
        config.num_labels = le.classes_.shape[0]
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.multiple_choice_head = SequenceSummary(config)
        self.init_weights()
Example #11
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.v_head = ValueHead(config)

        self.init_weights()
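ValueHead here is project-specific (this example follows the RL-fine-tuning pattern in which a scalar value is predicted per token alongside the language-model logits). A minimal stand-in sketch of such a head, assuming only the config's n_embd, might be:

import torch.nn as nn

class ValueHead(nn.Module):
    # Hypothetical stand-in, not the project's actual class: one scalar value per token position.
    def __init__(self, config):
        super().__init__()
        self.summary = nn.Linear(config.n_embd, 1)

    def forward(self, hidden_states):
        return self.summary(hidden_states)  # [batch, seq_len, 1]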
Example #12
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.cls_head = SequenceSummary(config)

        self.init_weights()
Example #13
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, full,
                                       gpt2_config_file,
                                       pytorch_dump_folder_path):
    #putting requirements here so users can see usage info before it errors out on missing modules
    from io import open
    from shutil import copyfile
    import logging
    logging.basicConfig(level=logging.INFO)
    from pathlib import Path
    import torch
    #WEIGHTS_NAME = "pytorch_model.bin"
    #CONFIG_NAME = "config.json"
    from transformers import (
        CONFIG_NAME,
        WEIGHTS_NAME,
        GPT2Config,
        GPT2Model,
        load_tf_weights_in_gpt2,
    )
    gpt2_checkpoint_path = Path(gpt2_checkpoint_path)
    print(gpt2_checkpoint_path.name)

    if pytorch_dump_folder_path == '':
        prefix = '32BIT-' if full else '16BIT-'
        pytorch_dump_folder_path = 'pytorch-' + prefix + gpt2_checkpoint_path.name
    pytorch_dump_folder_path = Path(pytorch_dump_folder_path)

    pytorch_dump_folder_path.mkdir(exist_ok=True)

    # Construct model
    if gpt2_config_file == "":
        # This doesn't seem to work; we will use the hparams.json file that seems to be included in the checkpoint directory instead.
        # config = GPT2Config()
        gpt2_config_file = gpt2_checkpoint_path / 'hparams.json'

    config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)
    if not full:
        model.half()

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path / WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path / CONFIG_NAME
    print("Save PyTorch model to {}".format(str(pytorch_weights_dump_path)))

    torch.save(model.state_dict(), pytorch_weights_dump_path)

    print("Save configuration file to: " + str(pytorch_config_dump_path))
    with pytorch_config_dump_path.open("w", encoding="utf-8") as f:
        f.write(config.to_json_string())

    copyfile(gpt2_checkpoint_path / 'vocab.bpe',
             pytorch_dump_folder_path / 'merges.txt')
    copyfile(gpt2_checkpoint_path / 'encoder.json',
             pytorch_dump_folder_path / 'vocab.json')
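A hypothetical invocation of the converter above (the checkpoint path is a placeholder, not taken from the source); with full=False the weights are cast to fp16 before saving, and empty strings trigger the fallbacks visible in the function body:

convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path='models/117M',  # directory containing hparams.json, vocab.bpe and encoder.json
    full=False,                          # False -> model.half() before saving
    gpt2_config_file='',                 # empty -> fall back to hparams.json inside the checkpoint directory
    pytorch_dump_folder_path='',         # empty -> auto-named 'pytorch-16BIT-<checkpoint name>'
)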
Example #14
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.debias_head = nn.functional.linear
        self.multiple_choice_head = SequenceSummary(config)

        self.init_weights()
Example #15
    def __init__(self, config, pad_id, bos_id, **kwargs):
        super().__init__()
        self.config = config
        self.pad_token_id = pad_id
        self.bos_token_id = bos_id

        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.value_head = nn.Linear(config.n_embd, 1, bias=False)
Example #16
    def __init__(self, config):
        """
        Initialization function.
        Args:
            config: configuration parameters
        """
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.init_weights()
Example #17
    def __init__(self, config):
        super(GPT2ClassHeadsModel, self).__init__(config)
        self.transformer = GPT2Model(config)

        self.classifier = nn.Linear(config.n_embd, 2)
        # self.classifier = nn.Sequential(nn.Linear(config.n_embd, 768), nn.ReLU(), nn.Dropout(p=0.2),
        #                                 nn.Linear(768, 2))
        # self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()
Example #18
    def __init__(self, config, num_classes):
        """Constructor.

        Args:
            config (GPT2Config): Configurations of GPT2 model.
            num_classes (int): The number of objects for classification.
        """
        super().__init__()
        self._bert = GPT2Model(config)
        self._linear = torch.nn.Linear(config.hidden_size, num_classes)
Example #19
    def __init__(self, config):
        super(GPT2ForSequenceRanking, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        config.summary_type = 'mean'
        self.good_head = SequenceSummary(config)

        self.size = config.n_embd

        self.init_weights()
Example #20
    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        result = model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
        result = model(input_ids, token_type_ids=token_type_ids)
        result = model(input_ids)

        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(len(result.past_key_values), config.n_layer)
Example #21
    def __init__(self, config):
        super().__init__(config)
        config.num_labels = 1
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.as2_head = AS2HeadModel(config)

        self.init_weights()
        self.loss_fct = CrossEntropyLoss()
        # self.loss_fct_as2 = CrossEntropyLoss(weight=torch.tensor(config.class_weights))
        self.loss_fct_as2 = MSELoss()
Example #22
    def __init__(self, config, quantization=None):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.transformer = GPT2Model(config, quantization=quantization)
        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)

        self.init_weights()

        # Model parallel
        self.model_parallel = False
        self.device_map = None
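For context, a hedged sketch of how such a score head is commonly applied in the forward pass (this fragment is an assumption, not part of the original example; it presumes an attention_mask is supplied, sequences are right-padded, and torch is imported):

    def forward(self, input_ids, attention_mask=None):
        hidden_states = self.transformer(input_ids, attention_mask=attention_mask)[0]
        logits = self.score(hidden_states)                      # [batch, seq_len, num_labels]
        last_token_idx = attention_mask.long().sum(dim=-1) - 1  # index of the last real token per sequence
        batch_idx = torch.arange(logits.size(0), device=logits.device)
        return logits[batch_idx, last_token_idx]                # [batch, num_labels]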
Example #23
    def __init__(self, config, MAX_LEN, CAN_NUM, num_of_rerank):
        super().__init__(config)
        self.MAX_LEN = MAX_LEN
        self.CAN_NUM = CAN_NUM
        self.num_of_rerank = num_of_rerank
        self.VOCAB_SIZE = config.vocab_size

        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()
Example #24
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.gpt = GPT2Model(config)
        self.policy_head = nn.Linear(config.n_embd,
                                     config.vocab_size,
                                     bias=False)
        self.value_head = nn.Linear(config.n_embd, 1)
        self.n_params = sum(
            dict(
                (p.data_ptr(), p.numel()) for p in self.parameters()).values())
Example #25
    def __init__(self, config):
        super(GPT2LMHeadModel, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(
            config.n_embd, config.vocab_size,
            bias=False)  # GPT2LMHead(self.transformer.wte.weight, config)
        self.position_num_labels = 2
        self.lambda_position = 0.1
        self.position_classifier = GPT2ClassificationHead(
            num_labels=self.position_num_labels
        )  # GPT2LMHead(self.transformer.wte.weight, config)
        self.init_weights()
Example #26
    def create_and_check_gpt2_model_past(self, config, input_ids, input_mask,
                                         head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.to(torch_device)
        model.eval()

        # first forward pass
        outputs = model(input_ids,
                        token_type_ids=token_type_ids,
                        use_cache=True)
        outputs_use_cache_conf = model(input_ids,
                                       token_type_ids=token_type_ids)
        outputs_no_past = model(input_ids,
                                token_type_ids=token_type_ids,
                                use_cache=False)

        self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
        self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)

        output, past = outputs

        # create hypothetical next token and extend to next_input_ids
        next_tokens = ids_tensor((self.batch_size, 1), config.vocab_size)
        next_token_types = ids_tensor([self.batch_size, 1],
                                      self.type_vocab_size)

        # append to next input_ids and token_type_ids
        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
        next_token_type_ids = torch.cat([token_type_ids, next_token_types],
                                        dim=-1)

        output_from_no_past, _ = model(next_input_ids,
                                       token_type_ids=next_token_type_ids)
        output_from_past, _ = model(next_tokens,
                                    token_type_ids=next_token_types,
                                    past=past)

        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
        output_from_no_past_slice = output_from_no_past[:, -1, random_slice_idx].detach()
        output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()

        # test that outputs are equal for slice
        self.parent.assertTrue(
            torch.allclose(output_from_past_slice,
                           output_from_no_past_slice,
                           atol=1e-3))
Example #27
def dummy_gpt2():
    set_seed(RANDOM_SEED)

    config = {
        "vocab_size": 9906,
        "n_positions": 128,
        "n_ctx": 128,
        "n_embd": 512,
        "n_layer": 6,
        "n_head": 8,
    }
    config = GPT2Config(**config)
    model = GPT2Model(config)
    return model
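A hedged usage sketch for the factory above (set_seed and RANDOM_SEED are assumed to come from the surrounding project, torch to be imported, and .last_hidden_state assumes a recent transformers version):

model = dummy_gpt2()
model.eval()
input_ids = torch.randint(0, 9906, (2, 16))        # batch of 2, sequence length 16
with torch.no_grad():
    hidden = model(input_ids).last_hidden_state    # [2, 16, 512]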
Example #28
    def init_data(self, use_cuda) -> None:
        torch.set_grad_enabled(False)
        torch.set_num_threads(4)
        turbo_transformers.set_num_threads(4)
        self.test_device = torch.device('cuda:0') if use_cuda else \
            torch.device('cpu:0')

        self.cfg = GPT2Config()
        self.torch_model = GPT2Model(self.cfg)
        self.torch_model.eval()

        if torch.cuda.is_available():
            self.torch_model.to(self.test_device)

        self.turbo_model = turbo_transformers.GPT2Model.from_torch(
            self.torch_model, self.test_device)
Example #29
    def create_and_check_gpt2_model(self, config, input_ids, input_mask, head_mask, token_type_ids, *args):
        model = GPT2Model(config=config)
        model.eval()

        model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
        model(input_ids, token_type_ids=token_type_ids)
        sequence_output, presents = model(input_ids)

        result = {
            "sequence_output": sequence_output,
            "presents": presents,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()),
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertEqual(len(result["presents"]), config.n_layer)
Example #30
    def __init__(self, config):
        super().__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
        self.init_weights()
        self.adapter_blocks = nn.ModuleList(
            [MixAdapter(config) for _ in range(config.n_layer)])
        self.trs_head = nn.TransformerEncoder(nn.TransformerEncoderLayer(
            d_model=config.n_embd, nhead=2),
                                              num_layers=1)

        self.task_classification_head = nn.Sequential(
            nn.Linear(config.n_embd, config.n_embd),
            nn.ReLU(),
            nn.Linear(config.n_embd, config.n_embd),
            nn.ReLU(),
            nn.Linear(config.n_embd, 13),
        )