def prepare_config_and_inputs(self, gradient_checkpointing=False):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask(
                [self.batch_size, self.seq_length])

        config = GPTNeoConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            max_position_embeddings=self.max_position_embeddings,
            use_cache=False,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            window_size=self.window_size,
            attention_types=self.attention_types,
            gradient_checkpointing=gradient_checkpointing,
        )

        return (config, input_ids, input_mask)
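A minimal usage sketch of how such a prepared tuple is typically consumed; `tester` and its attributes (batch_size, seq_length, hidden_size) are assumptions standing in for an instance of the tester class above, not part of the original snippet.

# Hedged sketch: build the model from the prepared config, run a forward
# pass, and check the hidden-state shape. `tester` is a hypothetical
# instance of the tester class defined above.
import torch
from transformers import GPTNeoModel

config, input_ids, input_mask = tester.prepare_config_and_inputs()
model = GPTNeoModel(config)
model.eval()
with torch.no_grad():
    outputs = model(input_ids, attention_mask=input_mask)
assert outputs.last_hidden_state.shape == (
    tester.batch_size, tester.seq_length, tester.hidden_size)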
Example n. 2
    def prepare_config_and_inputs(self, gradient_checkpointing=False):
        input_ids = ids_tensor([self.batch_size, self.seq_length],
                               self.vocab_size)

        input_mask = None
        if self.use_input_mask:
            input_mask = random_attention_mask(
                [self.batch_size, self.seq_length])

        token_type_ids = None
        if self.use_token_type_ids:
            token_type_ids = ids_tensor([self.batch_size, self.seq_length],
                                        self.type_vocab_size)

        mc_token_ids = None
        if self.use_mc_token_ids:
            mc_token_ids = ids_tensor([self.batch_size, self.num_choices],
                                      self.seq_length)

        sequence_labels = None
        token_labels = None
        choice_labels = None
        if self.use_labels:
            sequence_labels = ids_tensor([self.batch_size],
                                         self.type_sequence_label_size)
            token_labels = ids_tensor([self.batch_size, self.seq_length],
                                      self.num_labels)
            choice_labels = ids_tensor([self.batch_size], self.num_choices)

        config = GPTNeoConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            max_position_embeddings=self.max_position_embeddings,
            use_cache=not gradient_checkpointing,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
            window_size=self.window_size,
            attention_types=self.attention_types,
        )

        head_mask = ids_tensor(
            [self.num_hidden_layers, self.num_attention_heads], 2)

        return (
            config,
            input_ids,
            input_mask,
            head_mask,
            token_type_ids,
            mc_token_ids,
            sequence_labels,
            token_labels,
            choice_labels,
        )
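The head_mask built above is a 0/1 tensor of shape (num_layers, num_heads); a 0 entry silences the corresponding attention head in that layer. Below is a short sketch of how the full tuple might be fed to a causal LM, again assuming a hypothetical `tester` instance.

# Hedged sketch: feed the prepared inputs to GPTNeoForCausalLM; the 0/1
# head_mask disables individual attention heads per layer, and token_labels
# double as causal-LM labels here. `tester` is hypothetical.
from transformers import GPTNeoForCausalLM

(config, input_ids, input_mask, head_mask, token_type_ids, mc_token_ids,
 sequence_labels, token_labels, choice_labels) = tester.prepare_config_and_inputs()

model = GPTNeoForCausalLM(config)
outputs = model(
    input_ids,
    attention_mask=input_mask,
    token_type_ids=token_type_ids,
    head_mask=head_mask,
    labels=token_labels,
)
loss, logits = outputs.loss, outputs.logits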
Example n. 3
    def get_config(self):
        return GPTNeoConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            max_position_embeddings=self.max_position_embeddings,
            use_cache=True,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            window_size=self.window_size,
            attention_types=self.attention_types,
        )
Example n. 4
    def get_config(self, gradient_checkpointing=False):
        return GPTNeoConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.hidden_size,
            num_layers=self.num_hidden_layers,
            num_heads=self.num_attention_heads,
            max_position_embeddings=self.max_position_embeddings,
            use_cache=not gradient_checkpointing,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            gradient_checkpointing=gradient_checkpointing,
            window_size=self.window_size,
            attention_types=self.attention_types,
        )
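The point of tying use_cache to gradient_checkpointing in these helpers is that checkpointing recomputes activations during the backward pass, which does not mix with returning past key/value caches. A small sketch, assuming a hypothetical `tester` instance of the class above:

# Hedged sketch: checkpointing and caching are mutually exclusive in these
# test helpers, so enabling one disables the other.
config = tester.get_config(gradient_checkpointing=True)
assert config.gradient_checkpointing is True
assert config.use_cache is False

config = tester.get_config()  # default: no checkpointing, caching enabled
assert config.use_cache is True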
Example n. 5
    def test_create_attention_mask(self):
        config = GPTNeoConfig.from_pretrained("valhalla/gpt-neo-random-tiny")
        window_size = config.window_size
        batch_size, seq_length = 8, 1
        block_length, num_blocks = GPTNeoAttentionMixin._get_block_length_and_num_blocks(
            seq_length, window_size)

        # causal_mask = layer._create_attention_mask(batch_size, seq_length, num_blocks, block_length, torch_device)
        causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
            batch_size, seq_length, config.window_size, torch_device)
        # check shapes
        expected_shape = [
            batch_size, num_blocks, 1, block_length, window_size + block_length
        ]
        self.assertListEqual(list(causal_mask.shape), expected_shape)
        # first window_size tokens in the first block are always padded
        # and should not be attended
        self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
        # each window can attend at most window_size tokens
        self.assertTrue(
            torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))

        # check if user provided attention_mask is handled correctly
        attention_mask = torch.ones(batch_size,
                                    seq_length,
                                    dtype=torch.long,
                                    device=torch_device)
        attention_mask[:, -3:] = 0  # don't attend last 3 tokens

        # causal_mask = layer._create_attention_mask(
        # batch_size, seq_length, num_blocks, block_length, torch_device, attention_mask
        # )
        causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
            batch_size, seq_length, config.window_size, torch_device,
            attention_mask)
        # last 3 tokens will be in the last block and should have 0s in causal_mask
        self.assertTrue(torch.all(causal_mask[:, -1, :, :, -3:] == 0))
        # check shapes
        expected_shape = [
            batch_size, num_blocks, 1, block_length, window_size + block_length
        ]
        self.assertListEqual(list(causal_mask.shape), expected_shape)
        # first window_size tokens in the first block are always padded
        # and should not be attended
        self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
        # each window can attend at most window_size tokens
        self.assertTrue(
            torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))
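For intuition about the block_length / num_blocks values the test relies on, here is a rough reimplementation of the blocking rule, under the assumption that block_length is the largest value not exceeding window_size that divides seq_length evenly; this is a sketch for illustration, not the library's code.

# Hedged sketch of the assumed blocking rule behind
# _get_block_length_and_num_blocks.
def get_block_length_and_num_blocks(seq_length: int, window_size: int):
    block_length = min(window_size, seq_length)
    while seq_length % block_length != 0:
        block_length -= 1
    return block_length, seq_length // block_length

# With seq_length=1 (as in the test above) everything collapses to a single
# block of length 1, so the mask shape is [batch_size, 1, 1, 1, window_size + 1].
print(get_block_length_and_num_blocks(1, 256))    # -> (1, 1)
print(get_block_length_and_num_blocks(512, 256))  # -> (256, 2)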
Example n. 6
def GPTNeoConfigCPU(vocab_size: int = 1000,
                    bos_token_id: int = 0,
                    eos_token_id: int = 0,
                    **kwargs):
    """
    Returns a GPT Neo config more suitable for training on a regular consumer CPU.
    """

    return GPTNeoConfig(
        vocab_size=vocab_size,
        max_position_embeddings=64,
        hidden_size=256,
        window_size=32,
        intermediate_size=256,
        attention_types=[[["global", "local"], 2]],
        num_layers=4,
        num_heads=4,
        bos_token_id=bos_token_id,
        eos_token_id=eos_token_id,
        **kwargs,
    )
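A quick usage sketch: instantiate a small GPT-Neo model from this CPU-sized config and count its parameters. The vocab_size value here is just an illustrative override.

# Hedged usage sketch for GPTNeoConfigCPU; vocab_size=5000 is an arbitrary
# example value.
from transformers import GPTNeoForCausalLM

config = GPTNeoConfigCPU(vocab_size=5000)
model = GPTNeoForCausalLM(config)
print(f"{sum(p.numel() for p in model.parameters()):,} parameters")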
Example n. 7
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    with open(config_file, "r") as f:
        config_json = json.load(f)
    config = GPTNeoConfig(
        hidden_size=config_json["n_embd"],
        num_layers=config_json["n_layer"],
        num_heads=config_json["n_head"],
        attention_types=config_json["attention_types"],
        max_position_embeddings=config_json["n_positions"],
        resid_dropout=config_json["res_dropout"],
        embed_dropout=config_json["embed_dropout"],
        attention_dropout=config_json["attn_dropout"],
    )
    print(f"Building PyTorch model from configuration: {config}")
    model = GPTNeoForCausalLM(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_gpt_neo(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print(f"Save PyTorch model to {pytorch_dump_path}")
    model.save_pretrained(pytorch_dump_path)
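A thin command-line wrapper around this function might look like the sketch below; the flag names are assumptions for illustration, not the script's documented interface.

# Hedged sketch of a CLI entry point; the argument names are assumed.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--tf_checkpoint_path", required=True,
                        help="Path to the TensorFlow checkpoint to convert.")
    parser.add_argument("--config_file", required=True,
                        help="JSON file with the model hyperparameters.")
    parser.add_argument("--pytorch_dump_path", required=True,
                        help="Output path for the converted PyTorch model.")
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
                                     args.config_file,
                                     args.pytorch_dump_path)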
Example n. 8
    def get_large_model_config(self):
        return GPTNeoConfig.from_pretrained("gpt-neo-125M")
Example n. 9
from transformers import GPT2Tokenizer, GPTNeoConfig, GPTNeoForCausalLM, AdamW
from torch.utils.data import IterableDataset, DataLoader
from lm_dataformat import *
import torch
import torch.nn.functional as F
from torch.nn.functional import normalize, cross_entropy
from torch.nn import DataParallel
from auto_tqdm import tqdm
from get_args import get_args
import deepspeed

args = get_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#create model, set neo_hidden
conf = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-1.3B")
conf.gradient_checkpointing = True
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B",
                                          config=conf)
model.train()  # from_pretrained() leaves the model in eval mode; switch back for training
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
neo_hidden = model.config.hidden_size
#resize token embeddings. Two extra tokens
model.resize_token_embeddings(len(tokenizer) + 2)
#Set up deep speed
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, model_parameters=model.parameters())
model_engine.to(model_engine.local_rank)
#Initialize a random projection matrix
clip_hidden = 512
projection = torch.nn.Linear(neo_hidden, clip_hidden,