Example #1
    def test_create_attention_mask(self):
        config = GPTNeoConfig.from_pretrained("valhalla/gpt-neo-random-tiny")
        window_size = config.window_size
        batch_size, seq_length = 8, 1
        block_length, num_blocks = GPTNeoAttentionMixin._get_block_length_and_num_blocks(
            seq_length, window_size)

        # causal_mask = layer._create_attention_mask(batch_size, seq_length, num_blocks, block_length, torch_device)
        causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
            batch_size, seq_length, config.window_size, torch_device)
        # check shapes
        expected_shape = [
            batch_size, num_blocks, 1, block_length, window_size + block_length
        ]
        self.assertListEqual(list(causal_mask.shape), expected_shape)
        # first window_size tokens in the first block are always padded
        # and should not be attended
        self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
        # each window can attend at most window_size tokens
        self.assertTrue(
            torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))

        # check if user provided attention_mask is handled correctly
        attention_mask = torch.ones(batch_size,
                                    seq_length,
                                    dtype=torch.long,
                                    device=torch_device)
        attention_mask[:, -3:] = 0  # don't attend last 3 tokens

        # causal_mask = layer._create_attention_mask(
        # batch_size, seq_length, num_blocks, block_length, torch_device, attention_mask
        # )
        causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
            batch_size, seq_length, config.window_size, torch_device,
            attention_mask)
        # last 3 tokens will be in the last block and should have 0s in causal_mask
        self.assertTrue(torch.all(causal_mask[:, -1, :, :, -3:] == 0))
        # check shapes
        expected_shape = [
            batch_size, num_blocks, 1, block_length, window_size + block_length
        ]
        self.assertListEqual(list(causal_mask.shape), expected_shape)
        # first window_size tokens in the first block are always padded
        # and should not be attended
        self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
        # each window can attend at most window_size tokens
        self.assertTrue(
            torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))
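For reference, the expected shape asserted above follows from how GPT-Neo's local attention blocks the sequence: each of num_blocks blocks of block_length tokens attends to the window_size tokens preceding it plus its own block, hence the last dimension of window_size + block_length. Below is a minimal sketch of that arithmetic using assumed illustrative values (window_size=256, seq_length=1024), not the tiny test config used above.

# Illustrative shape arithmetic only; these values are assumptions,
# not taken from "valhalla/gpt-neo-random-tiny".
batch_size = 8
window_size = 256
seq_length = 1024

# block_length is chosen so that it evenly divides seq_length; since 1024 is a
# multiple of 256, block_length == window_size here.
block_length = window_size
num_blocks = seq_length // block_length

# Each block attends to the preceding window_size tokens plus its own block.
expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length]
print(expected_shape)  # [8, 4, 1, 256, 512]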
Example #2

    def get_large_model_config(self):
        return GPTNeoConfig.from_pretrained("gpt-neo-125M")
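As a usage note (not part of the example above), here is a minimal sketch of loading that configuration and reading a few standard GPTNeoConfig fields; the fully qualified Hub id EleutherAI/gpt-neo-125M is an assumption, while the example itself uses the short name.

from transformers import GPTNeoConfig

# Assumed full Hub id for the 125M checkpoint.
config = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-125M")
print(config.hidden_size, config.num_layers, config.num_heads, config.window_size)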
Example #3
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoConfig, AdamW
from torch.utils.data import IterableDataset, DataLoader
from lm_dataformat import *
import torch
import torch.nn.functional as F
from torch.nn.functional import normalize, cross_entropy
from torch.nn import DataParallel
from auto_tqdm import tqdm
from get_args import get_args
import deepspeed

args = get_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create model, set neo_hidden
conf = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-1.3B")
conf.gradient_checkpointing = True
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B",
                                          config=conf)
model.train()  # put the model in training mode so the flag propagates to all submodules
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
neo_hidden = model.config.hidden_size
# Resize token embeddings: two extra tokens
model.resize_token_embeddings(len(tokenizer) + 2)
# Set up DeepSpeed
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, model_parameters=model.parameters())
model_engine.to(model_engine.local_rank)
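# Note (assumption, not shown in this snippet): get_args is expected to supply the
# DeepSpeed launcher arguments, e.g. an argparse parser extended with
# deepspeed.add_config_arguments(parser), so that args carries a --deepspeed_config
# path; batch size, fp16 and ZeRO settings then live in that JSON file rather than here.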
# Initialize a random projection matrix
clip_hidden = 512
projection = torch.nn.Linear(neo_hidden, clip_hidden,