def test_create_attention_mask(self):
    config = GPTNeoConfig.from_pretrained("valhalla/gpt-neo-random-tiny")
    window_size = config.window_size
    batch_size, seq_length = 8, 1024
    block_length, num_blocks = GPTNeoAttentionMixin._get_block_length_and_num_blocks(seq_length, window_size)

    causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
        batch_size, seq_length, config.window_size, torch_device
    )
    # check shapes
    expected_shape = [batch_size, num_blocks, 1, block_length, window_size + block_length]
    self.assertListEqual(list(causal_mask.shape), expected_shape)
    # first window_size tokens in the first block are always padded
    # and should not be attended
    self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
    # each window can attend at most window_size tokens
    self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))

    # check that a user-provided attention_mask is handled correctly
    attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long, device=torch_device)
    attention_mask[:, -3:] = 0  # don't attend the last 3 tokens
    causal_mask = GPTNeoAttentionMixin.create_local_attention_mask(
        batch_size, seq_length, config.window_size, torch_device, attention_mask
    )
    # the last 3 tokens fall in the last block and should have 0s in causal_mask
    self.assertTrue(torch.all(causal_mask[:, -1, :, :, -3:] == 0))
    # the shape and padding invariants above still hold
    self.assertListEqual(list(causal_mask.shape), expected_shape)
    self.assertTrue(torch.all(causal_mask[:, 0, :, :, :window_size] == 0))
    self.assertTrue(torch.all(torch.sum(causal_mask, dim=4) <= config.window_size))
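# A minimal standalone sketch of the block-geometry helper the test calls
# (GPTNeoAttentionMixin._get_block_length_and_num_blocks). Assumption: it
# returns the largest block_length <= window_size that evenly divides
# seq_length, matching the GPT-Neo local-attention code this test targets;
# the function name below is hypothetical, not part of the library API.
def sketch_get_block_length_and_num_blocks(seq_length, window_size):
    block_length = window_size
    while seq_length % block_length != 0:
        block_length -= 1
    num_blocks = seq_length // block_length
    return block_length, num_blocks

# e.g. seq_length=1024, window_size=256 -> block_length=256, num_blocks=4,
# so expected_shape above works out to [batch_size, 4, 1, 256, 512].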
def get_large_model_config(self):
    return GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-125M")
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, GPTNeoConfig, AdamW
from torch.utils.data import IterableDataset, DataLoader
from lm_dataformat import *
import torch
import torch.nn.functional as F
from torch.nn.functional import normalize, cross_entropy
from torch.nn import DataParallel
from auto_tqdm import tqdm
from get_args import get_args
import deepspeed

args = get_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# create model, set neo_hidden
conf = GPTNeoConfig.from_pretrained("EleutherAI/gpt-neo-1.3B")
conf.gradient_checkpointing = True
model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", config=conf)
model.train()  # put the model in training mode (propagates to all submodules)
tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
neo_hidden = model.config.hidden_size

# resize token embeddings: two extra tokens
model.resize_token_embeddings(len(tokenizer) + 2)

# set up DeepSpeed
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args, model=model, model_parameters=model.parameters()
)
model_engine.to(model_engine.local_rank)

# initialize a random projection matrix from the GPT-Neo hidden size
# down to the CLIP embedding size
clip_hidden = 512
projection = torch.nn.Linear(neo_hidden, clip_hidden,