def train(num_epochs=5):
    """Benchmark forward/backward passes of the parallel ``Embedding`` layer.

    Initializes torch.distributed, builds random integer token indices in
    ``[0, vocab_size)`` plus per-row position indices on the current CUDA
    device, then runs ``num_epochs`` SGD steps on the mean of the embedding
    output. Each iteration is wrapped in ``profiler.start``/``profiler.stop``
    calls for the overall step, the forward pass and the backward pass.

    Batch size, sequence length, hidden size, vocabulary size, dropout and
    experiment name are read from the module-level ``args`` object.

    Args:
        num_epochs: Number of profiled iterations. Defaults to 5, the value
            that used to be hard-coded.
    """
    # Initialize torch.distributed
    init_distributed()
    # Only the Embedding layer is built and profiled here, so name it in the log.
    print_rank_0('AutoMP: training Embedding...')

    # Use fake train data
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    device = torch.cuda.current_device()
    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(device)
    # Every row holds the positions 0..sequence_length-1.
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(device)
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)
    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)
    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    nproc = torch.distributed.get_world_size()
    # The profiler keys do not change between iterations; build them once.
    overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
    forward_name = f'emb_forward_np-{nproc}_vs-{vocab_size}'
    backward_name = f'emb_backward_np-{nproc}_vs-{vocab_size}'

    for _ in range(num_epochs):
        profiler.start(overall_name)

        # Forward pass
        profiler.start(forward_name)
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        # Wait for queued CUDA work so the timer covers the whole phase.
        torch.cuda.synchronize()
        profiler.stop(forward_name)

        # Backward pass
        profiler.start(backward_name)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(backward_name)

        profiler.stop(overall_name)
def train(hidden_sizes, num_epochs=50):
    """Train a ``ParallelMLP`` classifier and report per-epoch loss and time.

    The training set is read from ``data/digitstrain.txt`` (comma-separated;
    the last column is the integer label, all other columns are features)
    and placed on the current CUDA device. Training is full-batch: a single
    mini-batch covers the whole data set, optimized with SGD (lr=0.01)
    against cross-entropy loss.

    Args:
        hidden_sizes: Forwarded unchanged to ``ParallelMLP`` as
            ``hidden_sizes``.
        num_epochs: Number of passes over the training data.

    Raises:
        AssertionError: If the data does not have 28 * 28 feature columns.
    """
    # Initialize torch.distributed
    init_distributed()
    print_rank_0('AutoMP: training MLP...')

    # Use MNIST data
    device = torch.cuda.current_device()
    train_data = np.genfromtxt('data/digitstrain.txt', delimiter=",")
    train_X = torch.tensor(train_data[:, :-1], dtype=torch.float, device=device)
    train_Y = torch.tensor(train_data[:, -1], dtype=torch.int64, device=device)
    print_rank_0(f'train_X shape: {train_X.size()}')
    print_rank_0(f'train_Y shape: {train_Y.size()}')

    num_features = train_X.size()[1]
    num_classes = 10
    assert num_features == 28 * 28

    mlp = ParallelMLP(num_features=num_features,
                      num_classes=num_classes,
                      hidden_sizes=hidden_sizes)
    print_rank_0('AutoMP: Successfully initialized ParallelMLP')

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01)

    num_train_samples = train_X.size()[0]
    # Full-batch training: one mini-batch spans the entire training set.
    batch_size = num_train_samples
    tot_time = 0
    for epoch in range(num_epochs):
        start_time = time.time()
        train_loss = 0.0
        num_batches = 0
        for sample_idx in range(0, num_train_samples, batch_size):
            mini_batch = train_X[sample_idx:sample_idx + batch_size, ...]
            labels = train_Y[sample_idx:sample_idx + batch_size]

            # Forward pass
            logits = mlp(mini_batch)
            # Note: torch.nn.CrossEntropyLoss does not need one hot encoding
            loss = criterion(logits, labels)
            # Accumulate a plain Python float: adding the tensor itself would
            # keep every batch's autograd graph reachable through train_loss.
            train_loss += loss.item()
            num_batches += 1

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Average over the batches that actually ran.
        train_loss /= num_batches
        # CUDA kernels run asynchronously; wait for backward/step to finish
        # before reading the clock so the epoch time is not under-reported.
        torch.cuda.synchronize()
        epoch_time = time.time() - start_time
        print_rank_0(
            f'Epoch Number {epoch}: train loss: {train_loss}, time: {epoch_time}'
        )
        tot_time += epoch_time

    # Skip the average when no epoch ran (avoids dividing by zero).
    if num_epochs > 0:
        print_rank_0(f'!!! AVG EPOCH TIME: {tot_time/num_epochs}')
def train():
    """Run one forward pass of ``Embedding`` followed by ``ParallelTransformer``.

    Uses random token indices (sequence length 1024, vocabulary size 4096)
    with the batch size, hidden size, layer count, layer-norm epsilon and
    attention-head count taken from ``get_args()``. The transformer output
    is printed on rank 0; no backward pass or optimizer step is performed.
    """

    def init_method_normal(tensor):
        # Standard-normal initializer handed to the embedding layer.
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        # masked_fill_ works in place and returns the same tensor.
        return attention_scores.masked_fill_(ltor_mask, -10000.0)

    # Initialize torch.distributed
    init_distributed()
    print_rank_0('AutoMP: training GPT2...')

    # Use fake train data
    args = get_args()
    sequence_length, vocab_size, dropout_prob = 1024, 4096, 0.1
    device = torch.cuda.current_device()

    input_indices = torch.randint(
        low=0, high=vocab_size,
        size=(args.batch_size, sequence_length)).to(device)
    positions = torch.arange(start=0, end=sequence_length)
    position_indices = torch.tile(positions, (args.batch_size, 1)).to(device)
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    embedding = Embedding(
        hidden_size=args.hidden_size,
        vocab_size=vocab_size,
        max_sequence_length=sequence_length,
        embedding_dropout_prob=dropout_prob,
        init_method=init_method_normal,
    )
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1,
    )

    # Only the attention mask is consumed; the other two results are ignored.
    attention_mask, _loss_mask, _position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(
        hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
def train():
    """Run one forward pass of ``Embedding`` followed by ``ParallelSelfAttention``.

    Uses random token indices (batch 32, sequence length 1024, vocabulary
    size 4096) with the hidden size and attention-head count taken from
    ``get_args()``. Debug shapes are printed on every process and the
    attention output is printed on rank 0; no backward pass is performed.
    """

    def init_method_normal(tensor):
        # Standard-normal initializer handed to the embedding layer.
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')
        # masked_fill_ works in place and returns the same tensor.
        return attention_scores.masked_fill_(ltor_mask, -10000.0)

    # Initialize torch.distributed
    init_distributed()
    print_rank_0('AutoMP: training self attention layer...')

    # Use fake train data
    args = get_args()
    batch_size, sequence_length = 32, 1024
    vocab_size, dropout_prob = 4096, 0.1
    hidden_size = args.hidden_size
    device = torch.cuda.current_device()

    input_indices = torch.randint(
        low=0, high=vocab_size, size=(batch_size, sequence_length)).to(device)
    positions = torch.arange(start=0, end=sequence_length)
    position_indices = torch.tile(positions, (batch_size, 1)).to(device)
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    embedding = Embedding(
        hidden_size=hidden_size,
        vocab_size=vocab_size,
        max_sequence_length=sequence_length,
        embedding_dropout_prob=dropout_prob,
        init_method=init_method_normal,
    )
    embedding_output = embedding.forward(input_indices, position_indices)

    self_attention = ParallelSelfAttention(
        attention_mask_func=gpt2_attention_mask_func,
        hidden_size=hidden_size,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
    )

    # Only the attention mask is consumed; the other two results are ignored.
    attention_mask, _loss_mask, _position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')
    self_att_output = self_attention.forward(
        hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
def train(num_epochs=5):
    """Benchmark forward/backward passes of the ``GPT2`` model on fake data.

    Initializes torch.distributed, builds random token indices and labels in
    ``[0, vocab_size)`` plus per-row position indices on the current CUDA
    device, then runs ``num_epochs`` SGD steps on the mean of the value
    returned by ``gpt2.forward``. Each iteration is wrapped in
    ``profiler.start``/``profiler.stop`` calls for the overall step, the
    forward pass and the backward pass.

    All model and data sizes are read from the module-level ``args`` object.

    Args:
        num_epochs: Number of profiled iterations. Defaults to 5, the value
            that used to be hard-coded.
    """
    # Initialize torch.distributed
    init_distributed()
    print_rank_0('AutoMP: training GPT2...')

    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    device = torch.cuda.current_device()
    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(device)
    labels = torch.randint(low=0,
                           high=vocab_size,
                           size=(batch_size, sequence_length))
    labels = labels.to(device)
    # Every row holds the positions 0..sequence_length-1.
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(device)

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    gpt2 = GPT2(
        hidden_size,
        vocab_size,
        sequence_length,
        hidden_dropout,
        gpt2_attention_mask_func,
        num_layers,
        layernorm_epsilon,
        num_attention_heads,
        attention_dropout,
        init_method_normal,
    )
    num_params = sum(p.numel() for p in gpt2.parameters() if p.requires_grad)

    # Only the attention mask is consumed; the other two results are ignored.
    attention_mask, _loss_mask, _position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    optimizer = torch.optim.SGD(gpt2.parameters(), lr=0.01)
    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    nproc = torch.distributed.get_world_size()
    # The profiler keys do not change between iterations; build them once.
    # NOTE(review): unlike the forward/backward keys, the overall key has no
    # `nl-{num_layers}` component; kept as-is so existing result names match.
    overall_name = f'gpt2_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}_num-params-{num_params}'
    fname = f'gpt2_forward_np-{nproc}_hs-{hidden_size}_nl-{num_layers}_nah-{num_attention_heads}_bsz-{batch_size}_num-params-{num_params}'
    bname = f'gpt2_backward_np-{nproc}_hs-{hidden_size}_nl-{num_layers}_nah-{num_attention_heads}_bsz-{batch_size}_num-params-{num_params}'

    for _ in range(num_epochs):
        profiler.start(overall_name)

        # Forward pass
        profiler.start(fname)
        loss = gpt2.forward(input_indices, position_indices, attention_mask,
                            labels)
        train_loss = torch.mean(loss)
        # Wait for queued CUDA work so the timer covers the whole phase.
        torch.cuda.synchronize()
        profiler.stop(fname)

        # Backward pass
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)
def train(num_epochs=5):
    """Benchmark forward/backward passes of one ``ParallelTransformerLayer``.

    Initializes torch.distributed, runs an ``Embedding`` forward pass once
    (its output is used only to obtain the input size for the layer), then
    runs ``num_epochs`` SGD steps on the mean of the layer output. Each
    iteration feeds a fresh ``torch.rand`` tensor of that size and is wrapped
    in ``profiler.start``/``profiler.stop`` calls for the overall step, the
    forward pass and the backward pass.

    All model and data sizes are read from the module-level ``args`` object.

    Args:
        num_epochs: Number of profiled iterations. Defaults to 5, the value
            that used to be hard-coded.
    """
    # Initialize torch.distributed
    init_distributed()
    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    device = torch.cuda.current_device()
    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(device)
    # Every row holds the positions 0..sequence_length-1.
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(device)

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)
    # Only the size of the embedding output is needed below.
    input_size = embedding_output.size()

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    nproc = torch.distributed.get_world_size()

    # A synthetic mask is used here instead of the one produced by
    # get_ltor_masks_and_position_ids.
    # NOTE(review): randint(low=0, high=2) only yields 0 or 1, so `< 0` is
    # always False and this mask never masks anything. The mask dimensions
    # (sequence, local heads, batch, batch) also look unusual -- confirm that
    # both are intended.
    attention_mask = (torch.randint(
        low=0,
        high=2,
        size=(sequence_length,
              divide(num_attention_heads, nproc),
              batch_size,
              batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)
    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    # The profiler keys do not change between iterations; build them once.
    overall_name = f'transformer_layer_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
    fname = f'transformer_layer_forward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
    bname = f'transformer_layer_backward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'

    for _ in range(num_epochs):
        # Fresh random input each iteration, created outside the timed region.
        input_ = torch.rand(size=input_size).cuda()

        profiler.start(overall_name)

        # Forward pass
        profiler.start(fname)
        loss = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(loss)
        # Wait for queued CUDA work so the timer covers the whole phase.
        torch.cuda.synchronize()
        profiler.stop(fname)

        # Backward pass
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)