def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')
    # Use fake train data
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()

    for epoch in range(num_epochs):
        overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
        profiler.start(overall_name)

        # Forward pass
        profiler.start(f'emb_forward_np-{nproc}_vs-{vocab_size}')
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
        torch.cuda.synchronize()
        profiler.stop(f'emb_forward_np-{nproc}_vs-{vocab_size}')

        # Backward pass
        profiler.start(f'emb_backward_np-{nproc}_vs-{vocab_size}')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(f'emb_backward_np-{nproc}_vs-{vocab_size}')

        profiler.stop(overall_name)
Beispiel #2
0
def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')
    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(args.batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=args.hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)

    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
def train():
    
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training self attention layer...')
    # Use fake train data
    args = get_args()
    batch_size = 32
    sequence_length = 1024
    hidden_size = args.hidden_size
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)
    embedding = Embedding(hidden_size=hidden_size, 
              vocab_size=vocab_size, 
              max_sequence_length=sequence_length, 
              embedding_dropout_prob=dropout_prob, 
              init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):

        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')

        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    self_attention = ParallelSelfAttention(
        attention_mask_func=gpt2_attention_mask_func, 
        hidden_size=args.hidden_size, 
        num_attention_heads=args.num_attention_heads, 
        attention_dropout=0.1
    )

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')

    self_att_output = self_attention.forward(hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    labels = torch.randint(low=0,
                           high=vocab_size,
                           size=(batch_size, sequence_length))
    labels = labels.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
    attention_mask = (torch.randint(
        low=0,
        high=2,
        size=(sequence_length,
              divide(num_attention_heads, torch.distributed.get_world_size()),
              batch_size, batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        input_ = torch.rand(size=embedding_output.size()).cuda()

        overall_name = f'transformer_layer_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(overall_name)

        fname = f'transformer_layer_forward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        # Forward pass
        profiler.start(fname)
        loss = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(loss)
        # print(train_loss)
        torch.cuda.synchronize()
        profiler.stop(fname)
        # Backward pass
        bname = f'transformer_layer_backward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)