def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')
    # Use fake train data
    args = get_args()  # args is not defined in this snippet; other examples obtain it via get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    dropout_prob = args.hidden_dropout

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    optimizer = torch.optim.SGD(embedding.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()

    for epoch in range(num_epochs):
        overall_name = f'emb_np-{nproc}_vs-{vocab_size}'
        profiler.start(overall_name)

        # Forward pass
        profiler.start(f'emb_forward_np-{nproc}_vs-{vocab_size}')
        embedding_output = embedding.forward(input_indices, position_indices)
        train_loss = torch.mean(embedding_output)
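        # Wait for queued CUDA kernels to finish so the stopped timer reflects
        # the actual GPU work of the forward pass.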
        torch.cuda.synchronize()
        profiler.stop(f'emb_forward_np-{nproc}_vs-{vocab_size}')

        # Backward pass
        profiler.start(f'emb_backward_np-{nproc}_vs-{vocab_size}')
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(f'emb_backward_np-{nproc}_vs-{vocab_size}')

        profiler.stop(overall_name)
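
The Profiler and the other helpers used here (init_distributed, print_rank_0, get_args, Embedding, ...) come from the surrounding project and are not shown; the snippets also assume the usual imports (torch, numpy as np, os, time). A minimal, hypothetical stand-in for Profiler with the same constructor and start/stop interface, just enough to run the benchmarks standalone, could look like this:

import os
import time


class Profiler:
    """Hypothetical stand-in: records wall-clock time between matching
    start/stop calls and appends one value per measurement to a per-name file."""

    def __init__(self, log_dir):
        os.makedirs(log_dir, exist_ok=True)
        self._log_dir = log_dir
        self._starts = {}

    def start(self, name):
        self._starts[name] = time.time()

    def stop(self, name):
        elapsed = time.time() - self._starts.pop(name)
        with open(os.path.join(self._log_dir, f'{name}.txt'), 'a') as f:
            f.write(f'{elapsed}\n')
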
Example 2
def train(hidden_sizes, num_epochs=50):

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training MLP...')
    # Use MNIST data
    train_data = np.genfromtxt('data/digitstrain.txt', delimiter=",")
    train_X = torch.tensor(train_data[:, :-1],
                           dtype=torch.float,
                           device=torch.cuda.current_device())
    train_Y = torch.tensor(train_data[:, -1],
                           dtype=torch.int64,
                           device=torch.cuda.current_device())
    print_rank_0(f'train_X shape: {train_X.size()}')
    print_rank_0(f'train_Y shape: {train_Y.size()}')

    num_features = train_X.size()[1]
    num_classes = 10
    assert num_features == 28 * 28
    mlp = ParallelMLP(num_features=num_features,
                      num_classes=num_classes,
                      hidden_sizes=hidden_sizes)
    print_rank_0('AutoMP: Successfully initialized ParallelMLP')

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(mlp.parameters(), lr=0.01)

    # num_epochs = 500
    num_train_samples = train_X.size()[0]
    batch_size = num_train_samples
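    # batch_size equals the full training set, so each epoch below performs a
    # single full-batch gradient step.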
    tot_time = 0
    for epoch in range(num_epochs):
        start_time = time.time()
        train_loss = 0
        for sample_idx in range(0, num_train_samples, batch_size):
            mini_batch = train_X[sample_idx:sample_idx + batch_size, ...]
            labels = train_Y[sample_idx:sample_idx + batch_size]
            # Forward pass
            logits = mlp(mini_batch)
            # Note: torch.nn.CrossEntropyLoss does not need one hot encoding
            loss = criterion(logits, labels)
            # loss = parallel_cross_entropy(logits, labels)
            train_loss += loss.item()  # .item() keeps a plain float so no graph is retained across batches
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        train_loss /= (num_train_samples / batch_size)
        # if epoch % 50 == 0:
        print_rank_0(
            f'Epoch Number {epoch}: train loss: {train_loss}, time: {time.time()-start_time}'
        )
        tot_time += time.time() - start_time
    print_rank_0(f'!!! AVG EPOCH TIME: {tot_time/num_epochs}')
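
data/digitstrain.txt is not included with these snippets; the loader above expects comma-separated rows of 784 pixel values followed by an integer class label in the last column. A hedged sketch for generating a random placeholder file with that layout:

import os

import numpy as np

# Placeholder data only: 100 rows of 784 random "pixels" plus a label in [0, 9].
os.makedirs('data', exist_ok=True)
num_samples, num_features, num_classes = 100, 28 * 28, 10
pixels = np.random.rand(num_samples, num_features)
labels = np.random.randint(0, num_classes, size=(num_samples, 1))
np.savetxt('data/digitstrain.txt', np.hstack([pixels, labels]), delimiter=',')
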
Example 3
def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')
    # Use fake train data
    args = get_args()
    sequence_length = 1024
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(args.batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (args.batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=args.hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)

    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
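        # Positions where ltor_mask is True receive a large negative score, so
        # their softmax attention weights become effectively zero.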
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    transformer = ParallelTransformer(
        attention_mask_func=gpt2_attention_mask_func,
        num_layers=args.num_layers,
        hidden_size=args.hidden_size,
        layernorm_epsilon=args.layernorm_epsilon,
        num_attention_heads=args.num_attention_heads,
        attention_dropout=0.1,
        hidden_dropout=0.1)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    transformer_output = transformer.forward(hidden_states=embedding_output,
                                             attention_mask=attention_mask)
    print_rank_0(f'AutoMP: transformer_output = {transformer_output}')
Example 4

def train():
    
    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training self attention layer...')
    # Use fake train data
    args = get_args()
    batch_size = 32
    sequence_length = 1024
    hidden_size = args.hidden_size
    vocab_size = 4096
    dropout_prob = 0.1

    input_indices = torch.randint(low=0, high=vocab_size, size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length), (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())
    print_rank_0(f'AutoMP: input_indices shape = {input_indices.size()}')
    print_rank_0(f'AutoMP: position_indices shape = {position_indices.size()}')

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=dropout_prob,
                          init_method=init_method_normal)

    embedding_output = embedding.forward(input_indices, position_indices)
    # print_rank_0(f'AutoMP: embedding_output = {embedding_output}')

    def gpt2_attention_mask_func(attention_scores, ltor_mask):

        print(f'ALBERT_DEBUG: attention_scores.size() = {attention_scores.size()}')
        print(f'ALBERT_DEBUG: ltor_mask.size() = {ltor_mask.size()}')

        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    self_attention = ParallelSelfAttention(
        attention_mask_func=gpt2_attention_mask_func, 
        hidden_size=args.hidden_size, 
        num_attention_heads=args.num_attention_heads, 
        attention_dropout=0.1
    )

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)

    print(f'ALBERT_DEBUG: embedding_output.size() = {embedding_output.size()}')

    self_att_output = self_attention.forward(hidden_states=embedding_output, attention_mask=attention_mask)
    print_rank_0(f'AutoMP: self_att_output = {self_att_output}')
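
gpt2_attention_mask_func fills masked positions with -10000.0, so the mask it receives should be a boolean tensor that is True wherever attention is not allowed. A hedged sketch of building such a left-to-right (causal) mask by hand, assuming a (batch, 1, seq, seq) layout:

import torch

batch_size, sequence_length = 32, 1024

# True strictly above the diagonal: position i may not attend to any j > i.
causal = torch.triu(torch.ones(sequence_length, sequence_length, dtype=torch.bool),
                    diagonal=1)
attention_mask = causal.unsqueeze(0).unsqueeze(0).expand(batch_size, 1,
                                                         sequence_length,
                                                         sequence_length)
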
Example 5
def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training GPT2...')

    args = get_args()  # args is not defined in this snippet; other examples obtain it via get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    labels = torch.randint(low=0,
                           high=vocab_size,
                           size=(batch_size, sequence_length))
    labels = labels.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    gpt2 = GPT2(
        hidden_size,
        vocab_size,
        sequence_length,
        hidden_dropout,
        gpt2_attention_mask_func,
        num_layers,
        layernorm_epsilon,
        num_attention_heads,
        attention_dropout,
        init_method_normal,
    )
    num_params = sum(p.numel() for p in gpt2.parameters() if p.requires_grad)

    attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(
        input_indices, vocab_size - 1)

    optimizer = torch.optim.SGD(gpt2.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()

    for epoch in range(num_epochs):
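        # Profiler tags encode world size (np), hidden size (hs), attention
        # heads (nah), batch size (bsz) and parameter count, so runs can be
        # compared across configurations.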
        overall_name = f'gpt2_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}_num-params-{num_params}'
        profiler.start(overall_name)

        fname = f'gpt2_forward_np-{nproc}_hs-{hidden_size}_nl-{num_layers}_nah-{num_attention_heads}_bsz-{batch_size}_num-params-{num_params}'
        # Forward pass
        profiler.start(fname)
        loss = gpt2.forward(input_indices, position_indices, attention_mask,
                            labels)
        train_loss = torch.mean(loss)
        # print(train_loss)
        torch.cuda.synchronize()
        profiler.stop(fname)
        # Backward pass
        bname = f'gpt2_backward_np-{nproc}_hs-{hidden_size}_nl-{num_layers}_nah-{num_attention_heads}_bsz-{batch_size}_num-params-{num_params}'
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)
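
init_distributed() and print_rank_0() are likewise project helpers that are not shown here. A hypothetical stand-in that initializes torch.distributed from the environment variables set by torchrun (assuming an NCCL backend and one process per GPU), plus rank-0-only printing:

import os

import torch


def init_distributed():
    # torchrun exports RANK, WORLD_SIZE, LOCAL_RANK, MASTER_ADDR and MASTER_PORT;
    # init_method='env://' reads them to form the process group.
    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    torch.cuda.set_device(int(os.environ['LOCAL_RANK']))


def print_rank_0(message):
    # Print from a single rank to avoid duplicated output across processes.
    if torch.distributed.get_rank() == 0:
        print(message)
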
Example 6

def train():

    # Initialize torch.distributed
    init_distributed()

    print_rank_0('AutoMP: training ParallelTransformerLayer...')

    args = get_args()  # args is not defined in this snippet; other examples obtain it via get_args()
    batch_size = args.batch_size
    sequence_length = args.sequence_length
    hidden_size = args.hidden_size
    vocab_size = args.vocab_size
    hidden_dropout = args.hidden_dropout
    attention_dropout = args.attention_dropout
    num_layers = args.num_layers
    layernorm_epsilon = args.layernorm_epsilon
    num_attention_heads = args.num_attention_heads

    input_indices = torch.randint(low=0,
                                  high=vocab_size,
                                  size=(batch_size, sequence_length))
    input_indices = input_indices.to(torch.cuda.current_device())
    labels = torch.randint(low=0,
                           high=vocab_size,
                           size=(batch_size, sequence_length))
    labels = labels.to(torch.cuda.current_device())
    position_indices = torch.tile(torch.arange(start=0, end=sequence_length),
                                  (batch_size, 1))
    position_indices = position_indices.to(torch.cuda.current_device())

    def init_method_normal(tensor):
        return torch.nn.init.normal_(tensor, mean=0.0, std=1.0)

    def gpt2_attention_mask_func(attention_scores, ltor_mask):
        attention_scores.masked_fill_(ltor_mask, -10000.0)
        return attention_scores

    embedding = Embedding(hidden_size=hidden_size,
                          vocab_size=vocab_size,
                          max_sequence_length=sequence_length,
                          embedding_dropout_prob=hidden_dropout,
                          init_method=init_method_normal)
    embedding_output = embedding.forward(input_indices, position_indices)

    transformer_layer = ParallelTransformerLayer(
        attention_mask_func=gpt2_attention_mask_func,
        layer_number=0,
        hidden_size=hidden_size,
        layernorm_epsilon=layernorm_epsilon,
        num_attention_heads=num_attention_heads,
        attention_dropout=attention_dropout,
        hidden_dropout=hidden_dropout)

    # attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids(input_indices, vocab_size - 1)
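    # Note: torch.randint(low=0, high=2, ...) < 0 is always False, so this mask
    # never masks any position; it just supplies a boolean tensor for the
    # benchmark's forward pass.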
    attention_mask = (torch.randint(
        low=0,
        high=2,
        size=(sequence_length,
              divide(num_attention_heads, torch.distributed.get_world_size()),
              batch_size, batch_size)) < 0).cuda()

    optimizer = torch.optim.SGD(transformer_layer.parameters(), lr=0.01)

    profiler = Profiler(os.path.join('benchmark', args.exp_name))

    num_epochs = 5
    tot_time = 0
    nproc = torch.distributed.get_world_size()
    for epoch in range(num_epochs):
        input_ = torch.rand(size=embedding_output.size()).cuda()

        overall_name = f'transformer_layer_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(overall_name)

        fname = f'transformer_layer_forward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        # Forward pass
        profiler.start(fname)
        loss = transformer_layer.forward(input_, attention_mask)
        train_loss = torch.mean(loss)
        # print(train_loss)
        torch.cuda.synchronize()
        profiler.stop(fname)
        # Backward pass
        bname = f'transformer_layer_backward_np-{nproc}_hs-{hidden_size}_nah-{num_attention_heads}_bsz-{batch_size}'
        profiler.start(bname)
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        torch.cuda.synchronize()
        profiler.stop(bname)

        profiler.stop(overall_name)