Example 1
    def _test_reduce_multigpu_helper(
        self,
        group,
        group_id,
        rank,
        rank_to_GPU,
        op,
        master_value,
        worker_value,
        expected_value,
    ):
        for src in group:
            if rank == src:
                tensors = [
                    _build_tensor(src + 1, master_value).cuda(device=i)
                    for i in rank_to_GPU[rank]
                ]
                dist.reduce_multigpu(tensors, src, op, group_id)
                expected_tensor = _build_tensor(src + 1, expected_value)
                self.assertEqual(tensors[0], expected_tensor)
            else:
                tensors = [
                    _build_tensor(src + 1, worker_value).cuda(device=i)
                    for i in rank_to_GPU[rank]
                ]
                dist.reduce_multigpu(tensors, src, op, group_id)

        self._barrier()
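
A hedged sketch of how this helper might be invoked from a test method of the same class; the SUM op and the 2/10 master/worker values are illustrative, and the expected value follows from summing one master tensor per GPU on the source rank plus one worker tensor per GPU on every other rank (assuming every rank drives the same number of GPUs):

        # Illustrative call, not taken verbatim from the test suite
        self._test_reduce_multigpu_helper(
            group,
            group_id,
            rank,
            rank_to_GPU,
            dist.ReduceOp.SUM,
            2,      # master_value on the src rank's GPUs
            10,     # worker_value on every other rank's GPUs
            (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]),  # expected_value
        )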
Example 2
    def _test_reduce_multigpu_helper(self, group, group_id, rank,
                                     rank_to_GPU, op, master_value,
                                     worker_value, expected_value):
        for src in group:
            if rank == src:
                tensors = [_build_tensor(src + 1, master_value).cuda(device=i)
                           for i in rank_to_GPU[rank]]
                dist.reduce_multigpu(tensors, src, op, group_id)
                expected_tensor = _build_tensor(src + 1, expected_value)
                self.assertEqual(tensors[0], expected_tensor)
            else:
                tensors = [_build_tensor(src + 1, worker_value).cuda(device=i)
                           for i in rank_to_GPU[rank]]
                dist.reduce_multigpu(tensors, src, op, group_id)

        self._barrier()
Example 3
    def reduce_multigpu(self,
                        tensor_list,
                        dst,
                        op=dist.ReduceOp.SUM,
                        async_op=False,
                        dst_tensor=0):  # pragma: no cover
        return dist.reduce_multigpu(tensor_list, dst, op, self.group, async_op,
                                    dst_tensor)
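
A hypothetical use of the wrapper above; `comm` stands for an instance of the (unshown) class that owns the method and carries the process group in `self.group`:

import torch

# each tensor lives on a different local GPU owned by this process
tensors = [torch.ones(4, device='cuda:{}'.format(i))
           for i in range(torch.cuda.device_count())]
comm.reduce_multigpu(tensors, dst=0)  # blocking SUM reduction
# tensors[0] on the process with rank 0 now holds the elementwise sum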
Example 4
def train(
    *,
    rank,
    gpu,
    task,
    model,
    train_loader,
    loss_func,
    optimizer,
    pad,
    epoch,
    epochs,
    print_freq,
    distributed,
    world_size,
):
    """Train with given data.

    Args:
        rank: rank of current process
        gpu: GPU id to use
        task: task among 'regression', 'classification' and 'both'
        model: trained model
        train_loader : dataloader
        loss_func : Loss function
        optimizer : Optimization object
        pad : Padding
        epoch : Current epoch
        epochs : Total epochs to train for
        print_freq : How frequently to print training information.
        distributed : Distributed training
        world_size : World size

    """
    num_batches = len(train_loader)
    epoch_formatter = "Epoch " + \
                      equal_width_formatter(total=epochs).format(epoch)
    start = time.time()
    forward_time = 0.
    backward_time = 0.
    print_time = 0.

    model.train()

    print('Num_batches %d; rank %s, gpu %s' %
          (num_batches, str(rank), str(gpu)))

    # Loop training data
    for i, batch in enumerate(train_loader):
        x = batch['input']
        y_reg = batch['label_reg']
        y_cla = batch['label_cla']

        # move data and labels to GPU for forward pass
        if len(x.shape) == 2:
            x = x.unsqueeze(1)  # (N, 1, L)
        else:
            x = x.transpose(1, 2)  # channels first: (N, L, C) -> (N, C, L)
        x = x.cuda(gpu, non_blocking=True)

        if task == 'regression':
            y = y_reg.cuda(gpu, non_blocking=True)
        elif task == 'classification':
            y = y_cla.cuda(gpu, non_blocking=True)
        elif task == 'both':
            y_reg = y_reg.cuda(gpu, non_blocking=True)
            y_cla = y_cla.cuda(gpu, non_blocking=True)

        # Model forward pass
        t = time.time()
        pred = model(x)

        # Remove padding
        if pad is not None:
            center = range(pad, x.shape[2] - pad)
            if task == 'regression' or task == 'classification':
                y = y[:, center]
                pred = pred[:, center]
            elif task == 'both':
                y_reg = y_reg[:, center]
                y_cla = y_cla[:, center]
                pred = [x[:, center] for x in pred]

        # Calculate losses
        if task == 'regression' or task == 'classification':
            total_loss_value, losses_values = loss_func(pred, y)
        elif task == 'both':
            total_loss_value_reg, losses_values_reg = loss_func[0](pred[0],
                                                                   y_reg)
            total_loss_value_cla, losses_values_cla = loss_func[1](pred[1],
                                                                   y_cla)
            # Combine loss values
            losses_values = losses_values_reg.copy()
            losses_values.update(losses_values_cla)
            # Combine total loss
            total_loss_value = total_loss_value_reg + total_loss_value_cla
            losses_values['total_loss'] = total_loss_value

        forward_time += time.time() - t

        # one gradient descent step
        optimizer.zero_grad()
        t = time.time()
        total_loss_value.backward()
        optimizer.step()
        backward_time += time.time() - t

        # Reduce and report losses only every `print_freq` batches (and on the last batch)
        if (i % print_freq == 0) or (i == num_batches - 1):
            t = time.time()
            if dist.is_initialized():
                for loss_type, value in losses_values.items():
                    # sum across ranks onto rank 0, then divide by world_size for the mean
                    dist.reduce_multigpu([value], dst=0)
                    losses_values[loss_type] = value / world_size

            if rank == 0:
                post_bar_msg = " | ".join([
                    k + ':{:8.3f}'.format(v.cpu().item())
                    for k, v in losses_values.items()
                ])
                progbar(curr=i,
                        total=num_batches,
                        progbar_len=20,
                        pre_bar_msg=epoch_formatter,
                        post_bar_msg=post_bar_msg)
            print_time += time.time() - t

    myprint(epoch_formatter +
            " Time Taken: {:7.3f}s".format(time.time() - start),
            color='yellow',
            rank=rank)

    # Time breakdown for the epoch...
    total_time = time.time() - start
    remainder_time = total_time - forward_time - backward_time - print_time
    print(
        'Total train time: %.3f\tFor time: %.3f\tBack time: %.3f\tPrint '
        'time: %.3f\tRemain (data) time: %.3f' %
        (total_time, forward_time, backward_time, print_time, remainder_time))
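
The loss-averaging step above sums each scalar loss across ranks with reduce_multigpu on a one-element tensor list, then divides by world_size. A roughly equivalent sketch using the plain dist.reduce collective, assuming one GPU per process:

import torch.distributed as dist

def average_losses_on_rank0(losses_values, world_size):
    # losses_values: dict mapping loss name -> scalar CUDA tensor on this rank
    for loss_type, value in losses_values.items():
        dist.reduce(value, dst=0, op=dist.ReduceOp.SUM)  # in-place sum on rank 0
        losses_values[loss_type] = value / world_size    # mean across ranks
    return losses_values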
Example 5
# after all_reduce_multigpu every tensor on every rank holds 1 * 2 * 3 * 4 = 24
print('{} AFTER all_reduce_multigpu {}'.format(local_rank, tensor_list))
assert_mean(tensor_list[0], 24.)
assert_mean(tensor_list[1], 24.)

# ---------------- REDUCE -----------------
tensor_list = get_tensor_list()

# With PRODUCT, only tensor_list[0] on the process with rank dst receives the
# final result: 1 * 2 * 3 * 4 = 24 lands in rank 1's tensor_list[0], while the
# other tensors keep their original values, as the assertions below check.
dist.reduce_multigpu(
    tensor_list,
    dst=1,  # destination process rank
    op=dist.ReduceOp.PRODUCT)  # ReduceOp replaces the deprecated reduce_op alias
print('{} AFTER reduce_multigpu {}'.format(local_rank, tensor_list))
if local_rank == 0:
    assert_mean(tensor_list[0], 1.)
    assert_mean(tensor_list[1], 2.)
else:
    assert_mean(tensor_list[0], 24.)
    assert_mean(tensor_list[1], 4.)

# ---------------- BROADCAST -----------------
tensor_list = get_tensor_list()

dist.broadcast_multigpu(
    tensor_list,
    src=1)  # tensor_list[0] on rank 1 is broadcast to all the listed GPUs
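
The snippet above calls get_tensor_list() and assert_mean() without showing their definitions. A minimal sketch, assuming two processes with two GPUs each and the local_rank variable from the surrounding script, chosen so the per-GPU means come out as 1 and 2 on rank 0 and 3 and 4 on rank 1:

import torch

def get_tensor_list():
    # GPU g of this process holds a 1x2x3x4 tensor with constant value
    # 2 * local_rank + g + 1, i.e. mean 1 or 2 on rank 0 and 3 or 4 on rank 1
    return [torch.full((1, 2, 3, 4), 2.0 * local_rank + g + 1,
                       device='cuda:{}'.format(g))
            for g in range(2)]

def assert_mean(tensor, expected):
    # hypothetical helper: check the tensor's mean against the expected scalar
    assert abs(tensor.mean().item() - expected) < 1e-6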
Example 6
def train(*, rank, gpu, task, model, train_loader, loss_func, optimizer, pad,
          epoch, epochs, clip_grad, print_freq, distributed, world_size):

    num_batches = len(train_loader)
    epoch_formatter = "Epoch " + \
        equal_width_formatter(total=epochs).format(epoch)
    start = time.time()
    forward_time = 0.
    backward_time = 0.
    print_time = 0.

    model.train()

    print('Num_batches %d; rank %s, gpu %s' %
          (num_batches, str(rank), str(gpu)))

    # Loop training data
    for i, batch in enumerate(train_loader):
        x = batch['x']
        y_reg = batch['y_reg']
        y_cla = batch['y_cla']
        # model forward pass
        x = x.unsqueeze(1)  # (N, 1, L)
        x = x.cuda(gpu, non_blocking=True)

        if task == 'regression':
            y = y_reg.cuda(gpu, non_blocking=True)
        elif task == 'classification':
            y = y_cla.cuda(gpu, non_blocking=True)
        elif task == 'both':
            y_reg = y_reg.cuda(gpu, non_blocking=True)
            y_cla = y_cla.cuda(gpu, non_blocking=True)

        t = time.time()
        pred = model(x)

        # Remove padding
        if pad is not None:
            center = range(pad, x.shape[2] - pad)
            if task == 'regression' or task == 'classification':
                y = y[:, center]
                pred = pred[:, center]
            elif task == 'both':
                y_reg = y_reg[:, center]
                y_cla = y_cla[:, center]
                pred = [x[:, center] for x in pred]

        # Calculate losses
        if task == 'regression' or task == 'classification':
            total_loss_value, losses_values = loss_func(pred, y)
        elif task == 'both':
            total_loss_value_reg, losses_values_reg = loss_func[0](pred[0],
                                                                   y_reg)
            total_loss_value_cla, losses_values_cla = loss_func[1](pred[1],
                                                                   y_cla)
            # Combine loss values
            losses_values = losses_values_reg.copy()
            losses_values.update(losses_values_cla)
            # Combine total loss
            total_loss_value = total_loss_value_reg + total_loss_value_cla
            losses_values['total_loss'] = total_loss_value

        forward_time += time.time() - t

        # one gradient descent step
        optimizer.zero_grad()
        t = time.time()
        total_loss_value.backward()
        if clip_grad > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_grad)
        optimizer.step()
        backward_time += time.time() - t

        # Reduce and report losses only every `print_freq` batches (and on the last batch)
        if (i % print_freq == 0) or (i == num_batches - 1):
            t = time.time()
            if dist.is_initialized():
                for loss_type, value in losses_values.items():
                    # sum across ranks onto rank 0, then divide by world_size for the mean
                    dist.reduce_multigpu([value], dst=0)
                    losses_values[loss_type] = value / world_size

            if rank == 0:
                post_bar_msg = " | ".join([
                    k + ':{:8.3f}'.format(v.cpu().item())
                    for k, v in losses_values.items()
                ])
                progbar(curr=i,
                        total=num_batches,
                        progbar_len=20,
                        pre_bar_msg=epoch_formatter,
                        post_bar_msg=post_bar_msg)
            print_time += time.time() - t

    myprint(epoch_formatter +
            " Time Taken: {:7.3f}s".format(time.time() - start),
            color='yellow',
            rank=rank)

    # Time breakdown for the epoch...
    total_time = time.time() - start
    remainder_time = total_time - forward_time - backward_time - print_time