Example #1
def reduce_dict_hvd(input_dict, average=True):
    """
    Args:
        input_dict (dict): all the values will be reduced
        average (bool): whether to do average or sum
    Reduce the values in the dictionary from all processes so that all processes
    have the averaged results. Returns a dict with the same fields as
    input_dict, after reduction.
    """
    global _USE_HVD
    world_size = get_world_size()
    if world_size < 2:
        return input_dict
    with torch.no_grad():
        names = []
        values = []
        # sort the keys so that they are consistent across processes
        for k in sorted(input_dict.keys()):
            names.append(k)
            values.append(input_dict[k])
        values = torch.stack(values, dim=0)
        if _USE_HVD:
            # hvd.Sum gives the plain sum; hvd.Average divides by the world size
            hvd.allreduce_(values,
                           op=hvd.Average if average else hvd.Sum,
                           name="reduce_dict")
        else:
            dist.all_reduce(values)
            if average:
                values /= world_size
        reduced_dict = {k: v for k, v in zip(names, values)}
    return reduced_dict
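A minimal usage sketch (not part of the original example; it assumes Horovod is initialized, that the module defining reduce_dict_hvd also provides get_world_size and _USE_HVD, and that the metric names and values are purely illustrative):

import torch
import horovod.torch as hvd

hvd.init()
# Hypothetical per-process metrics; each value is a 0-dim tensor.
metrics = {
    "loss_cls": torch.tensor(0.7),
    "loss_box": torch.tensor(0.3),
}
averaged = reduce_dict_hvd(metrics, average=True)
if hvd.rank() == 0:
    print({k: v.item() for k, v in averaged.items()})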
Example #2
    def test_horovod_allreduce_inplace(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = [torch.IntTensor, torch.LongTensor,
                  torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = tensor.type(dtype)
            multiplied = tensor * size
            hvd.allreduce_(tensor, average=False)
            max_difference = tensor.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                      torch.cuda.IntTensor, torch.cuda.LongTensor]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #3
def all_reduce_and_rescale_tensors(tensors, rescale_denom):
    """All-reduce and rescale tensors at once (as a flattened tensor)

    Args:
        tensors: list of Tensors to all-reduce
        rescale_denom: denominator for rescaling summed Tensors
    """
    # total number of elements across all tensors; flatten them into one buffer
    sz = sum(t.numel() for t in tensors)
    buffer_t = tensors[0].new(sz).zero_()

    # copy tensors into buffer_t
    offset = 0
    for t in tensors:
        numel = t.numel()
        buffer_t[offset:offset + numel].copy_(t.view(-1))
        offset += numel

    # all-reduce and rescale
    hvd.allreduce_(buffer_t[:offset])
    buffer_t.div_(rescale_denom)

    # copy all-reduced buffer back into tensors
    offset = 0
    for t in tensors:
        numel = t.numel()
        t.view(-1).copy_(buffer_t[offset:offset + numel])
        offset += numel
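A hedged usage sketch (model is a placeholder for any torch.nn.Module and Horovod is assumed to be initialized): averaging gradients across ranks after a backward pass.

import horovod.torch as hvd

# Flatten-allreduce all gradients at once, then divide by the world size.
grads = [p.grad.data for p in model.parameters() if p.grad is not None]
all_reduce_and_rescale_tensors(grads, rescale_denom=hvd.size())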
Example #5
  def test_stability(self):
    hvd.init()
    # TODO support non-MPI Adasum operation
    if not hvd.mpi_enabled():
      self.skipTest("MPI not enabled")

    device = torch.device('cuda:{}'.format(hvd.local_rank())) if torch.cuda.is_available() else torch.device('cpu')
    np.random.seed(2)
    torch.manual_seed(2)
    size = hvd.size()
    local_size = hvd.local_size()
    rank = hvd.rank()

    for data_type in self.data_types:
      N = 1024
      a = np.random.normal(0, np.finfo(data_type).tiny, (N, 1)).astype(np.float64)
      r = np.random.normal(0, 1, (size, 1)).astype(np.float64)
      q = np.dot(a,r.T).astype(data_type).astype(np.float64)
      tensor = np.zeros(N,dtype=data_type)
      tensor[:] = q[:,hvd.rank()]

      tensor = torch.from_numpy(tensor).to(device)

      hvd.allreduce_(tensor, op=hvd.Adasum)

      expected = np.sum(q,axis=1) / size
      comp = self.are_close(data_type, expected, tensor.cpu().numpy()) 
      if comp:
        print('Stability test passed')
      else:
        print('computed: ', tensor)
        print('expected: ', expected)
        print('off by: ', self.diff_ratio(expected,tensor.cpu().numpy()))
      assert comp
Example #6
def all_reduce_and_rescale_tensors_chunked(tensors,
                                           rescale_denom,
                                           buffer_size=10485760):
    """All-reduce and rescale tensors in chunks of the specified size.

    Args:
        tensors: list of Tensors to all-reduce
        rescale_denom: denominator for rescaling summed Tensors
        buffer_size: all-reduce chunk size in bytes
    """
    # buffer size in bytes, determine equiv. # of elements based on data type
    buffer_t = tensors[0].new(
        math.ceil(buffer_size / tensors[0].element_size())).zero_()
    buffer = []

    def all_reduce_buffer():
        # copy tensors into buffer_t
        offset = 0
        for t in buffer:
            numel = t.numel()
            buffer_t[offset:offset + numel].copy_(t.view(-1))
            offset += numel

        # all-reduce and rescale
        hvd.allreduce_(buffer_t[:offset])
        buffer_t.div_(rescale_denom)

        # copy all-reduced buffer back into tensors
        offset = 0
        for t in buffer:
            numel = t.numel()
            t.view(-1).copy_(buffer_t[offset:offset + numel])
            offset += numel

    filled = 0
    for t in tensors:
        sz = t.numel() * t.element_size()
        if sz > buffer_size:
            # tensor is bigger than buffer, all-reduce and rescale directly
            hvd.allreduce_(t)
            t.div_(rescale_denom)
        elif filled + sz > buffer_size:
            # buffer is full, all-reduce and replace buffer with grad
            all_reduce_buffer()
            buffer = [t]
            filled = sz
        else:
            # add tensor to buffer
            buffer.append(t)
            filled += sz

    if len(buffer) > 0:
        all_reduce_buffer()
Example #7
def clip_grad_norm_2_by_global_(grad, max_norm, name=None):
    max_norm = float(max_norm)
    grad_square_sum = torch.sum(grad.square())
    total_norm = torch.sqrt(allreduce_(grad_square_sum, average=True, name=name))
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        grad.data.mul_(clip_coef)
    return grad
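A hypothetical usage sketch (model and the 5.0 threshold are illustrative assumptions): clip each parameter's gradient by its L2 norm averaged across Horovod ranks.

for name, p in model.named_parameters():
    if p.grad is not None:
        # the parameter name doubles as the allreduce name, one op per parameter
        clip_grad_norm_2_by_global_(p.grad, max_norm=5.0, name=name)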
Example #8
    def all_reduce_buffer():
        # copy tensors into buffer_t
        offset = 0
        for t in buffer:
            numel = t.numel()
            buffer_t[offset:offset + numel].copy_(t.view(-1))
            offset += numel

        # all-reduce and rescale
        hvd.allreduce_(buffer_t[:offset])
        buffer_t.div_(rescale_denom)

        # copy all-reduced buffer back into tensors
        offset = 0
        for t in buffer:
            numel = t.numel()
            t.view(-1).copy_(buffer_t[offset:offset + numel])
            offset += numel
Example #9
    def test_stability_2(self):
        hvd.init()
        # TODO support non-MPI Adasum operation
        if not hvd.mpi_enabled():
            return
        device = torch.device('cuda:{}'.format(hvd.local_rank(
        ))) if torch.cuda.is_available() else torch.device('cpu')
        np.random.seed(2)
        torch.manual_seed(2)
        size = hvd.size()
        local_size = hvd.local_size()
        rank = hvd.rank()

        for data_type in self.data_types:
            N = 1024
            dt_min = np.finfo(data_type).tiny.astype(np.float64)
            dt_max = math.sqrt(np.finfo(data_type).max.astype(np.float64))
            a = np.random.normal(0, 1, (N, 1)).astype(np.float64)
            r = np.array([
                dt_max**(float(i + 1) / float(size)) *
                dt_min**(float(size - i - 1) / float(size))
                for i in range(size)
            ]).reshape(size, 1).astype(np.float64)
            np.random.shuffle(r)
            q = np.dot(a, r.T).astype(data_type).astype(np.float64)
            tensor = np.zeros(N, dtype=data_type)
            tensor[:] = q[:, hvd.rank()]

            tensor = torch.from_numpy(tensor).to(device)

            hvd.allreduce_(tensor, op=hvd.Adasum)

            expected = np.sum(q, axis=1) / size
            comp = self.are_close(data_type, expected, tensor.cpu().numpy())
            if comp:
                print('Stability 2 test passed')
            else:
                print('computed: ', tensor)
                print('expected: ', expected)
                print('off by: ',
                      self.diff_ratio(expected,
                                      tensor.cpu().numpy()))
            assert comp
Example #10
    def compress(self, tensor, name=""):
        # 1-D tensors (biases, norm parameters) are passed through uncompressed.
        if tensor.dim() == 1:
            return [tensor], None

        # Reshape the gradient into a 2-D matrix and build a low-rank
        # approximation (PowerSGD-style); the factors p and q are averaged
        # across workers with in-place allreduce.
        shape = tensor.size()
        matrix = tensor.view([shape[0], -1])
        q = self.q_memory[name]
        # q, _ = torch.qr(q)
        orthogonalize(q)

        p = torch.mm(matrix, q)
        p = allreduce_(p)
        # p, _ = torch.qr(p)
        orthogonalize(p)
        q = torch.mm(matrix.t(), p)
        q = allreduce_(q)
        ctx = p, q, shape
        self.q_memory[name] = q
        return [], ctx
Example #11
    def test_horovod_allreduce_multi_gpu(self):
        """Test that the allreduce works on multiple GPUs."""
        # Only do this test if there are GPUs available.
        if not torch.cuda.is_available():
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        iter = 0
        dtypes = [
            torch.cuda.IntTensor, torch.cuda.LongTensor,
            torch.cuda.FloatTensor, torch.cuda.DoubleTensor
        ]
        if _fp16_supported:
            dtypes += [torch.cuda.HalfTensor]

        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            device = local_rank * 2 + (iter + local_rank) % 2
            tensor = tensor.cuda(device).type(dtype)
            multiplied = tensor * size
            hvd.allreduce_(tensor, average=False)
            max_difference = tensor.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [
                    torch.cuda.IntTensor, torch.cuda.LongTensor
            ]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #12
    def test_horovod_allreduce_multi_gpu(self):
        """Test that the allreduce works on multiple GPUs."""
        # Only do this test if there are GPUs available.
        if not torch.cuda.is_available():
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        iter = 0
        dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor,
                  torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            device = local_rank * 2 + (iter + local_rank) % 2
            tensor = tensor.cuda(device).type(dtype)
            multiplied = tensor * size
            hvd.allreduce_(tensor, average=False)
            max_difference = tensor.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #13
    def compensate(self, tensor, name):
        """Update the tensor with the residuals."""
        # https://github.com/synxlin/deep-gradient-compression/blob/master/dgc/memory.py
        grad = self.get_grad(name)
        if self.gradient_clipping:
            tensor_squ_sum = torch.sum(grad * grad)
            clipping_val = torch.sqrt(allreduce_(tensor_squ_sum, average=True, name=name))
            grad = grad.clamp(-clipping_val, clipping_val)
        mmt = self.get_momentum(name)
        vec = self.get_velocity(name)

        if self.momentum_masking:
            mmt.mul_(self.momentum).add_(grad)
            vec.add_(mmt)
        else:
            vec.mul_(self.momentum).add_(grad)
Example #14
    def compensate(self, tensor, name):
        """Update the tensor with the residuals."""
        if self.gradient_clipping:
            tensor_squ_sum = torch.sum(tensor * tensor)
            clipping_val = torch.sqrt(
                allreduce_(tensor_squ_sum, average=True, name=name))
            tensor = tensor.clamp(-clipping_val, clipping_val)
        if name in self.residuals:
            self.residuals[name] = self.momentum * self.residuals[name] + tensor
        else:
            self.residuals[name] = tensor
        if name in self.gradients:
            self.gradients[name] += self.residuals[name]
            tensor = self.gradients[name]
        else:
            self.gradients[name] = tensor
        return tensor
Example #15
def array_reduce_(arr: Array, average: bool = True) -> None:
    t = torch.from_numpy(arr)
    hvd.allreduce_(t, average=average)
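A usage sketch (illustrative, assuming hvd.init() has already been called): torch.from_numpy shares memory with the NumPy array, so the in-place allreduce also updates arr itself.

import numpy as np

arr = np.full(4, float(hvd.rank()), dtype=np.float32)
array_reduce_(arr, average=True)
# arr now holds the mean of the rank indices on every process.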
Example #16
        model_parameters = filter(lambda p: p.requires_grad, model.parameters())
        num_parameters = sum([np.prod(p.size()) for p in model_parameters])
        print(f'Number of trainable parameters in model: {num_parameters}')
        model.logger.add_text('hyperparams', f'{num_parameters}', 0)

    if root_process:
        print("Load data")

    # get dataset for training and testing of the model
    if root_process:
        train_set = datasets.MNIST(root="data/mnist", train=True, transform=transform_ops, download=True)
        test_set = datasets.MNIST(root="data/mnist", train=False, transform=transform_ops, download=True)

    # if distributed over multiple GPUs, set up a barrier to ensure that all processes have loaded the data
    if distributed:
        hvd.allreduce_(torch.Tensor(0), name='barrier')

    # non-root processes load the (now cached) datasets for training and testing
    if not root_process:
        train_set = datasets.MNIST(root="data/mnist", train=True, transform=transform_ops, download=True)
        test_set = datasets.MNIST(root="data/mnist", train=False, transform=transform_ops, download=True)

    # setup data sampler
    if distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_set, num_replicas=hvd.size(), rank=hvd.rank())
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            test_set, num_replicas=hvd.size(), rank=hvd.rank())

    # setup mini-batch enumerator for both train-set and test-set
    train_loader = torch.utils.data.DataLoader(
Example #17
        model.train()
        output = model(data)
        loss = criterion(output, target)
        large_batch_loss += loss.item()
        loss.backward()
        if inner_loop % large_ratio == 0:
            num_updates += 1
            optimizer.step()
            optimizer.zero_grad()
            if num_updates % args.comm_interval == args.comm_interval - 1:
                allreduce_parameters(model.state_dict())
            if batch_idx * large_ratio % 25 == 0:
                print('Train Epoch: {} [{}/{}]\tLoss: {}'.format(epoch, batch_idx * len(data), len(train_sampler), large_batch_loss))
                cur_batch_loss = torch.FloatTensor([loss.item()])
                hvd.allreduce_(cur_batch_loss)
                cur_batch_loss = float(cur_batch_loss)
                train_losses.append((time.clock() - start_time, epoch, batch_idx, cur_batch_loss))
            large_batch_loss = 0
        if batch_idx % 100 == 0:
            model.eval()
            try:
                inputs, labels = next(testset_iterator)
            except StopIteration:
                testset_iterator = iter(test_loader)
                inputs, labels = next(testset_iterator)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            accuracy = outputs.data.max(1)[1].eq(labels).sum().item() / outputs.data.shape[0]

            loss = metric_average(loss, 'avg_loss')
Example #18
def forward(data_loader,
            model,
            criterion,
            epoch=0,
            training=True,
            optimizer=None,
            U=None,
            V=None):
    # hvd
    # if args.gpus and len(args.gpus) > 1:
    #    model = torch.nn.DataParallel(model, args.gpus)

    batch_time = AverageMeter()
    pruning_time = AverageMeter()
    select_time = AverageMeter()
    comm_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()

    end = time.time()

    masks = [torch.zeros(w.size()).cuda() for w in list(model.parameters())]

    for i, (inputs, target) in enumerate(data_loader):
        # measure data loading time
        data_time.update(time.time() - end)
        if args.gpus is not None:
            target = target.cuda(non_blocking=True)
        input_var = Variable(inputs.type(args.type), volatile=not training)
        target_var = Variable(target)

        # compute output
        if not training:
            output = model(input_var)
            loss = criterion(output, target_var)

            # measure accuracy and record loss
            prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5))
            losses.update(loss.data[0], input_var.size(0))
            top1.update(prec1[0], input_var.size(0))
            top5.update(prec5[0], input_var.size(0))

        else:

            # mini_inputs = input_var.chunk(args.batch_size // args.mini_batch_size)
            # mini_targets = target_var.chunk(args.batch_size // args.mini_batch_size)

            # TODO: for debugging; should be deleted
            optimizer.zero_grad()

            # fjr simulate distributed senario
            # acc_grad = []
            # if torch.cuda.is_available():
            #     acc_grad = [torch.zeros(w.size()).cuda() for w in list(model.parameters())]
            # else:
            #     print("gpu is not avaiable for acc_grad allocation")

            # for k, mini_input_var in enumerate(mini_inputs):
            output = model(input_var)
            loss = criterion(output, target_var)

            prec1, prec5 = accuracy(output.data, target_var.data, topk=(1, 5))
            losses.update(loss.data[0], input_var.size(0))
            top1.update(prec1[0], input_var.size(0))
            top5.update(prec5[0], input_var.size(0))

            loss.backward()

            if args.use_pruning:
                clip_grad_norm(model.parameters(), 5. * (hvd.size()**-0.5))

            idx = 0
            for u, v, p in zip(U, V, model.parameters()):
                prune_begin = time.time()
                if args.use_pruning:
                    # TODO how to set rho (momentum)
                    g = p.grad.data / hvd.size()
                    g += p.data * args.weight_decay / hvd.size()
                    if args.use_nesterov:
                        u = args.momentum * (u + g)
                        v = v + u + g
                    else:
                        u = args.momentum * u + g
                        v = v + u

                    select_begin = time.time()
                    ratio = 1 - 0.999
                    if args.use_sync and i % args.sync_interval == 0:
                        masks[idx] = 1
                    else:
                        if args.use_warmup:
                            # print("iter", i, "node ", k, " pruning layer ", idx)
                            if (epoch == 0):
                                ratio = 1 - 0.75
                            elif (epoch == 1):
                                ratio = 1 - 0.9375
                            elif (epoch == 2):
                                ratio = 1 - 0.984375
                            elif (epoch == 3):
                                ratio = 1 - 0.996
                            else:
                                ratio = 1 - 0.999
                        else:
                            ratio = 1 - 0.999
                        #masks[idx], compressed_val, compressed_idx = select_top_k(v, ratio, masks[idx])
                        masks[
                            idx], compressed_val, compressed_idx = select_top_k_appr(
                                v, ratio, masks[idx])
                    select_time.update(time.time() - select_begin)

                    # TODO check compress
                    p_tmp = v * masks[idx]
                    g_ref = hvd.allreduce(p_tmp, average=False)

                    v = v * (1 - masks[idx])
                    u = u * (1 - masks[idx])

                    comm_begin = time.time()
                    g_size = p.grad.data.size()
                    msg_size = len(compressed_val)
                    # print("compressed_val size is, ", msg_size)
                    gathered_val = hvd.allgather(compressed_val)
                    gathered_idx = hvd.allgather(compressed_idx)
                    p.grad.data = p.grad.data.view(-1)
                    p.grad.data.zero_()
                    # print("gathered_val size is, ", len(gathered_val))
                    # print("val", gathered_val)
                    # print("idx", gathered_idx)
                    for node_idx in range(hvd.size()):
                        p.grad.data[gathered_idx[node_idx *
                                                 msg_size:(node_idx + 1) *
                                                 msg_size]] += gathered_val[
                                                     node_idx *
                                                     msg_size:(node_idx + 1) *
                                                     msg_size]
                    p.grad.data = p.grad.data.view(g_size)

                    comm_time.update(time.time() - comm_begin)

                    U[idx] = u  #new_residue
                    V[idx] = v
                else:
                    p.grad.data = p.grad.data / hvd.size()
                    hvd.allreduce_(p.grad.data, average=False)
                idx += 1

                pruning_time.update(time.time() - prune_begin)

            # Master
            idx = 0
            if args.use_pruning:
                pass
            else:
                for p in list(model.parameters()):
                    # print("accumulated sparsity is", check_sparsity(g))
                    # TODO 1. use pytorch sgd optimizer to calculate mom and weight_decay, set mom and wd
                    # used with pruning
                    # TODO 2. implement weight_decay and momentum by myself, set mom=0 and wd = 0
                    # used with baseline
                    g = p.grad.data
                    g += p.data * args.weight_decay
                    V[idx] = args.momentum * V[idx] + g
                    p.grad.data = V[idx]
                    clip_grad_norm(model.parameters(), 5.)
                    idx = idx + 1

            optimizer.step()

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % args.print_freq == 0:
            if hvd.local_rank() == 0:
                logging.info(
                    '{phase} - Epoch: [{0}][{1}/{2}]\t'
                    'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    'Prune {pruning_time.val:.9f} ({pruning_time.avg:.3f})\t'
                    'Select {select_time.val:.9f} ({select_time.avg:.3f})\t'
                    'Communication {comm_time.val:.9f} ({comm_time.avg:.3f})\t'
                    'Loss {loss.val:.4f} ({loss.avg:.4f})\t'
                    'Prec@1 {top1.val:.3f} ({top1.avg:.3f})\t'
                    'Prec@5 {top5.val:.3f} ({top5.avg:.3f})'.format(
                        epoch,
                        i,
                        len(data_loader),
                        phase='TRAINING' if training else 'EVALUATING',
                        batch_time=batch_time,
                        data_time=data_time,
                        pruning_time=pruning_time,
                        select_time=select_time,
                        comm_time=comm_time,
                        loss=losses,
                        top1=top1,
                        top5=top5))

    return {
        'loss': losses.avg,
        'prec1': top1.avg,
        'prec5': top5.avg,
        'U': U,
        'V': V
    }
Example #19
def clip_grad_value_by_global_norm_(grad, name=None):
    grad_square_sum = torch.sum(grad.square())
    clip_value = torch.sqrt(allreduce_(grad_square_sum, average=True, name=name))
    grad.data.clamp_(min=-clip_value, max=clip_value)
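As above, a hypothetical usage sketch (model is a placeholder): clamp every gradient element to plus/minus the globally averaged L2 norm of that gradient.

for name, p in model.named_parameters():
    if p.grad is not None:
        clip_grad_value_by_global_norm_(p.grad, name=name)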
Example #20
    def batch_translate(self, input_path, output_path, field=0, remove_subword_tokens=True, max_length=100, resume=False):
        """Translate a file."""
        # Check whether using multiple GPUs
        try:
            import horovod.torch as hvd
        except ImportError:
            pass
        # If using multigpu, then separate the input file
        if self._is_multigpu:
            sync_tensor = torch.tensor(0)
            tmp_output_path = "/tmp/{}.{}".format(os.path.basename(output_path), hvd.local_rank())
        else:
            sync_tensor = None
            tmp_output_path = output_path
        result_map = {}
        if self._is_multigpu and resume and os.path.exists(tmp_output_path):
            for line in open(tmp_output_path):
                pair = line.strip("\n").split("\t")
                if len(pair) != 2:
                    print(line)
                id, line = pair
                result_map[int(id)] = line
            print("loaded {} computed results".format(len(result_map)))
        fout = open(tmp_output_path, "w")
        test_lines = list(open(input_path))
        err = 0
        for i, line in enumerate(test_lines):
            # Gather error counts in multigpu mode
            if self._is_multigpu:
                if i % (10 * hvd.size()) == 0:
                    sync_tensor.fill_(err)
                    hvd.allreduce_(sync_tensor, average=False)
                if i % hvd.size() != hvd.local_rank():
                    continue
            # Translate
            pair = line.strip().split("\t")
            src_sent = pair[field]
            if len(src_sent.split()) > max_length:
                result = "x"
            else:
                if i in result_map:
                    result = result_map[i]
                else:
                    result, _ = self.translate("<s> {} </s>".format(src_sent))

            if result is None:
                result = ""
            if remove_subword_tokens:
                if "▁" in result:
                    result = "".join(result.split()).replace("▁", " ").strip()
                else:
                    result = result.replace("@@ ", "")
            if not result:
                err += 1
            # Write the results and print progress
            if self._is_multigpu:
                fout.write("{}\t{}\n".format(i, result))
            else:
                fout.write("{}\n".format(result))
            fout.flush()
            if self._is_multigpu and hvd.local_rank() == 0:
                sys.stdout.write("translating: {:.0f}%  err: {}    \r".format(float(i + 1) * 100 / len(test_lines),
                                                                              int(sync_tensor)))
            elif not self._is_multigpu:
                sys.stdout.write("translating: {:.0f}%  err: {}    \r".format(float(i + 1) * 100 / len(test_lines), err))
            sys.stdout.flush()
        if is_root_node():
            sys.stdout.write("\n")
        fout.close()
        if self._is_multigpu:
            # Wait for all processes to finish
            hvd.allreduce_(sync_tensor, average=False)
            # Concatenate all separated translation results
            if hvd.local_rank() == 0:
                results = []
                for i in range(hvd.size()):
                    for line in open("/tmp/{}.{}".format(os.path.basename(output_path), i)):
                        id, result = line.strip("\n").split("\t")
                        results.append((int(id), result))
                results.sort()
                with open(output_path, "w") as fout:
                    for _, result in results:
                        fout.write(result + "\n")
Example #21
def tensor_mean_and_var(t: Tensor) -> Tuple[Tensor, Tensor]:
    mean = hvd.allreduce_(t.mean(dim=0))
    var = hvd.allreduce_((t - mean).pow(2).mean(dim=0))
    return mean, var
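A usage sketch (local_batch is illustrative): with equal batch sizes on every rank, this returns the per-feature mean and variance over the global batch, e.g. for synchronized normalization.

local_batch = torch.randn(32, 8)  # one rank's mini-batch of shape (batch, features)
mean, var = tensor_mean_and_var(local_batch)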
Example #22
def get_eigen(model,
              inputs,
              targets,
              criterion,
              maxIter=50,
              tol=1e-3,
              comm=True):
    """
    compute the top eigenvalues of model parameters and
    the corresponding eigenvectors.

    change the model to evaluation mode, otherwise the batch Normalization Layer will change.
    If you call this functino during training, remember to change the mode back to training mode.
    model.eval()
    """

    model.eval()
    # torch.no_grad()

    #model_copy = squeezenet1_1(pretrained=False)
    #model_copy.load_state_dict(model.state_dict())
    #optimizer = optim.SGD(model_copy.parameters(), lr=0.001 * hvd.size(), momentum=0.9)
    outputs = model(inputs)
    loss = criterion(outputs, targets)
    loss.backward(create_graph=True)

    params, gradsH = get_params_grad(model)
    v = [torch.randn(p.size()) for p in params]
    v = normalization(v)
    if comm:
        hvd.broadcast_parameters(v, root_rank=0)

    eigenvalue = None

    for i in range(maxIter):
        print(i)
        model.zero_grad()
        Hv = hessian_vector_product(gradsH, params, v)
        if comm:
            handles = []
            for j in range(len(Hv)):
                handles.append(
                    hvd.allreduce_async_(
                        Hv[j],
                        name='reduce random vector update {}'.format(j)))
            for handle in handles:
                hvd.synchronize(handle)
        eigenvalue_tmp = group_product(Hv, v).item()
        v = normalization(Hv)
        if eigenvalue is None:
            eigenvalue = eigenvalue_tmp
        else:
            if abs(eigenvalue - eigenvalue_tmp) < tol:
                if comm:
                    return eigenvalue_tmp, v
            else:
                eigenvalue = eigenvalue_tmp
    if not comm:
        print("{} is here".format(hvd.rank()))
        eigenvalue = torch.FloatTensor([eigenvalue])
        hvd.allreduce_(eigenvalue, name='eigenvalue')
        print("allreduced eigs for rank {}".format(hvd.rank()))
        eigenvalue = float(eigenvalue)
        if hvd.rank() == 0:
            print("No Communication eigenvalue approximated at {}".format(
                eigenvalue))
    return eigenvalue, v
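A usage sketch under assumed names (model, inputs, targets and criterion are placeholders): estimate the top Hessian eigenvalue on a single batch, then restore training mode.

eigenvalue, eigenvector = get_eigen(model, inputs, targets, criterion,
                                    maxIter=20, tol=1e-3, comm=True)
model.train()  # get_eigen leaves the model in eval mode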