Example #1
    def test_horovod_broadcast_grad(self):
        """Test the correctness of the broadcast gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
            tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            tensor = torch.autograd.Variable(tensor, requires_grad=True)

            broadcasted_tensor = hvd.broadcast(tensor, root_rank)
            broadcasted_tensor.backward(torch.ones([17] * dim))
            grad_out = tensor.grad.data.numpy()

            c = size if rank == root_rank else 0
            expected = np.ones([17] * dim) * c
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
Example #2
    def test_horovod_allreduce_grad(self):
        """Test the correctness of the allreduce gradient."""
        hvd.init()
        size = hvd.size()
        dtypes = [torch.IntTensor, torch.LongTensor,
                  torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = tensor.type(dtype)
            tensor = torch.autograd.Variable(tensor, requires_grad=True)
            summed = hvd.allreduce(tensor, average=False)

            summed.backward(torch.ones([17] * dim))
            grad_out = tensor.grad.data.numpy()

            expected = np.ones([17] * dim) * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
Example #3
    def test_horovod_allgather(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            gathered = hvd.allgather(tensor)

            assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

            for i in range(size):
                rank_tensor = gathered[i * 17:(i + 1) * 17]
                assert list(rank_tensor.shape) == [17] * dim, \
                    'hvd.allgather produces incorrect gathered shape'
                assert rank_tensor.data.min() == i, 'hvd.allgather produces incorrect gathered tensor'
                assert rank_tensor.data.max() == i, 'hvd.allgather produces incorrect gathered tensor'
Example #4
    def test_horovod_allreduce_cpu_gpu_error(self):
        """Test that the allreduce raises an error if different ranks try to
        perform reduction on CPU and GPU."""
        # Only do this test if there are GPUs available.
        if not torch.cuda.is_available():
            return

        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # Same dimensions, but the tensor lives on the GPU on even ranks and on the CPU on odd ranks
        dims = [17] * 3
        if rank % 2 == 0:
            tensor = torch.cuda.FloatTensor(*dims)
        else:
            tensor = torch.FloatTensor(*dims)

        try:
            hvd.allreduce(tensor)
            assert False, 'hvd.allreduce did not throw error'
        except torch.FatalError:
            pass
Example #5
    def test_horovod_allreduce_inplace(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
        hvd.init()
        size = hvd.size()
        dtypes = [torch.IntTensor, torch.LongTensor,
                  torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = tensor.type(dtype)
            multiplied = tensor * size
            hvd.allreduce_(tensor, average=False)
            max_difference = tensor.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                      torch.cuda.IntTensor, torch.cuda.LongTensor]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #6
    def test_horovod_allreduce_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different rank or dimension."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # Same number of dimensions, different sizes
        torch.manual_seed(1234)
        dims = [17 + rank] * 3
        tensor = torch.FloatTensor(*dims).random_(-100, 100)
        try:
            hvd.allreduce(tensor)
            assert False, 'hvd.allreduce did not throw error'
        except torch.FatalError:
            pass

        # Same number of elements, different number of dimensions
        torch.manual_seed(1234)
        if rank == 0:
            dims = [17, 23 * 57]
        else:
            dims = [17, 23, 57]
        tensor = torch.FloatTensor(*dims).random_(-100, 100)
        try:
            hvd.allreduce(tensor)
            assert False, 'hvd.allreduce did not throw error'
        except torch.FatalError:
            pass
Example #7
    def test_horovod_broadcast_inplace(self):
        """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        root_ranks = list(range(size))
        for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
            tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
            root_tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(root_rank)
            tensor = tensor.type(dtype)
            root_tensor = root_tensor.type(dtype)
            broadcasted_tensor = hvd.broadcast_(tensor, root_rank)
            assert (tensor == broadcasted_tensor).min() == 1, \
                'hvd.broadcast does not modify source tensor'
            assert (broadcasted_tensor == root_tensor).min() == 1, \
                'hvd.broadcast produces incorrect broadcasted tensor'
Example #8
    def backward(ctx, grad_output):
        grad_reduced = allreduce(grad_output, average=False)

        dim_t = torch.IntTensor([ctx.dim])
        dim = allgather(dim_t).view(size())

        r = rank()
        offset = torch.sum(dim.narrow(0, 0, r)).data[0] if r != 0 else 0
        return grad_reduced.narrow(0, offset, ctx.dim), None
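For context, here is a minimal sketch of the forward pass that would pair with this backward, written as a torch.autograd.Function around hvd.allgather; the class name and the explicit hvd.allgather/hvd.allreduce/hvd.rank/hvd.size calls are illustrative assumptions, not Horovod's actual internal implementation:

import torch
import horovod.torch as hvd

class AllgatherSketch(torch.autograd.Function):
    # Illustrative only: forward() records how many rows this rank
    # contributed so backward() can slice its share out of the gradient
    # that has been summed across all workers.

    @staticmethod
    def forward(ctx, tensor):
        ctx.dim = tensor.shape[0]
        return hvd.allgather(tensor)

    @staticmethod
    def backward(ctx, grad_output):
        grad_reduced = hvd.allreduce(grad_output, average=False)
        dim_t = torch.IntTensor([ctx.dim])
        dim = hvd.allgather(dim_t).view(hvd.size())
        r = hvd.rank()
        offset = int(dim.narrow(0, 0, r).sum().item()) if r != 0 else 0
        return grad_reduced.narrow(0, offset, ctx.dim)

A caller would invoke it as AllgatherSketch.apply(tensor); hvd.allgather itself already provides this gradient, so the sketch is only meant to make Example #8 easier to read.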
Example #9
    def test_horovod_broadcast_rank_error(self):
        """Test that the broadcast returns an error if different ranks
        specify different root rank."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        tensor = torch.FloatTensor(*([17] * 3)).fill_(1)

        try:
            hvd.broadcast(tensor, rank)
            assert False, 'hvd.broadcast did not throw error'
        except torch.FatalError:
            pass
Example #10
    def test_horovod_allreduce_async_fused(self):
        """Test that the allreduce correctly sums 1D, 2D, 3D tensors
        with Tensor Fusion."""
        hvd.init()
        size = hvd.size()
        dtypes = [torch.IntTensor, torch.LongTensor,
                  torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                       torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        tests = []
        is_hvd_poll_false_once = False
        for dtype, dim in itertools.product(dtypes, dims):
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            tensor = tensor.type(dtype)
            handle = hvd.allreduce_async(tensor, average=False)
            if not hvd.poll(handle):
                is_hvd_poll_false_once = True
            multiplied = tensor * size
            tests.append((dtype, multiplied, handle))

        # Make sure it's an asynchronous operation.
        assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?'

        for dtype, multiplied, handle in tests:
            summed = hvd.synchronize(handle)
            max_difference = summed.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                      torch.cuda.IntTensor, torch.cuda.LongTensor]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #11
    def test_horovod_allgather_grad(self):
        """Test the correctness of the allgather gradient."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = torch.FloatTensor(
                *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            tensor = torch.autograd.Variable(tensor, requires_grad=True)

            grad_list = []
            for r, tensor_size in enumerate(tensor_sizes):
                grad_list.append(torch.ones([tensor_size] + [17] * (dim - 1)) * r)
            grad_ys = torch.cat(grad_list, dim=0)

            gathered = hvd.allgather(tensor)
            gathered.backward(grad_ys)
            grad_out = tensor.grad.data.numpy()

            expected = np.ones(
                [tensor_sizes[rank]] + [17] * (dim - 1)
            ) * rank * size
            err = np.linalg.norm(expected - grad_out)
            self.assertLess(err, 0.00000001,
                            "gradient %s differs from expected %s, "
                            "error: %s" % (grad_out, expected, str(err)))
Example #12
    def test_horovod_broadcast_error(self):
        """Test that the broadcast returns an error if any dimension besides
        the first is different among the tensors being broadcasted."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        tensor_size = [17] * 3
        tensor_size[1] = 10 * (rank + 1)
        tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank)

        try:
            hvd.broadcast(tensor, 0)
            assert False, 'hvd.broadcast did not throw error'
        except torch.FatalError:
            pass
Example #13
    def test_horovod_broadcast_type_error(self):
        """Test that the broadcast returns an error if the types being broadcasted
        differ among the processes"""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        tensor_size = [17] * 3
        if rank % 2 == 0:
            tensor = torch.IntTensor(*tensor_size)
        else:
            tensor = torch.FloatTensor(*tensor_size)

        try:
            hvd.broadcast(tensor, 0)
            assert False, 'hvd.broadcast did not throw error'
        except torch.FatalError:
            pass
Example #14
    def test_horovod_allreduce_type_error(self):
        """Test that the allreduce raises an error if different ranks try to
        send tensors of different type."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        # This test does not apply if there is only one worker.
        if size == 1:
            return

        # Same dimensions, different type
        dims = [17] * 3
        if rank % 2 == 0:
            tensor = torch.IntTensor(*dims)
        else:
            tensor = torch.FloatTensor(*dims)

        try:
            hvd.allreduce(tensor)
            assert False, 'hvd.allreduce did not throw error'
        except torch.FatalError:
            pass
Example #15
    def test_horovod_allgather_variable_size(self):
        """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
        even if those tensors have different sizes along the first dim."""
        hvd.init()
        rank = hvd.rank()
        size = hvd.size()

        dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
                  torch.IntTensor, torch.LongTensor, torch.FloatTensor, torch.DoubleTensor]
        if torch.cuda.is_available():
            dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor, torch.cuda.ShortTensor,
                       torch.cuda.IntTensor, torch.cuda.LongTensor, torch.cuda.FloatTensor,
                       torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            # Support tests up to MPI Size of 35
            if size > 35:
                break

            tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
            tensor_sizes = tensor_sizes[:size]

            tensor = torch.FloatTensor(
                *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
            tensor = tensor.type(dtype)
            gathered = hvd.allgather(tensor)

            expected_size = sum(tensor_sizes)
            assert list(gathered.shape) == [expected_size] + [17] * (dim - 1)

            for i in range(size):
                rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
                rank_tensor = gathered[sum(
                    tensor_sizes[:i]):sum(tensor_sizes[:i + 1])]
                assert list(rank_tensor.shape) == rank_size
                assert rank_tensor.data.min() == i
                assert rank_tensor.data.max() == i
Example #16
    def test_horovod_allreduce_multi_gpu(self):
        """Test that the allreduce works on multiple GPUs."""
        # Only do this test if there are GPUs available.
        if not torch.cuda.is_available():
            return

        hvd.init()
        local_rank = hvd.local_rank()
        size = hvd.size()

        iter = 0
        dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor,
                  torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
        dims = [1, 2, 3]
        for dtype, dim in itertools.product(dtypes, dims):
            iter += 1
            torch.manual_seed(1234)
            tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
            device = local_rank * 2 + (iter + local_rank) % 2
            tensor = tensor.cuda(device).type(dtype)
            multiplied = tensor * size
            hvd.allreduce_(tensor, average=False)
            max_difference = tensor.sub(multiplied).max()

            # Threshold for floating point equality depends on number of
            # ranks, since we're comparing against precise multiplication.
            if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]:
                threshold = 0
            elif size < 10:
                threshold = 1e-4
            elif size < 15:
                threshold = 5e-4
            else:
                break

            assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
Example #17
    def synchronize(self):
        if hvd.size() > 1:
            for p in self._handles:
                handle = self._handles[p]
                synchronize(handle)
                p_size = np.prod(p.size())
                if self._use_allgather and p_size > self._plan1:

                    torch.cuda.synchronize()
                    begin_time_sync = time.time()
                    # decompress the received sparse gradient messages
                    name = self._parameter_names.get(p)

                    g_size = p.grad.data.size()
                    p_flatten = p.grad.data.view(-1)
                    p_flatten.zero_()

                    torch.cuda.synchronize()
                    begin_unpack_time = time.time()
                    if self._use_gpu:
                        count_nnz = 0
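                        # Layout of each rank's compressed message (inferred from the
                        # indexing below): for large tensors (> _plan3) each node sends
                        # [num_nonzeros, indices..., values...]; for smaller tensors a
                        # fixed msg_size is used and each node sends [indices..., values...].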
                        if p_size > self._plan3:
                            offset = 0
                            for node_idx in range(hvd.size()):
                                msg_size = self._compressed_msg[name][
                                    offset].type('torch.cuda.LongTensor')
                                offset += 1
                                p_flatten[self._compressed_msg[name][ offset: \
                                        offset + msg_size].type('torch.cuda.LongTensor')] += \
                                        self._compressed_msg[name][offset + msg_size : \
                                        offset + 2*msg_size]
                                offset += msg_size * 2
                            count_nnz += msg_size
                        else:
                            msg_size = self._compressed_msg_size[name]
                            for node_idx in range(hvd.size()):
                                p_flatten[self._compressed_msg[name][node_idx*msg_size*2 : \
                                    node_idx*msg_size*2 + msg_size].type('torch.cuda.LongTensor')] += \
                                    self._compressed_msg[name][node_idx*msg_size*2 + msg_size : \
                                    node_idx*msg_size*2 + 2*msg_size]

                        #if hvd.rank() == 0:
                        #    print("sparsity ", name, check_sparsity(p_flatten))

                    p.grad.data = p_flatten.view(g_size)
                    torch.cuda.synchronize()
                    self.unpack_time += time.time() - begin_unpack_time
                    torch.cuda.synchronize()
                    self.pruning_time += time.time() - begin_time_sync

                    if self._debug:
                        diff = torch.sum(self._v_ref[name] - p.grad.data)
                        if (torch.abs(diff) > 1e-3):
                            print("error diff is, ", diff, name, p.size())

                else:
                    pass

        self._handles.clear()
Example #18
def test_resnet(affine, track_running_stats):
    torch.manual_seed(0)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    if torch.cuda.is_available():
        device = torch.device('cuda:%d' % hvd.rank())
    else:
        device = torch.device('cpu')

    if hvd.rank() == 0:
        print('affine:', affine, 'track_running_stats', track_running_stats)

    bn = functools.partial(nn.BatchNorm2d,
                           track_running_stats=track_running_stats,
                           affine=affine)
    sync_bn = functools.partial(SynchronizedBatchNorm2d,
                                track_running_stats=track_running_stats,
                                affine=affine)

    # prepare model
    model = models.resnet18(norm_layer=bn).to(device)
    sync_model = models.resnet18(norm_layer=sync_bn).to(device)
    sync_model.load_state_dict(model.state_dict())
    # print(sync_model)

    # prepare inputs
    num_samples = 8
    num_steps = 10
    inputs = torch.rand(num_steps, num_samples, 3, 32, 32).float().to(device)
    start_idx = hvd.rank() * int(num_samples / hvd.size())
    end_idx = (hvd.rank() + 1) * int(num_samples / hvd.size())

    # test inference
    if hvd.rank() == 0:
        print('[INFERENCE PHASE-1]')

    model.eval()
    with torch.no_grad():
        for i in range(num_steps):
            t1 = time.time()
            outputs = model(inputs[i])
            t2 = (time.time() - t1) * 1000
            if hvd.rank() == 0:
                view('model.outputs.%d-%.4f' % (i, t2), outputs)

    sync_model.eval()
    with torch.no_grad():
        for i in range(num_steps):
            t1 = time.time()
            outputs = sync_model(inputs[i])
            t2 = (time.time() - t1) * 1000
            if hvd.rank() == 0:
                view('sync_model.outputs.%d-%.4f' % (i, t2), outputs)

    # test training
    if hvd.rank() == 0:
        print('[TRAINING PHASE]')

    # using pytorch-official version
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    for i in range(num_steps):
        t1 = time.time()
        outputs = model(inputs[i])
        t2 = (time.time() - t1) * 1000
        loss = outputs.mean()
        optimizer.zero_grad()
        loss.backward()
        if hvd.rank() == 0:
            view('model.outputs.%d-%.4f' % (i, t2), outputs)
        optimizer.step()

    # using sync-version
    sync_model.train()
    optimizer = torch.optim.SGD(sync_model.parameters(), lr=0.1)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=sync_model.named_parameters())
    hvd.broadcast_parameters(sync_model.state_dict(), root_rank=0)

    for i in range(num_steps):
        t1 = time.time()
        outputs = sync_model(inputs[i, start_idx:end_idx])
        t2 = (time.time() - t1) * 1000
        loss = outputs.mean()
        optimizer.zero_grad()
        loss.backward()
        outputs = hvd.allgather(outputs)
        if hvd.rank() == 0:
            view('sync_model.outputs.%d-%.4f' % (i, t2), outputs)
        optimizer.step()

    # test inference
    if hvd.rank() == 0:
        print('[INFERENCE PHASE-2]')

    model.eval()
    with torch.no_grad():
        for i in range(num_steps):
            t1 = time.time()
            outputs = model(inputs[i])
            t2 = (time.time() - t1) * 1000
            if hvd.rank() == 0:
                view('model.outputs.%d-%.4f' % (i, t2), outputs)

    sync_model.eval()
    with torch.no_grad():
        for i in range(num_steps):
            t1 = time.time()
            outputs = sync_model(inputs[i])
            t2 = (time.time() - t1) * 1000
            if hvd.rank() == 0:
                view('sync_model.outputs.%d-%.4f' % (i, t2), outputs)

    if hvd.rank() == 0:
        # for key, value in sync_model.state_dict().items():
        #   print(key, value.shape)
        print('\n')
Example #19
    models.squeezenet: models.squeezenet.__all__[1:],
    models.vgg: models.vgg.__all__[1:],
    models.mobilenet: models.mobilenet.__all__[1:],
    models.shufflenetv2: models.shufflenetv2.__all__[1:]
}

precisions = ["float", "half"]
for precision in precisions:
    for model_type in MODEL_LIST.keys():
        for model_name in MODEL_LIST[model_type]:
            # Set up standard model.
            model = getattr(model_type, model_name)()
            model = getattr(model, precision)()

            # By default, Adasum doesn't need scaling up learning rate.
            lr_scaler = hvd.size() if not args.use_adasum else 1

            if args.cuda:
                # Move model to GPU.
                model.cuda()
                # If using GPU Adasum allreduce, scale learning rate by local_size.
                if args.use_adasum and hvd.nccl_built():
                    lr_scaler = hvd.local_size()

            optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

            # Horovod: (optional) compression algorithm.
            compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

            # Horovod: wrap optimizer with DistributedOptimizer.
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=model.named_parameters(),
                compression=compression,
                op=hvd.Adasum if args.use_adasum else hvd.Average)
Example #20
    np.random.seed(seed)
    torch.backends.cudnn.deterministic = True

    # Distributed: set up Horovod over multiple GPUs
    if distributed:
        import horovod.torch as hvd

        # initialize horovod
        hvd.init()

        # pin gpu to "local rank" (see Horovod documentation)
        torch.cuda.set_device(hvd.local_rank())
        print(f"My local rank is {hvd.local_rank()}")

        # distribute mini-batches over the different GPUs
        batch_size //= hvd.size()

    # string-tag for logging
    tag = f'nz{nz}'

    # define the "root process": only one worker has to log relevant values
    root_process = True
    if distributed and not hvd.rank() == 0:
        root_process = False

    # set GPU/CPU options
    use_cuda = torch.cuda.is_available()
    cudastring = "cuda" if distributed else f"cuda:{gpu}"
    device = torch.device(cudastring if use_cuda else "cpu")
Example #21
    def horovod_train(self, model):
        # call setup after the ddp process has connected
        if not self.testing:
            self.setup('fit')
            model.setup('fit')

        if torch.cuda.is_available() and self.on_gpu:
            # Horovod: pin GPU to local rank
            assert self.root_gpu == hvd.local_rank()
            torch.cuda.set_device(self.root_gpu)
            model.cuda(self.root_gpu)

        # avoid duplicating progress bar
        if hvd.rank() != 0 and self.progress_bar_callback is not None:
            self.progress_bar_callback.disable()

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(
            model)

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in self.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= hvd.size()

        # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
        for scheduler in self.lr_schedulers:
            scheduler = scheduler['scheduler']
            if isinstance(scheduler, _LRScheduler):
                scheduler.base_lrs = [
                    lr * hvd.size() for lr in scheduler.base_lrs
                ]

        if self.use_amp:
            model, optimizers = model.configure_apex(amp, model,
                                                     self.optimizers,
                                                     self.amp_level)
            self.optimizers = optimizers
            self.reinit_scheduler_properties(self.optimizers,
                                             self.lr_schedulers)

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for optimizer in self.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def filter_named_parameters(model, optimizer):
            opt_params = set([
                p for group in optimizer.param_groups
                for p in group.get('params', [])
            ])
            return [(name, p) for name, p in model.named_parameters()
                    if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        self.optimizers = [
            hvd.DistributedOptimizer(optimizer,
                                     named_parameters=filter_named_parameters(
                                         model, optimizer))
            for optimizer in self.optimizers
        ]

        # Update logger rank info from Horovod to avoid race conditions from  different ranks
        # creating directories / writing files in the same locations.
        self.global_rank = hvd.rank()
        rank_zero_only.rank = self.global_rank

        with ExitStack() as stack:
            for optimizer in self.optimizers:
                # Synchronization will be performed explicitly following backward()
                stack.enter_context(optimizer.skip_synchronize())

            result = self.run_pretrain_routine(model)

        # Make sure all workers have finished training before returning to the user
        hvd.join()
        return result
Example #22
if args.cuda:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ]))
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,
                                          sampler=test_sampler, **kwargs)


class Net(nn.Module):
Example #23
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu, hvd.rank(),
                                              opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    hps_file = f'{opts.output_dir}/log/hps.json'
    model_opts = Struct(load_json(hps_file))
    model_config = f'{opts.output_dir}/log/model_config.json'

    # load DBs and image dirs
    video_ids = get_video_ids(opts.query_txt_db)
    if opts.task != "didemo_video_only":
        video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                          model_opts.vfeat_interval,
                                          model_opts)
    else:
        txt_meta = load_json(os.path.join(opts.query_txt_db, "meta.json"))
        video_db = load_video_only_dataset(opts.vfeat_db, txt_meta,
                                           model_opts.vfeat_interval,
                                           model_opts)
    assert opts.split in opts.query_txt_db
    q_txt_db = QueryTokLmdb(opts.query_txt_db, -1)
    if opts.task != "didemo_video_only":
        inf_dataset = VcmrFullEvalDataset
    else:
        inf_dataset = VcmrVideoOnlyFullEvalDataset
    eval_dataset = inf_dataset(video_ids,
                               video_db,
                               q_txt_db,
                               distributed=model_opts.distributed_eval)

    # Prepare model
    if exists(opts.checkpoint):
        ckpt_file = opts.checkpoint
    else:
        ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt'
    checkpoint = torch.load(ckpt_file)
    img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings" +
                                ".position_embeddings.weight")
    assert img_pos_embed_weight_key in checkpoint
    max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key])

    model = HeroForVcmr.from_pretrained(
        model_config,
        state_dict=checkpoint,
        vfeat_dim=VFEAT_DIM,
        max_frm_seq_len=max_frm_seq_len,
        lw_neg_ctx=model_opts.lw_neg_ctx,
        lw_neg_q=model_opts.lw_neg_q,
        lw_st_ed=0,
        ranking_loss_type=model_opts.ranking_loss_type,
        use_hard_negative=False,
        hard_pool_size=model_opts.hard_pool_size,
        margin=model_opts.margin,
        use_all_neg=model_opts.use_all_neg,
        drop_svmr_prob=model_opts.drop_svmr_prob)
    model.to(device)
    if opts.fp16:
        model = amp.initialize(model, enabled=opts.fp16, opt_level='O2')

    eval_dataloader = DataLoader(eval_dataset,
                                 batch_size=opts.batch_size,
                                 num_workers=opts.n_workers,
                                 pin_memory=opts.pin_mem,
                                 collate_fn=vcmr_full_eval_collate)
    eval_dataloader = PrefetchLoader(eval_dataloader)

    _, results = validate_full_vcmr(model, eval_dataloader, opts.split, opts,
                                    model_opts)
    result_dir = f'{opts.output_dir}/results_{opts.split}'

    if not exists(result_dir) and rank == 0:
        os.makedirs(result_dir)

    all_results = list(concat(all_gather_list(results)))
    if hvd.rank() == 0:
        save_json(all_results,
                  f'{result_dir}/results_{opts.checkpoint}_all.json')
        LOGGER.info('All results written......')
Example #24
if args.cuda:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(args.seed)


kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ]))
test_sampler = torch.utils.data.distributed.DistributedSampler(
    test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=args.test_batch_size,
                                          sampler=test_sampler, **kwargs)


class Net(nn.Module):
Example #25
    # This flag allows you to enable the inbuilt cudnn
    # auto-tuner to find the best algorithm to use for your hardware.
    cudnn.benchmark = True

    if hvd.rank() == 0:
        # Announce
        print(args)

        # Init tensorboard
        rmtree(args.tensorboard_path, ignore_errors=True)
        writer = SummaryWriter(args.tensorboard_path)

    # DataLoader
    train_dataset = WFLW('train', path=args.data_dir)
    train_sampler = DistributedSampler(dataset=train_dataset,
                                       num_replicas=hvd.size(),
                                       rank=hvd.rank())
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=args.per_batch,
                              sampler=train_sampler)

    # Model
    # NOTE: the norm implementation has issues
    models = [
        BoundaryHeatmapEstimator(
            args.img_channels,
            args.hourglass_channels,
            args.boundary,
        ).cuda(),
    ]
    models.append(LandmarksRegressor(channels=args.hourglass_channels).cuda())
Example #26
def main(args):
    # Create a model, synthetic data, and a guide.
    pyro.set_rng_seed(args.seed)
    model = Model(args.size)
    covariates = torch.randn(args.size)
    data = model(covariates)
    guide = AutoNormal(model)

    if args.horovod:
        # Initialize Horovod and set PyTorch globals.
        import horovod.torch as hvd
        hvd.init()
        torch.set_num_threads(1)
        if args.cuda:
            torch.cuda.set_device(hvd.local_rank())
    if args.cuda:
        torch.set_default_tensor_type("torch.cuda.FloatTensor")
    device = torch.tensor(0).device

    if args.horovod:
        # Initialize parameters and broadcast to all workers.
        guide(covariates[:1], data[:1])  # Initializes model and guide.
        hvd.broadcast_parameters(guide.state_dict(), root_rank=0)
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # Create an ELBO loss and a Pyro optimizer.
    elbo = Trace_ELBO()
    optim = Adam({"lr": args.learning_rate})

    if args.horovod:
        # Wrap the basic optimizer in a distributed optimizer.
        optim = HorovodOptimizer(optim)

    # Create a dataloader.
    dataset = torch.utils.data.TensorDataset(covariates, data)
    if args.horovod:
        # Horovod requires a distributed sampler.
        sampler = torch.utils.data.distributed.DistributedSampler(
            dataset, hvd.size(), hvd.rank())
    else:
        sampler = torch.utils.data.RandomSampler(dataset)
    config = {"batch_size": args.batch_size, "sampler": sampler}
    if args.cuda:
        config["num_workers"] = 1
        config["pin_memory"] = True
        # Try to use forkserver to spawn workers instead of fork.
        if (hasattr(mp, "_supports_context") and mp._supports_context and
                "forkserver" in mp.get_all_start_methods()):
            config["multiprocessing_context"] = "forkserver"
    dataloader = torch.utils.data.DataLoader(dataset, **config)

    # Run stochastic variational inference.
    svi = SVI(model, guide, optim, elbo)
    for epoch in range(args.num_epochs):
        if args.horovod:
            # Set rng seeds on distributed samplers. This is required.
            sampler.set_epoch(epoch)

        for step, (covariates_batch, data_batch) in enumerate(dataloader):
            loss = svi.step(covariates_batch.to(device), data_batch.to(device))

            if args.horovod:
                # Optionally average loss metric across workers.
                # You can do this with arbitrary torch.Tensors.
                loss = torch.tensor(loss)
                loss = hvd.allreduce(loss, "loss")
                loss = loss.item()

                # Print only on the rank=0 worker.
                if step % 100 == 0 and hvd.rank() == 0:
                    print("epoch {} step {} loss = {:0.4g}".format(epoch, step, loss))
            else:
                if step % 100 == 0:
                    print("epoch {} step {} loss = {:0.4g}".format(epoch, step, loss))

    if args.horovod:
        # After we're done with the distributed parts of the program,
        # we can shutdown all but the rank=0 worker.
        hvd.shutdown()
        if hvd.rank() != 0:
            return

    if args.outfile:
        print("saving to {}".format(args.outfile))
        torch.save({"model": model, "guide": guide}, args.outfile)
Example #27
def split_by_rank(data):
    points_per_rank = int(len(data) / hvd.size())
    first = hvd.rank() * points_per_rank
    last = first + points_per_rank
    return data[first:last]
Example #28
    def get_world_size(self) -> int:
        return hvd.size()
Example #29
    def __init__(self,
                 model,
                 lr=0.1,
                 factor_decay=0.95,
                 damping=0.001,
                 kl_clip=0.001,
                 fac_update_freq=10,
                 kfac_update_freq=100,
                 batch_averaged=True,
                 diag_blocks=1,
                 diag_warmup=0,
                 distribute_layer_factors=None,
                 sparse=False,
                 sparse_ratio=0.01,
                 exclude_parts=''):

        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 < factor_decay <= 1:
            raise ValueError(
                "Invalid factor decay rate: {}".format(factor_decay))
        if not 0.0 < damping:
            raise ValueError("Invalid damping: {}".format(damping))
        if not 0.0 < kl_clip:
            raise ValueError("Invalid clipping value: {}".format(kl_clip))
        if not 0 < fac_update_freq:
            raise ValueError(
                "Invalid factor update frequency: {}".format(fac_update_freq))
        if not 0 < kfac_update_freq:
            raise ValueError(
                "Invalid K-FAC update frequency: {}".format(kfac_update_freq))
        if not 0 == kfac_update_freq % fac_update_freq:
            print(
                "WARNING: it is suggested that kfac_update_freq be a multiple of fac_update_freq"
            )
        if not 0 < diag_blocks:
            raise ValueError(
                "Invalid diagonal block approx count: {}".format(diag_blocks))
        if not 0 <= diag_blocks:
            raise ValueError(
                "Invalid diagonal block approx count: {}".format(diag_blocks))
        if not 1 == diag_blocks:
            print(
                "WARNING: diag_blocks > 1 is experimental and may give poor results."
            )

        # For compatibility with `KFACParamScheduler`
        defaults = dict(lr=lr,
                        damping=damping,
                        fac_update_freq=fac_update_freq,
                        kfac_update_freq=kfac_update_freq)

        super(KFAC, self).__init__(model.parameters(), defaults)

        self.computeA = ComputeA()
        self.computeG = ComputeG()
        self.known_modules = {'Linear', 'Conv2d'}
        self.modules = []
        self.module_names = []
        self.name_module_map = {}
        self.module_name_map = {}
        self._register_modules(model)
        self.fw_merged_comm = MergedCommAllReduce(self.module_names,
                                                  prefix='forward',
                                                  merge=True,
                                                  single_layer=False)
        self.bw_merged_comm = MergedCommAllReduce(self.module_names,
                                                  prefix='backward',
                                                  merge=False,
                                                  single_layer=False)
        self.inverseA_merged_comm = MergedCommBcast(self.module_names,
                                                    prefix='inverseA')
        self.inverseG_merged_comm = MergedCommBcast(self.module_names,
                                                    prefix='inverseG')
        self.multi_comm = MultiTensorComm()
        self.steps = 0

        # Dictionaries keyed by `module` to storing the factors and
        # eigendecompositions
        self.m_a, self.m_g = {}, {}
        self.m_A, self.m_G = {}, {}
        self.m_QA, self.m_QG = {}, {}
        self.m_dA_ranks = {}
        self.m_dG_ranks = {}
        self.module_ranks = None

        self.sparse = sparse
        self.sparse_ratio = sparse_ratio
        self.residualsA, self.residualsG = {}, {}

        self.factor_decay = factor_decay
        self.kl_clip = kl_clip
        self.fac_update_freq = fac_update_freq
        self.kfac_update_freq = kfac_update_freq
        self.diag_blocks = diag_blocks
        self.diag_warmup = diag_warmup
        self.batch_averaged = batch_averaged

        # Compute ideal value for `distribute_layer_factors` based on
        # registered module count
        if distribute_layer_factors is None:
            self.distribute_layer_factors = True \
                    if hvd.size() > len(self.modules) else False
        else:
            self.distribute_layer_factors = distribute_layer_factors

        self.have_cleared_Q = True if self.diag_warmup == 0 else False
        self.eps = 1e-10  # for numerical stability
        self.rank_iter = cycle(list(range(hvd.size())))
Example #30
    parser.add_argument('--max_grad_norm', default=1.0, type=float, help='')

    args = parser.parse_args()
    if args.device == 'cuda':
        args.device = 'cuda' if cuda.is_available() else 'cpu'

    torch.manual_seed(args.seed)
    hvd.init()
    if cuda.is_available():
        torch.cuda.set_device(hvd.local_rank())

    tokenizer = BertTokenizer.from_pretrained(args.tokenizer_model)
    tokenizer.add_tokens("[GENERATE_ARTICLE]")
    args.tokenizer = tokenizer
    args.rank_size = hvd.size()
    print(args)

    train_df = pd.read_csv(args.train_df_path)
    train_loader, train_sampler = get_dataLoader(train_df)

    model = ConditionalGenerationModel(**args).to(args.device)
    #     model = torch.nn.DataParallel(model).to(args.device)  # the DataParallel version hangs, not sure why
    compression = hvd.Compression.fp16 if args.compression_fp16 else hvd.Compression.none
    optim = torch.optim.SGD(model.parameters(),
                            lr=args.base_lr * hvd.local_size() *
                            args.batches_per_allreduce,
                            momentum=args.momentum,
                            weight_decay=args.wd)

    optim = hvd.DistributedOptimizer(
        optim,
        named_parameters=model.named_parameters(),
        compression=compression)
Example #31
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = \
        datasets.ImageFolder(args.train_dir,
                             transform=transforms.Compose([
                                 transforms.RandomResizedCrop(224),
                                 transforms.RandomHorizontalFlip(),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                      std=[0.229, 0.224, 0.225])
                             ]))
    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=allreduce_batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    val_dataset = \
        datasets.ImageFolder(args.val_dir,
                             transform=transforms.Compose([
                                 transforms.Resize(256),
                                 transforms.CenterCrop(224),
                                 transforms.ToTensor(),
                                 transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                                      std=[0.229, 0.224, 0.225])
                             ]))
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())
Example #32
def train_fn(data_dir=None,
             seed=42,
             use_cuda=False,
             batch_size=64,
             use_adasum=False,
             lr=0.01,
             momentum=0.5,
             num_epochs=10,
             log_interval=10):
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(seed)

    if use_cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}
    data_dir = data_dir or "./data"
    with FileLock(os.path.expanduser("~/.horovod_lock")):
        train_dataset = \
            datasets.MNIST(data_dir, train=True, download=True,
                           transform=transforms.Compose([
                               transforms.ToTensor(),
                               transforms.Normalize((0.1307,), (0.3081,))
                           ]))
    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    model = Net()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not use_adasum else 1

    if use_cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # Horovod: scale learning rate by lr_scaler.
    optimizer = optim.SGD(model.parameters(),
                          lr=lr * lr_scaler,
                          momentum=momentum)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        op=hvd.Adasum if use_adasum else hvd.Average)

    for epoch in range(1, num_epochs + 1):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epoch)
        for batch_idx, (data, target) in enumerate(train_loader):
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % log_interval == 0:
                # Horovod: use train_sampler to determine the number of
                # examples in this worker's partition.
                print("Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}".format(
                    epoch, batch_idx * len(data), len(train_sampler),
                    100. * batch_idx / len(train_loader), loss.item()))
Example #33
parser.add_argument(
    '--exit-mode',
    default='exception',
    help='means used to cause a worker to exit [exception | kill]')

args = parser.parse_args()

hvd.init()

batch_size = 32
data = torch.randn(batch_size, 2)
target = torch.LongTensor(batch_size).random_() % 2

lr = 0.001
model = torch.nn.Sequential(torch.nn.Linear(2, 2))
optimizer = torch.optim.SGD(model.parameters(), lr=lr * hvd.size())
optimizer = hvd.DistributedOptimizer(optimizer,
                                     named_parameters=model.named_parameters())

hostname = os.environ.get('HOROVOD_HOSTNAME')
start_rank = int(os.environ.get('HOROVOD_RANK', 0))

discovery_schedule = json.loads(args.discovery_schedule)
epoch_to_hosts = {
    epoch: hosts
    for epoch, hosts in discovery_schedule if epoch is not None
}
default_hosts = discovery_schedule[-1][1] if discovery_schedule else []

exit_schedule = json.loads(args.exit_schedule) if args.exit_schedule else {}
Example #34
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

print('cwd:', os.getcwd())
train_dataset = torchvision.datasets.CIFAR10(
    root='~/distributed-training/data',
    train=True,
    download=False,
    transform=transform)

# Horovod: use DistributedSampler to partition data among workers. Manually specify
# `num_replicas=hvd.size()` and `rank=hvd.rank()`.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=allreduce_batch_size,
                                           sampler=train_sampler,
                                           **kwargs)

# Set up a standard ResNet-101 model.
model = models.resnet101()

if args.cuda:
    # Move model to GPU.
    model.cuda()

# Horovod: scale learning rate by the number of GPUs.
# Gradient Accumulation: scale learning rate by batches_per_allreduce
Example #35
    def setup(self, model):
        # call setup after the ddp process has connected
        self.trainer.call_setup_hook(model)

        if torch.cuda.is_available() and self.trainer.on_gpu:
            # Horovod: pin GPU to local rank
            assert self.trainer.root_gpu == hvd.local_rank()
            torch.cuda.set_device(self.trainer.root_gpu)
            model.cuda(self.trainer.root_gpu)

        # avoid duplicating progress bar
        if hvd.rank() != 0 and self.trainer.progress_bar_callback is not None:
            self.trainer.progress_bar_callback.disable()

        # CHOOSE OPTIMIZER
        # allow for lr schedulers as well
        optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(
            model)
        self.trainer.optimizers = optimizers
        self.trainer.lr_schedulers = lr_schedulers
        self.trainer.optimizer_frequencies = optimizer_frequencies

        # Horovod: scale the learning rate by the number of workers to account for
        # increased total batch size
        for optimizer in self.trainer.optimizers:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= hvd.size()

        # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
        for scheduler in self.trainer.lr_schedulers:
            scheduler = scheduler['scheduler']
            if isinstance(scheduler, _LRScheduler):
                scheduler.base_lrs = [
                    lr * hvd.size() for lr in scheduler.base_lrs
                ]

        if self.trainer.amp_backend:
            model, optimizers = model.configure_apex(amp, model,
                                                     self.trainer.optimizers,
                                                     self.trainer.amp_level)
            self.trainer.optimizers = optimizers
            self.trainer.reinit_scheduler_properties(
                self.trainer.optimizers, self.trainer.lr_schedulers)

        # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        for optimizer in self.trainer.optimizers:
            hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        def filter_named_parameters(model, optimizer):
            opt_params = set([
                p for group in optimizer.param_groups
                for p in group.get('params', [])
            ])
            return [(name, p) for name, p in model.named_parameters()
                    if p in opt_params]

        # Horovod: wrap optimizers to perform gradient aggregation via allreduce
        self.trainer.optimizers = [
            hvd.DistributedOptimizer(optimizer,
                                     named_parameters=filter_named_parameters(
                                         model, optimizer))
            for optimizer in self.trainer.optimizers
        ]

        # Update logger rank info from Horovod to avoid race conditions from different ranks
        # creating directories / writing files in the same locations.
        self.trainer.global_rank = hvd.rank()
        rank_zero_only.rank = self.trainer.global_rank

        self.trainer.model = model
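filter_named_parameters above matters when a LightningModule returns several optimizers: each wrapped optimizer should only register allreduce hooks for the parameters it actually updates. A minimal standalone sketch of the same idea (the two-layer model and optimizer split are hypothetical):

import torch
import horovod.torch as hvd

hvd.init()

# Hypothetical model whose two layers are driven by separate optimizers.
model = torch.nn.Sequential(torch.nn.Linear(10, 10), torch.nn.Linear(10, 2))
opt_a = torch.optim.SGD(model[0].parameters(), lr=0.1)
opt_b = torch.optim.Adam(model[1].parameters(), lr=1e-3)

def filter_named_parameters(model, optimizer):
    # Keep only the (name, parameter) pairs this optimizer owns, so each
    # DistributedOptimizer allreduces its own subset of gradients.
    opt_params = {p for group in optimizer.param_groups for p in group.get('params', [])}
    return [(name, p) for name, p in model.named_parameters() if p in opt_params]

optimizers = [
    hvd.DistributedOptimizer(opt, named_parameters=filter_named_parameters(model, opt))
    for opt in (opt_a, opt_b)
]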
Example #36
0
    tokenizer = BertTokenizer.from_pretrained(args.pretrained_weights)
    model = SurveyClassifier.from_pretrained(args.pretrained_weights)
    model.to(device)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    print("Shuffling")
    shuffle(training_data)

    ##############################################################################

    print("Training specialty model")
    loss_fn = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(
        filter(lambda x: x.requires_grad, model.parameters()),
        lr=args.learning_rate * hvd.size(),
    )
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
    )
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    best_validation_loss = None
    best_model = None
    for epoch in range(args.epochs):
        if epoch == args.unfreeze_bert_epoch:
            model.unfreeze_layers_starting_with(11)
        if epoch == args.unfreeze_bert_epoch * 2:
            model.unfreeze_layers_starting_with(10)
        for phase in ["train", "validate"]:
Example #37
0
def evaluate(args):
    # initialize Horovod library
    hvd.init()
    # Horovod: limit the number of CPU threads used per worker
    torch.set_num_threads(1)

    if hvd.local_rank() == 0 and not os.path.exists(args.dir):
        # create 16 random image, mask pairs for evaluation
        print(
            f"generating synthetic data to {args.dir} (this may take a while)")
        os.makedirs(args.dir)
        # set random seed to generate same random data for every node
        np.random.seed(seed=0)
        for i in range(16):
            im, seg = create_test_image_3d(128,
                                           128,
                                           128,
                                           num_seg_classes=1,
                                           channel_dim=-1)
            n = nib.Nifti1Image(im, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz"))
            n = nib.Nifti1Image(seg, np.eye(4))
            nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz"))

    images = sorted(glob(os.path.join(args.dir, "img*.nii.gz")))
    segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz")))
    val_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)]

    # define transforms for image and segmentation
    val_transforms = Compose([
        LoadNiftid(keys=["img", "seg"]),
        AsChannelFirstd(keys=["img", "seg"], channel_dim=-1),
        ScaleIntensityd(keys="img"),
        ToTensord(keys=["img", "seg"]),
    ])

    # create an evaluation data loader
    val_ds = Dataset(data=val_files, transform=val_transforms)
    # create an evaluation data sampler
    val_sampler = DistributedSampler(val_ds,
                                     shuffle=False,
                                     num_replicas=hvd.size(),
                                     rank=hvd.rank())
    # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent
    # issues with Infiniband implementations that are not fork-safe
    multiprocessing_context = None
    if hasattr(
            mp, "_supports_context"
    ) and mp._supports_context and "forkserver" in mp.get_all_start_methods():
        multiprocessing_context = "forkserver"
    # sliding window inference needs to take 1 image per iteration
    val_loader = DataLoader(
        val_ds,
        batch_size=1,
        shuffle=False,
        num_workers=2,
        pin_memory=True,
        sampler=val_sampler,
        multiprocessing_context=multiprocessing_context,
    )
    dice_metric = DiceMetric(include_background=True, reduction="mean")
    post_trans = Compose(
        [Activations(sigmoid=True),
         AsDiscrete(threshold_values=True)])
    # create UNet, DiceLoss and Adam optimizer
    device = torch.device(f"cuda:{hvd.local_rank()}")
    torch.cuda.set_device(device)
    model = monai.networks.nets.UNet(
        dimensions=3,
        in_channels=1,
        out_channels=1,
        channels=(16, 32, 64, 128, 256),
        strides=(2, 2, 2, 2),
        num_res_units=2,
    ).to(device)
    if hvd.rank() == 0:
        # load model parameters for evaluation
        model.load_state_dict(torch.load("final_model.pth"))
    # Horovod broadcasts parameters
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    model.eval()
    with torch.no_grad():
        # define a PyTorch Tensor to record the metric results on each GPU
        # the first value is the `sum` of all dice metrics, the second is the `count` of not-NaN items
        metric = torch.zeros(2, dtype=torch.float, device=device)
        for val_data in val_loader:
            val_images, val_labels = val_data["img"].to(
                device), val_data["seg"].to(device)
            # define sliding window size and batch size for windows inference
            roi_size = (96, 96, 96)
            sw_batch_size = 4
            val_outputs = sliding_window_inference(val_images, roi_size,
                                                   sw_batch_size, model)
            val_outputs = post_trans(val_outputs)
            value, not_nans = dice_metric(y_pred=val_outputs, y=val_labels)
            value = value.squeeze()
            metric[0] += value * not_nans
            metric[1] += not_nans
        # synchronize all processes and reduce the results
        print(
            f"metric in rank {hvd.rank()}: sum={metric[0].item()}, count={metric[1].item()}"
        )
        avg_metric = hvd.allreduce(metric, name="mean_dice")
        if hvd.rank() == 0:
            print(
                f"average metric: sum={avg_metric[0].item()}, count={avg_metric[1].item()}"
            )
            print("evaluation metric:", (avg_metric[0] / avg_metric[1]).item())
    def step(self, closure=None, epoch=None):
        """Perform one K-FAC step

        Note:
        - this function should always be called before `optimizer.step()`
        - gradients must be averaged across ranks before calling `step()`

        Args:
          closure: for compatibility with the base optimizer class.
              `closure` is ignored by KFAC
          epoch (int, optional): epoch to use for determining when to end
              the `diag_warmup` period. `epoch` is not necessary if not using
              `diag_warmup`
        """

        # Update params, used for compatibility with `KFACParamScheduler`
        group = self.param_groups[0]
        self.lr = group['lr']
        self.damping = group['damping']
        self.fac_update_freq = group['fac_update_freq']
        self.kfac_update_freq = group['kfac_update_freq']
        #print('fac_update_freq: ', self.fac_update_freq)
        #print('kfac_update_freq: ', self.kfac_update_freq)

        updates = {}
        handles = []

        if epoch is None:
            if self.diag_warmup > 0:
                print("WARNING: diag_warmup > 0 but epoch was not passed to "
                      "KFAC.step(). Defaulting to no diag_warmup")
            diag_blocks = self.diag_blocks
        else:
            diag_blocks = self.diag_blocks if epoch >= self.diag_warmup else 1

        if hvd.size() > 1 and self.steps % self.fac_update_freq == 0:
            self.fw_merged_comm.synchronize()
            self.bw_merged_comm.synchronize()

            # Compute A and G after aggregation of a and g
            for module in self.modules:
                a = self.m_a[module]
                g = self.m_g[module]
                if hvd.rank() == 0:
                    logger.info('a Name: %s, shape %s', module, a.shape)
                    logger.info('g Name: %s, shape %s', module, g.shape)
                A = torch.einsum('ki,kj->ij', a, a / a.size(0))
                G = torch.einsum('ki,kj->ij', g, g / g.size(0))
                update_running_avg(A, self.m_A[module], self.factor_decay)
                update_running_avg(G, self.m_G[module], self.factor_decay)

        # if we are switching from no diag approx to approx, we need to clear
        # off-block-diagonal elements
        if not self.have_cleared_Q and \
                epoch == self.diag_warmup and \
                self.steps % self.kfac_update_freq == 0:
            self._clear_eigen()
            self.have_cleared_Q = True

        if self.steps % self.kfac_update_freq == 0:
            # reset rank_iter so each device gets the same layers to compute,
            # taking advantage of caching
            self.rank_iter.reset()
            handles = []

            #eigen_ranks = self._generate_eigen_ranks(epoch)
            eigen_ranks = self._generate_eigen_ranks_uniform(epoch)
            #eigen_ranks = self._generate_eigen_ranks_naive(epoch)
            #inverse_As = []
            #A_ranks = []
            #inverse_Gs = []
            #G_ranks = []
            rank_to_tensors = {}

            for module in self.modules:
                ranks_a, ranks_g = eigen_ranks[module]
                self.m_dA_ranks[module] = ranks_a[0]
                self.m_dG_ranks[module] = ranks_g[0]
                rank_a = ranks_a[0]
                rank_g = ranks_g[0]

                name = self.module_name_map[module]
                self._update_inverse_A(module, ranks_a)
                #if hvd.size() > 1 and rank_a >= 0:
                #    self.inverseA_merged_comm.bcast_async_(name, self.m_QA[module], rank_a)

                self._update_inverse_G(module, ranks_g)
                #if hvd.size() > 1 and rank_g >= 0:
                #    self.inverseG_merged_comm.bcast_async_(name, self.m_QG[module], rank_g)
                #if rank_a not in rank_to_tensors:
                #    rank_to_tensors[rank_a] = []
                #rank_to_tensors[rank_a].append((name, self.m_QA[module], self.m_QG[module]))
                if hvd.size() > 1 and rank_g >= 0:
                    self.multi_comm.bcast_async_(
                        [name], [self.m_QA[module], self.m_QG[module]], rank_g)
            #if hvd.size() > 1:
            #    for rank in rank_to_tensors.keys():
            #        names = []
            #        tensors = []
            #        for name, ta, tb in rank_to_tensors[rank]:
            #            names.append(name)
            #            tensors.append(ta)
            #            tensors.append(tb)
            #        self.multi_comm.bcast_async_(names, tensors, rank)

        if hvd.size() > 1 and self.steps % self.kfac_update_freq == 0:
            #self.inverseA_merged_comm.synchronize()
            #self.inverseG_merged_comm.synchronize()
            self.multi_comm.synchronize()

        for i, module in enumerate(self.modules):

            grad = self._get_grad(module)
            precon_grad = self._get_preconditioned_grad(module, grad)
            updates[module] = precon_grad

        self._update_scale_grad(updates)

        self.steps += 1
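update_running_avg is called in the factor computation above but not shown; a minimal sketch under the assumption that it is an in-place exponential moving average controlled by factor_decay:

def update_running_avg(new_value, current_value, factor):
    # Assumed behaviour: current <- (1 - factor) * current + factor * new, updated in place.
    current_value.mul_(1.0 - factor).add_(new_value, alpha=factor)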
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()
            if self._use_allgather and p_size > 1024:
                # fjr compress grad
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                p.grad.data.div_(hvd.size())
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    self._V[name] = self._V[name] + self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    self._V[name] = self._V[name] + self._U[name]
                compressed_val = []
                compressed_idx = []
                torch.cuda.synchronize()
                begin_select_time = time.time()
                if self._flag[name] == 1:
                    self._masks[name], compressed_val, compressed_idx = \
                        select_topk_truncated_mean(self._V[name], 0.001, self._masks[name])
                    self._flag[name] = 0
                else:
                    self._masks[name], compressed_val, compressed_idx = \
                        select_lowk_truncated_mean(self._V[name], 0.001, self._masks[name])
                    self._flag[name] = 1

                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time

                p.grad.data = torch.mean(compressed_val) * (1.0 -
                                                            self._masks[name])
                handle = allreduce_async_(p.grad.data, average=False)
                self._handles[p] = handle

                self._V[name].mul_(self._masks[name])
                self._U[name].mul_(self._masks[name])

                torch.cuda.synchronize()
                begin_comm_time = time.time()

                #if hvd.size() > 1:
                #    self._compressed_msg_size[name] = len(compressed_idx)
                #    if self._use_gpu:
                #        compressed_msg = torch.cat([compressed_idx.type('torch.cuda.FloatTensor'), compressed_val])
                #    else:
                #        compressed_msg = torch.cat([compressed_idx.type('torch.FloatTensor'), compressed_val])

                #    handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                #    self._handles[p] = handle

                torch.cuda.synchronize()
                end_comm_time = time.time()
                self.pack_time += end_comm_time - begin_comm_time

            else:
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                if self._use_nesterov:
                    self._U[name] = torch.mul(
                        torch.add(self._U[name], p.grad.data), self._momentum)
                    #self._V[name] = self._U[name] + p.grad.data
                    p.grad.data = self._U[name] + p.grad.data
                else:
                    self._U[
                        name] = self._momentum * self._U[name] + p.grad.data
                    #self._V[name] = self._U[name]
                    p.grad.data = self._U[name]
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                if hvd.size() > 1:
                    handle = allreduce_async_(p.grad.data,
                                              average=True,
                                              name=name)
                    self._handles[p] = handle

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time
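select_topk_truncated_mean and select_lowk_truncated_mean above are not shown. A rough sketch of a plain magnitude top-k selector with the same (tensor, ratio, mask) -> (mask, values, indices) shape; the truncated-mean and low-k variants, and the exact mask semantics, are assumptions inferred from how the results are used above:

import torch

def select_topk_mask_sketch(tensor, ratio, prev_mask):
    # prev_mask is accepted for signature compatibility but ignored in this sketch.
    # Pick the k = ratio * numel largest-magnitude entries and return a mask that is
    # 0 at selected positions (communicated) and 1 elsewhere (kept as local residual),
    # together with the selected values and their flat indices.
    flat = tensor.view(-1)
    k = max(1, int(flat.numel() * ratio))
    _, indices = torch.topk(flat.abs(), k)
    mask = torch.ones_like(flat)
    mask[indices] = 0.0
    return mask.view_as(tensor), flat[indices], indices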
Example #40
0
        # set model
        model = torchnet.Net()
        # load model to predict
        if prediction:
            model.load_state_dict(torch.load(this_path + "/torchmodel.pth"))
    elif args.net_name is None:
        # set model
        model = netdataloader.Net()
        # load model to predict
        if prediction:
            model.load_state_dict(torch.load(this_path + "/torchmodel.pth"))

    ##### HOROVOD #####
    if prediction:
        test_sampler = torch.utils.data.distributed.DistributedSampler(
            test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    else:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
        if validation:
            valid_sampler = torch.utils.data.distributed.DistributedSampler(
                valid_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    if prediction:
Example #41
0
def train():
    os.environ["CUDA_VISIBLE_DEVICES"] = '6, 7'
    args = parse_args()
    # 1. Initialize Horovod
    hvd.init()
    # 2. Pin the corresponding GPU to this process; local_rank() returns the index of the current process on its node
    torch.cuda.set_device(hvd.local_rank())

    # torch.cuda.set_device(args.local_rank)
    # dist.init_process_group(
    #             backend = 'nccl',
    #             init_method = 'tcp://127.0.0.1:33271',
    #             world_size = 2,
    #             world_size = torch.cuda.device_count(),
    # rank=args.local_rank
    # )
    setup_logger(respth)

    ## dataset
    n_classes = 19
    n_img_per_gpu = 8
    n_workers = 4
    cropsize = [1024, 1024]
    ds = CityScapes('/dataset/cityscapes/leftImg8bit_trainvaltest',
                    cropsize=cropsize,
                    mode='train')
    sampler = torch.utils.data.distributed.DistributedSampler(
        ds, num_replicas=hvd.size(), rank=hvd.rank())
    dl = DataLoader(ds,
                    batch_size=n_img_per_gpu,
                    shuffle=False,
                    sampler=sampler,
                    num_workers=n_workers,
                    pin_memory=True,
                    drop_last=True)

    ## model
    ignore_idx = 255
    net = BiSeNet(n_classes=n_classes)
    net.cuda()
    # 5. Broadcast parameters at initialization so that all GPUs start from the same weights
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    net.train()
    # net = nn.parallel.DistributedDataParallel(net,
    #         device_ids = [args.local_rank, ],
    #         output_device = args.local_rank
    #         )
    score_thres = 0.7
    n_min = n_img_per_gpu * cropsize[0] * cropsize[1] // 16
    criteria_p = OhemCELoss(thresh=score_thres,
                            n_min=n_min,
                            ignore_lb=ignore_idx)
    criteria_16 = OhemCELoss(thresh=score_thres,
                             n_min=n_min,
                             ignore_lb=ignore_idx)
    criteria_32 = OhemCELoss(thresh=score_thres,
                             n_min=n_min,
                             ignore_lb=ignore_idx)

    ## optimizer
    momentum = 0.9
    weight_decay = 5e-4
    lr_start = 1e-2
    max_iter = 80000
    power = 0.9
    warmup_steps = 1000
    warmup_start_lr = 1e-5
    optim = Optimizer(model=net,
                      lr0=lr_start,
                      momentum=momentum,
                      wd=weight_decay,
                      warmup_steps=warmup_steps,
                      warmup_start_lr=warmup_start_lr,
                      max_iter=max_iter,
                      power=power)
    hvd.broadcast_optimizer_state(optim.optim, root_rank=0)
    optim = hvd.DistributedOptimizer(optim.optim,
                                     named_parameters=net.named_parameters())

    ## train loop
    msg_iter = 50
    loss_avg = []
    st = glob_st = time.time()
    diter = iter(dl)
    epoch = 0
    for it in range(max_iter):
        try:
            im, lb = next(diter)
            if not im.size()[0] == n_img_per_gpu: raise StopIteration
        except StopIteration:
            epoch += 1
            sampler.set_epoch(epoch)
            diter = iter(dl)
            im, lb = next(diter)
        im = im.cuda()
        lb = lb.cuda()
        H, W = im.size()[2:]
        lb = torch.squeeze(lb, 1)

        optim.zero_grad()
        out, out16, out32 = net(im)
        lossp = criteria_p(out, lb)
        loss2 = criteria_16(out16, lb)
        loss3 = criteria_32(out32, lb)
        loss = lossp + loss2 + loss3
        loss.backward()
        optim.step()

        loss_avg.append(loss.item())
        ## print training log message
        if (it + 1) % msg_iter == 0:
            loss_avg = sum(loss_avg) / len(loss_avg)
            lr = optim.lr
            ed = time.time()
            t_intv, glob_t_intv = ed - st, ed - glob_st
            eta = int((max_iter - it) * (glob_t_intv / it))
            eta = str(datetime.timedelta(seconds=eta))
            msg = ', '.join([
                'it: {it}/{max_it}',
                'lr: {lr:.4f}',
                'loss: {loss:.4f}',
                'eta: {eta}',
                'time: {time:.4f}',
            ]).format(it=it + 1,
                      max_it=max_iter,
                      lr=lr,
                      loss=loss_avg,
                      time=t_intv,
                      eta=eta)
            logger.info(msg)
            loss_avg = []
            st = ed

    ## dump the final model
    save_pth = osp.join(respth, 'model_final.pth')
    net.cpu()
    state = net.module.state_dict() if hasattr(net,
                                               'module') else net.state_dict()
    if hvd.rank() == 0: torch.save(state, save_pth)
    logger.info('training done, model saved to: {}'.format(save_pth))
Example #42
0
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size',
                        type=int,
                        default=64,
                        metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size',
                        type=int,
                        default=1000,
                        metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs',
                        type=int,
                        default=10,
                        metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.01,
                        metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum',
                        type=float,
                        default=0.5,
                        metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        metavar='S',
                        help='random seed (default: 42)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=10,
        metavar='N',
        help='how many batches to wait before logging training status')
    parser.add_argument('--fp16-allreduce',
                        action='store_true',
                        default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('--results_path',
                        type=str,
                        help="Path to store results")
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                    transform=transforms.Compose([
                        transforms.ToTensor(),
                        transforms.Normalize((0.1307,), (0.3081,))
                    ]))

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               sampler=train_sampler,
                                               **kwargs)

    test_dataset = \
        datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,))
        ]))

    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler,
                                              **kwargs)

    model = Net()

    if args.cuda:
        # Move model to GPU.
        model.cuda()

    # Horovod: broadcast parameters.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    global optimizer
    # Horovod: scale learning rate by the number of GPUs.
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr * hvd.size(),
                          momentum=args.momentum)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression)

    #for epoch in range(1, args.epochs + 1):
    #    train(epoch, model, optimizer, train_sampler, train_loader, args)
    #    test(model, test_sampler, test_loader, args)

    checkpoint_file = os.path.join(args.results_path, f'skopt_torch_results')
    checkpoint_saver = CheckpointSaver(checkpoint_file, compress=9)

    space = Space([(2, 8)])

    try:
        res = load(checkpoint_file)
        x0 = res.x_iters
        y0 = res.func_vals
    except FileNotFoundError:
        print(f'No previous save point.')
        # Need to randomly sample the bounds to prime the optimization.
        x0 = space.rvs(1)
        y0 = None

    gp_minimize(
        lambda x: objective(x, model, train_sampler, train_loader, args
                            ),  # the function to minimize
        space,  # the bounds on each dimension of x
        x0=x0,  # already examined values for x
        y0=y0,  # observed values for x0
        acq_func="LCB",  # the acquisition function (optional)
        n_calls=20,  # the number of evaluations of f including at x0
        n_random_starts=0,  # the number of random initialization points
        callback=[checkpoint_saver],
        random_state=777)
Example #43
0
def main(args):
    hvd.init()

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        device = torch.device('cuda')
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())

    device = 'GPU' if args.cuda else 'CPU'
    if hvd.rank() == 0:
        log('Using PyTorch version: %s, Device: %s' %
            (torch.__version__, device))
        log('Horovod version: %s, CUDA: %s, ROCM: %s, NCCL: %s, MPI: %s' %
            (horovod.__version__, hvd.cuda_built(), hvd.rocm_built(),
             hvd.nccl_built(), hvd.mpi_built()))
        log(torch.__config__.show())

    cudnn.benchmark = True

    # Set up standard model.
    log('Initializing %s model...' % args.model)
    model = getattr(models, args.model)()

    # By default, Adasum doesn't need scaling up learning rate.
    lr_scaler = hvd.size() if not args.use_adasum else 1

    if args.cuda:
        # Move model to GPU.
        model.cuda()
        # If using GPU Adasum allreduce, scale learning rate by local_size.
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    optimizer = optim.SGD(model.parameters(), lr=0.01 * lr_scaler)

    # Horovod: (optional) compression algorithm.
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Adasum if args.use_adasum else hvd.Average)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    if args.fixed_data:
        data, target = generate_data(args)

    def benchmark_step():
        nonlocal data, target

        if not args.fixed_data:
            data, target = generate_data(args)

        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

    log('Model: %s' % args.model)
    log('Batch size: %d' % args.batch_size)
    log('Number of %ss: %d' % (device, hvd.size()))

    # Warm-up
    log('Running warmup...')
    timeit.timeit(benchmark_step, number=args.num_warmup_batches)

    # Benchmark
    log('Running benchmark...')
    img_secs = []
    for x in range(args.num_iters):
        time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
        img_sec = args.batch_size * args.num_batches_per_iter / time
        log('Iter #%d: %.1f img/sec per %s' % (x, img_sec, device))
        img_secs.append(img_sec)

    # Results
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    log('Img/sec per %s: %.1f +-%.1f' % (device, img_sec_mean, img_sec_conf))
    log('Total img/sec on %d %s(s): %.1f +-%.1f' %
        (hvd.size(), device, hvd.size() * img_sec_mean,
         hvd.size() * img_sec_conf))
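generate_data above is not shown; a minimal sketch that produces a synthetic ImageNet-shaped batch, in the spirit of the standard Horovod synthetic benchmark (the image size and the 1000-class label range are assumptions):

import torch

def generate_data(args):
    # Synthetic batch: random images plus random labels for a 1000-class classifier.
    data = torch.randn(args.batch_size, 3, 224, 224)
    target = torch.randint(0, 1000, (args.batch_size,))
    if args.cuda:
        data, target = data.cuda(), target.cuda()
    return data, target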
Example #44
0
def on_state_reset():
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr * hvd.size()
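on_state_reset above is the learning-rate reset callback used with elastic Horovod; a minimal sketch of how such a callback is typically registered, with a hypothetical model and optimizer standing in for the real ones:

import torch
import horovod.torch as hvd

hvd.init()

# Hypothetical model/optimizer; `lr` matches the base rate used by on_state_reset.
lr = 0.01
model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=lr * hvd.size())

def on_state_reset():
    # Re-apply the LR scaling whenever the worker set changes.
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr * hvd.size()

state = hvd.elastic.TorchState(model, optimizer, epoch=0, batch=0)
state.register_reset_callbacks([on_state_reset])

@hvd.elastic.run
def train(state):
    for state.epoch in range(state.epoch, 10):
        # ... one epoch of training ...
        state.commit()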
def run_test_from_config(trainer_options, on_gpu, check_size):
    """Trains the default model with the given config."""
    set_random_main_port()
    reset_seed()

    ckpt_path = trainer_options["default_root_dir"]
    trainer_options.update(callbacks=[ModelCheckpoint(dirpath=ckpt_path)])

    class TestModel(BoringModel):
        def on_train_start(self) -> None:
            expected_device = torch.device(
                "cuda",
                self.trainer.local_rank) if on_gpu else torch.device("cpu")
            assert self.device == expected_device

        def training_epoch_end(self, outputs) -> None:
            res = self.trainer.strategy.reduce(torch.tensor(
                1.0, device=self.device),
                                               reduce_op="sum")
            assert res.sum() == self.trainer.strategy.world_size

    model = TestModel()
    trainer = Trainer(**trainer_options)

    trainer.fit(model)
    assert trainer.state.finished, f"Training failed with {trainer.state}"
    trainer.test(model)

    assert model.device == torch.device("cpu")

    # Horovod should be initialized following training. If not, this will raise an exception.
    if check_size:
        assert hvd.size() == 2

    if trainer.global_rank > 0:
        return

    # test model loading
    pretrained_model = BoringModel.load_from_checkpoint(
        trainer.checkpoint_callback.best_model_path)

    # test new model accuracy
    test_loaders = model.test_dataloader()
    if not isinstance(test_loaders, list):
        test_loaders = [test_loaders]

    for dataloader in test_loaders:
        batch = next(iter(dataloader))
        pretrained_model(batch)

    # test HPC saving
    # save logger to make sure we get all the metrics
    if trainer.logger:
        trainer.logger.finalize("finished")
    hpc_save_path = trainer._checkpoint_connector.hpc_save_path(ckpt_path)
    trainer.save_checkpoint(hpc_save_path)
    # test HPC loading
    checkpoint_path = trainer._checkpoint_connector._CheckpointConnector__get_max_ckpt_path_from_folder(
        ckpt_path)
    trainer._checkpoint_connector.restore(checkpoint_path)

    if on_gpu:
        trainer = Trainer(accelerator="gpu",
                          devices=1,
                          strategy="horovod",
                          max_epochs=1)
        # test root gpu index
        assert trainer.strategy.root_device.index == hvd.local_rank()
Example #46
0
    def test_horovod_size(self):
        """Test that the size returned by hvd.size() is correct."""
        _, true_size = mpi_env_rank_and_size()
        hvd.init()
        size = hvd.size()
        assert true_size == size
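mpi_env_rank_and_size is not shown here; a minimal sketch that reads the rank and size from common MPI launcher environment variables (the exact variable names covered are an assumption):

import os

def mpi_env_rank_and_size():
    # Check the environment variables set by common MPI launchers (Open MPI, MPICH/PMI).
    for rank_var, size_var in [('OMPI_COMM_WORLD_RANK', 'OMPI_COMM_WORLD_SIZE'),
                               ('PMI_RANK', 'PMI_SIZE')]:
        rank, size = os.environ.get(rank_var), os.environ.get(size_var)
        if rank is not None and size is not None:
            return int(rank), int(size)
    # Not launched via MPI: treat as a single process.
    return 0, 1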
Example #47
0
        def hook(*ignore):
            assert p not in self._handles
            assert not p.grad.requires_grad
            name = self._parameter_names.get(p)
            p_size = np.prod(p.size())
            torch.cuda.synchronize()
            begin_time = time.time()

            if self._use_allgather and p_size > self._plan1:
                torch.cuda.synchronize()
                begin_mom_time = time.time()

                weight_decay = self._weight_decay  #group['weight_decay']
                momentum = self._momentum  #group['momentum']
                dampening = 0.0  #group['dampening']
                d_p = p.grad.data
                d_p.div_(hvd.size())
                if weight_decay != 0:
                    d_p.add_(weight_decay, p.data)
                param_state = self.state[p]
                if momentum != 0:
                    if 'momentum_buffer' not in param_state:
                        buf = param_state[
                            'momentum_buffer'] = torch.zeros_like(p.data)
                        buf.mul_(momentum).add_(d_p)
                    else:
                        buf = param_state['momentum_buffer']
                        buf.mul_(momentum).add_(1 - dampening, d_p)
                    #TODO
                if 'residue_buffer' not in param_state:
                    rsd = param_state['residue_buffer'] = torch.zeros_like(
                        p.data)
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd = rsd.add(momentum, d_p)
                else:
                    rsd = param_state['residue_buffer']
                    rsd.add_(param_state['momentum_buffer'])
                    if self._use_nesterov:
                        rsd = rsd.add(momentum, d_p)

                torch.cuda.synchronize()
                self.mom_time += time.time() - begin_mom_time

                compressed_val = []
                compressed_idx = []

                torch.cuda.synchronize()
                begin_select_time = time.time()

                if 'mid_store' not in param_state:
                    param_state['mid_store'] = 0.0
                if 'interval' not in param_state:
                    param_state['interval'] = 10
                it = 0
                sparsity = 0.0

                if p_size > self._plan3:
                    compressed_val, compressed_idx, it, _, sparsity = \
                        select_top_k_thdv3(param_state['residue_buffer'], 0.001)
                elif p_size > self._plan2:
                    compressed_val, compressed_idx = \
                            select_trim_topk(param_state['residue_buffer'], 0.001)
                else:
                    compressed_val, compressed_idx = \
                            select_topk(param_state['residue_buffer'], 0.001)

                assert (len(compressed_idx) > 0)
                torch.cuda.synchronize()
                end_select_time = time.time()
                self.select_time += end_select_time - begin_select_time
                #if param_state['interval'] == 10:
                #    compressed_val, compressed_idx, it, param_state['mid_store'], sparsity = \
                #            select_top_k_thdv3(param_state['residue_buffer'], 0.001)
                #    param_state['interval'] = 0
                #else:
                #    compressed_val, compressed_idx, sparsity = \
                #            select_top_k_fixthd(param_state['residue_buffer'], param_state['mid_store'])
                #    param_state['interval'] += 1
                #if hvd.rank() == 0:
                #    print(name, p.size())
                #if hvd.rank() == 0 and name == "features.27.weight":
                #if name == "features.27.weight":
                #    torch.save(compressed_val, 'compressed_val' + str(local_rank()))
                #    torch.save(compressed_idx, 'compressed_idx' + str(local_rank()))
                #if hvd.rank() == 0 and name == "features.27.weight":
                #    self._it = it
                #    self._mid = param_state['mid_store']
                #    self._sparsity = sparsity
                #tmp_t = torch.tensor([local_len], dtype=torch.long)
                #                tmp_t = torch.tensor([local_len])
                # print("len list, ", global_len_list)
                #local_len = torch.min(global_len_list)
                ##print("local_len, ", local_len)
                #compressed_val = compressed_val[0:local_len]
                #compressed_idx = compressed_idx[0:local_len]

                torch.cuda.synchronize()
                begin_mask_time = time.time()

                masks_size = self._masks[name].size()
                self._masks[name].zero_()
                self._masks[name] = self._masks[name].view(-1)
                self._masks[name][compressed_idx] = 1.0

                self._masks[name] = 1.0 - self._masks[name]
                self._masks[name] = self._masks[name].view(masks_size)

                if self._debug:
                    self._v_ref[name] = param_state['residue_buffer'] * (
                        1.0 - self._masks[name])
                    allreduce_(self._v_ref[name], average=False)

                if hvd.size() == 1:
                    p.grad.data = param_state['residue_buffer'] * (
                        1.0 - self._masks[name])

                param_state['residue_buffer'].mul_(self._masks[name])
                param_state['momentum_buffer'].mul_(self._masks[name])

                end_mask_time = time.time()
                self.mask_time += end_mask_time - begin_mask_time

                torch.cuda.synchronize()
                begin_pack_time = time.time()

                if hvd.size() > 1:
                    if self._use_gpu:
                        if p_size > self._plan3:
                            compressed_msg = torch.cat([\
                                    torch.tensor([len(compressed_idx)]).type('torch.cuda.FloatTensor'),\
                                    compressed_idx.type('torch.cuda.FloatTensor'), \
                                    compressed_val])
                            handle = _allgather_async(
                                compressed_msg,
                                self._compressed_msg[name],
                                name=name)
                        else:
                            self._compressed_msg_size[name] = len(
                                compressed_idx)
                            compressed_msg = torch.cat([compressed_idx.type('torch.cuda.FloatTensor'), \
                                compressed_val])
                            handle = _allgather_async(
                                compressed_msg,
                                self._compressed_msg[name],
                                name=name)
                    self._handles[p] = handle

                torch.cuda.synchronize()
                self.pack_time += time.time() - begin_pack_time
            else:
                #compressed_msg = torch.randn(100).cuda()
                #handle = _allgather_async(compressed_msg, self._compressed_msg[name], name=name)
                torch.cuda.synchronize()
                begin_allreduce_time = time.time()
                p.grad.data.div_(hvd.size())
                p.grad.data.add_(torch.mul(p.data, self._weight_decay))
                param_state = self.state[p]
                if 'momentum_buffer' not in param_state:
                    buf = param_state['momentum_buffer'] = torch.zeros_like(
                        p.data)
                else:
                    buf = param_state['momentum_buffer']
                if self._use_nesterov:
                    buf = torch.mul(torch.add(buf, p.grad.data),
                                    self._momentum)
                    p.grad.data.add_(buf)
                    #param_state['momentum_buffer'] = torch.mul(torch.add(param_state['momentum_buffer'], p.grad.data), self._momentum)
                    #p.grad.data.add_(param_state['momentum_buffer'])
                else:
                    param_state[
                        'momentum_buffer'] = self._momentum * param_state[
                            'momentum_buffer'] + p.grad.data
                    p.grad.data = param_state['momentum_buffer']
                if hvd.size() > 1:
                    handle = allreduce_async_(p.grad.data,
                                              average=False,
                                              name=name)
                    self._handles[p] = handle
                torch.cuda.synchronize()
                self.allreduce_time += time.time() - begin_allreduce_time

            torch.cuda.synchronize()
            end_time = time.time()
            self.pruning_time += end_time - begin_time