def run(self):
        """Run the worker loop on this process's device.

        Makes ``self.device`` current, calls ``self.setup()``, then blocks on
        ``self.pipe`` for ``(job, data)`` messages:

        * ``'finalize'`` -- synchronize the device and leave the loop.
        * ``'update'`` -- compute the negative ELBO on one batch taken from
          ``self.iterator``, backpropagate, sum-reduce the gathered gradients
          to NCCL root 0, then overwrite the local parameters with the buffer
          broadcast from root 0.

        Any other job name is ignored. The ``data`` payload is not used.
        """
        self.device.use()

        self.setup()

        while True:
            job, data = self.pipe.recv()
            if job == 'finalize':
                self.device.device.synchronize()
                break
            if job == 'update':
                model = self.model

                # For reducing memory: drop gradients kept from the last step
                # before running the forward pass.
                model.cleargrads()

                batch = self.iterator.next()
                x = self.converter(batch, self.device_index)
                # Count examples on the raw batch. ``x`` is indexed by field
                # name below, so ``len(x)`` would presumably count fields
                # rather than examples.
                # NOTE(review): confirm the converter's return type.
                batch_size = len(batch)
                images = x['image']
                viewpoints = x['viewpoint']
                xp = model.xp

                representation, query_images, query_viewpoints = encode_scene(
                    images, viewpoints, model, self.device_index)

                with self.reporter.scope({}):  # pass dummy observation
                    # Compute distribution parameters
                    (z_t_param_array,
                     pixel_mean) = model.sample_z_and_x_params_from_posterior(
                         query_images, query_viewpoints, representation)

                    # Compute ELBO
                    (ELBO, bits_per_pixel, negative_log_likelihood,
                     kl_divergence) = estimate_ELBO(xp, query_images,
                                                    z_t_param_array,
                                                    pixel_mean,
                                                    self.pixel_log_sigma,
                                                    batch_size)

                    # Maximizing the ELBO == minimizing its negation
                    loss = -ELBO

                loss.backward()
                # Release the loss (and the graph it references) early.
                del loss

                # NCCL: sum-reduce this worker's gradients into root 0.
                gg = gather_grads(model)
                nccl_data_type = _get_nccl_data_type(gg.dtype)
                null_stream = cuda.Stream.null
                self.comm.reduce(gg.data.ptr, gg.data.ptr, gg.size,
                                 nccl_data_type, nccl.NCCL_SUM, 0,
                                 null_stream.ptr)
                del gg
                model.cleargrads()

                # NCCL: receive the parameters broadcast from root 0 and copy
                # them back into the local model.
                gp = gather_params(model)
                nccl_data_type = _get_nccl_data_type(gp.dtype)
                self.comm.bcast(gp.data.ptr, gp.size, nccl_data_type, 0,
                                null_stream.ptr)
                scatter_params(model, gp)
                del gp
Exemple #2
0
    def update_core(self):
        """Run one training step on the first device.

        Calls ``setup_workers()``, posts an ``('update', None)`` message via
        ``_send_message`` and then, with ``self._devices[0]`` current:
        computes the loss of ``self._master`` on the next ``'main'`` batch,
        backpropagates, sum-reduces the gathered gradients through NCCL
        (root 0) when a communicator exists, runs the ``'main'`` optimizer
        and finally broadcasts the gathered parameters from root 0.
        """
        self.setup_workers()

        self._send_message(('update', None))

        master_device = self._devices[0]
        with cuda.Device(master_device):
            master = self._master

            # For reducing memory
            master.cleargrads()

            optimizer = self.get_optimizer('main')
            raw_batch = self.get_iterator('main').next()
            batch = self.converter(raw_batch, master_device)

            loss = self._calc_loss(
                master, batch, cleargrads_func=master.cleargrads)

            # Start the backward pass from empty gradients.
            master.cleargrads()
            loss.backward()

            # NCCL: reduce grads
            stream = cuda.Stream.null
            if self.comm is not None:
                flat_grads = gather_grads(master)
                grad_type = _get_nccl_data_type(flat_grads.dtype)
                grads_ptr = flat_grads.data.ptr
                # In-place sum: the same buffer is both source and target.
                self.comm.reduce(grads_ptr, grads_ptr, flat_grads.size,
                                 grad_type, nccl.NCCL_SUM, 0, stream.ptr)
                scatter_grads(master, flat_grads)
                del flat_grads

            optimizer.update()

            # NCCL: broadcast the updated parameters from root 0.
            if self.comm is not None:
                flat_params = gather_params(master)
                param_type = _get_nccl_data_type(flat_params.dtype)
                self.comm.bcast(flat_params.data.ptr, flat_params.size,
                                param_type, 0, stream.ptr)
Exemple #3
0
    def update_core(self):
        """Run one forward/backward pass of the custom parallel updater.

        The loss is divided by ``self.accum_grad`` and backpropagated on
        every call, and the gathered gradients are sum-reduced through NCCL
        (root 0) when a communicator exists.  The optimizer step, the
        gradient reset and the parameter broadcast only happen on the call
        where ``self.forward_count`` reaches ``self.accum_grad``; every other
        call returns right after the reduction.  The optimizer step is
        skipped when the gradient norm is NaN.
        """
        self.setup_workers()

        self._send_message(("update", None))

        master_device = self._devices[0]
        with cuda.Device(master_device):
            optimizer = self.get_optimizer("main")
            minibatch = self.get_iterator("main").next()
            x = self.converter(minibatch, master_device)

            # Scale by 1/accum_grad; gradients are only cleared after an
            # optimizer step further down.
            loss = self._master(*x) / self.accum_grad
            loss.backward()
            loss.unchain_backward()

            # NCCL: reduce grads
            stream = cuda.Stream.null
            if self.comm is not None:
                flat_grads = gather_grads(self._master)
                grads_ptr = flat_grads.data.ptr
                # In-place sum: the same buffer is both source and target.
                self.comm.reduce(grads_ptr, grads_ptr, flat_grads.size,
                                 self.nccl.NCCL_FLOAT, self.nccl.NCCL_SUM,
                                 0, stream.ptr)
                scatter_grads(self._master, flat_grads)
                del flat_grads

            # Only step once ``accum_grad`` passes have been accumulated.
            self.forward_count += 1
            if self.forward_count != self.accum_grad:
                return
            self.forward_count = 0

            # check gradient value
            grads = [p.grad for p in optimizer.target.params(False)]
            grad_norm = np.sqrt(sum_sqnorm(grads))
            logging.info("grad norm={}".format(grad_norm))

            # update, unless the gradients are unusable
            if not math.isnan(grad_norm):
                optimizer.update()
            else:
                logging.warning("grad norm is nan. Do not update model.")
            self._master.cleargrads()

            # NCCL: broadcast the parameters from root 0.
            if self.comm is not None:
                flat_params = gather_params(self._master)
                self.comm.bcast(flat_params.data.ptr, flat_params.size,
                                self.nccl.NCCL_FLOAT, 0, stream.ptr)
Exemple #4
0
    def update_core(self):
        """Run one training step on the first device.

        Posts an ``('update', None)`` message via ``_send_message``, then,
        with ``self._devices[0]`` current: converts the first element of the
        next ``'main'`` batch with ``converter_kaldi``, computes the loss of
        ``self._master``, backpropagates, sum-reduces the gathered gradients
        through NCCL (root 0) when a communicator exists, steps the optimizer
        unless the gradient norm is NaN, broadcasts the gathered parameters
        and finally hands the features to ``delete_feat``.
        """
        self.setup_workers()

        self._send_message(('update', None))
        with cuda.Device(self._devices[0]):
            from cupy.cuda import nccl

            master = self._master

            # For reducing memory
            master.cleargrads()

            optimizer = self.get_optimizer('main')
            minibatch = self.get_iterator('main').next()
            x = converter_kaldi(minibatch[0], self.reader)

            loss = master(x)

            # Start the backward pass from empty gradients.
            master.cleargrads()
            loss.backward()
            loss.unchain_backward()

            # NCCL: reduce grads
            stream = cuda.Stream.null
            if self.comm is not None:
                flat_grads = gather_grads(master)
                grads_ptr = flat_grads.data.ptr
                # In-place sum: the same buffer is both source and target.
                self.comm.reduce(grads_ptr, grads_ptr, flat_grads.size,
                                 nccl.NCCL_FLOAT, nccl.NCCL_SUM,
                                 0, stream.ptr)
                scatter_grads(master, flat_grads)
                del flat_grads

            # check gradient value
            grads = [p.grad for p in optimizer.target.params(False)]
            grad_norm = np.sqrt(self._sum_sqnorm(grads))
            logging.info('grad norm={}'.format(grad_norm))

            # update, unless the gradients are unusable
            if not math.isnan(grad_norm):
                optimizer.update()
            else:
                logging.warning('grad norm is nan. Do not update model.')

            # NCCL: broadcast the parameters from root 0.
            if self.comm is not None:
                flat_params = gather_params(master)
                self.comm.bcast(flat_params.data.ptr, flat_params.size,
                                nccl.NCCL_FLOAT, 0, stream.ptr)

            delete_feat(x)
    def test_gather_scatter_grads(self):
        """Check that gathered gradients can be scattered into a model copy.

        Gradients computed on ``model0`` are gathered and scattered into its
        deep copy ``model1``; both the gradients and, after one SGD step,
        the parameters of the two models must be equal.
        """
        cupy = cuda.cupy
        model0 = SimpleNet(dtype=self.dtype)
        model1 = copy.deepcopy(model0)

        # Moving each model to the GPU is expected to warn about deprecation.
        for model in (model0, model1):
            with testing.assert_warns(DeprecationWarning):
                model.to_gpu()

        optimizer0 = chainer.optimizers.SGD(lr=1.0)
        optimizer0.setup(model0)

        optimizer1 = chainer.optimizers.SGD(lr=1.0)
        optimizer1.setup(model1)

        bsize = 8

        x = numpy.random.uniform(0, 1, (bsize, 2, 5, 5)).astype(self.dtype)
        # Labels alternate 0, 1, 0, 1, ...
        t = (numpy.arange(bsize) % 2).astype(numpy.int32)

        x = chainer.Variable(chainer.backends.cuda.to_gpu(x))
        t = chainer.Variable(chainer.backends.cuda.to_gpu(t))

        loss0 = model0(x, t)

        model0.cleargrads()
        model1.cleargrads()

        # Only model0 is backpropagated; model1 gets its gradients via scatter.
        loss0.backward()
        gg0 = mpu.gather_grads(model0)
        mpu.scatter_grads(model1, gg0)

        param_pairs = [
            (model0.conv.W, model1.conv.W),
            (model0.conv.b, model1.conv.b),
            (model0.fc.W, model1.fc.W),
            (model0.fc.b, model1.fc.b),
        ]

        for param0, param1 in param_pairs:
            cupy.testing.assert_array_equal(param0.grad, param1.grad)

        optimizer0.update()
        optimizer1.update()

        for param0, param1 in param_pairs:
            cupy.testing.assert_array_equal(param0.data, param1.data)
    def test_gather_scatter_grads(self):
        """Check that gathered gradients can be scattered into a model copy.

        Gradients computed on ``model0`` are gathered and scattered into its
        deep copy ``model1``; both the gradients and, after one SGD step,
        the parameters of the two models must be equal.
        """
        cupy = cuda.cupy
        model0 = SimpleNet(dtype=self.dtype)
        model1 = copy.deepcopy(model0)

        for model in (model0, model1):
            model.to_gpu()

        optimizer0 = chainer.optimizers.SGD(lr=1.0)
        optimizer0.setup(model0)

        optimizer1 = chainer.optimizers.SGD(lr=1.0)
        optimizer1.setup(model1)

        bsize = 8

        x = numpy.random.uniform(0, 1, (bsize, 2, 5, 5)).astype(self.dtype)
        # Labels alternate 0, 1, 0, 1, ...
        t = (numpy.arange(bsize) % 2).astype(numpy.int32)

        x = chainer.Variable(chainer.backends.cuda.to_gpu(x))
        t = chainer.Variable(chainer.backends.cuda.to_gpu(t))

        loss0 = model0(x, t)

        model0.cleargrads()
        model1.cleargrads()

        # Only model0 is backpropagated; model1 gets its gradients via scatter.
        loss0.backward()
        gg0 = mpu.gather_grads(model0)
        mpu.scatter_grads(model1, gg0)

        param_pairs = [
            (model0.conv.W, model1.conv.W),
            (model0.conv.b, model1.conv.b),
            (model0.fc.W, model1.fc.W),
            (model0.fc.b, model1.fc.b),
        ]

        for param0, param1 in param_pairs:
            cupy.testing.assert_array_equal(param0.grad, param1.grad)

        optimizer0.update()
        optimizer1.update()

        for param0, param1 in param_pairs:
            cupy.testing.assert_array_equal(param0.data, param1.data)
Exemple #7
0
    def run(self):
        """Run the worker loop on ``self.device``.

        Makes the device current, calls ``self.setup()``, then blocks on
        ``self.pipe`` for ``(job, data)`` messages:

        * ``'finalize'`` -- synchronize the device and leave the loop.
        * ``'update'`` -- compute the model loss on the first element of the
          next batch, backpropagate, sum-reduce the gathered gradients to
          NCCL root 0, then overwrite the local parameters with the buffer
          broadcast from root 0.

        Any other job name is ignored. The ``data`` payload is not used.
        """
        from cupy.cuda import nccl
        dev = cuda.Device(self.device)
        dev.use()
        self.setup()
        flat_params = None
        while True:
            job, data = self.pipe.recv()
            if job == 'finalize':
                dev.synchronize()
                break
            if job != 'update':
                # Nothing to do for other jobs; wait for the next message.
                continue

            model = self.model

            # For reducing memory
            model.cleargrads()

            minibatch = self.iterator.next()
            x = converter_kaldi(minibatch[0], self.reader)
            observation = {}
            with self.reporter.scope(observation):
                loss = model(x)

            # Start the backward pass from empty gradients.
            model.cleargrads()
            loss.backward()
            loss.unchain_backward()

            del loss

            # NCCL: sum-reduce this worker's gradients into root 0.
            flat_grads = gather_grads(model)
            stream = cuda.Stream.null
            grads_ptr = flat_grads.data.ptr
            self.comm.reduce(grads_ptr, grads_ptr, flat_grads.size,
                             nccl.NCCL_FLOAT, nccl.NCCL_SUM, 0,
                             stream.ptr)
            del flat_grads
            model.cleargrads()

            # NCCL: receive the parameters broadcast from root 0 and copy
            # them back into the local model.
            flat_params = gather_params(model)
            self.comm.bcast(flat_params.data.ptr, flat_params.size,
                            nccl.NCCL_FLOAT, 0, stream.ptr)
            scatter_params(model, flat_params)
            # Drop the reference so the buffer can be freed between updates.
            flat_params = None

            delete_feat(x)
 def test_gather_grads_raise_on_cpu(self):
     """gather_grads must raise RuntimeError for a model never sent to GPU."""
     cpu_model = SimpleNet(dtype=self.dtype)
     self.assertRaises(RuntimeError, mpu.gather_grads, cpu_model)
 def test_gather_grads_raise_on_cpu(self):
     """gather_grads must raise RuntimeError for a model never sent to GPU."""
     cpu_model = SimpleNet(dtype=self.dtype)
     self.assertRaises(RuntimeError, mpu.gather_grads, cpu_model)
    def update_core(self):
        """Run one ELBO training step on the first device.

        Posts an ``('update', None)`` message via ``_send_message``, then,
        with ``self._devices[0]`` current: encodes the scene from the next
        ``'main'`` batch, estimates the ELBO, backpropagates its negation,
        sum-reduces the gathered gradients through NCCL (root 0) when a
        communicator exists, steps the ``'main'`` optimizer, reports
        ``loss``, ``bits_per_pixel``, ``NLL`` and ``MSE`` and finally
        broadcasts the gathered parameters from root 0.
        """
        self.setup_workers()
        self._send_message(('update', None))

        master_device = self._devices[0]
        with chainer.using_device(master_device):
            iterator = self.get_iterator('main')
            optimizer = self.get_optimizer('main')
            model = self.model

            batch = iterator.next()
            # TODO: how should the batch be split across devices?
            x = self.converter(batch, master_device)

            images, viewpoints = x['image'], x['viewpoint']

            # First call only: clear the gradients and drop the start flag.
            if self.start:
                model.cleargrads()
                self.start = False
            xp = model.xp
            batch_size = len(batch)

            # For reducing memory
            model.cleargrads()

            # Scene encoder
            representation, query_images, query_viewpoints = encode_scene(
                images, viewpoints, model, master_device)

            # Empirical ELBO: distribution parameters first ...
            posterior = model.sample_z_and_x_params_from_posterior(
                query_images, query_viewpoints, representation)
            z_t_param_array, pixel_mean = posterior

            # ... then the ELBO itself.
            elbo_terms = estimate_ELBO(xp, query_images, z_t_param_array,
                                       pixel_mean, self.pixel_log_sigma,
                                       batch_size)
            (ELBO, bits_per_pixel, negative_log_likelihood,
             kl_divergence) = elbo_terms

            # Update parameters: maximizing the ELBO == minimizing -ELBO.
            loss = -ELBO
            loss.backward()

            # NCCL: reduce grads
            stream = cuda.Stream.null
            if self.comm is not None:
                flat_grads = gather_grads(model)
                grad_type = _get_nccl_data_type(flat_grads.dtype)
                grads_ptr = flat_grads.data.ptr
                # In-place sum: the same buffer is both source and target.
                self.comm.reduce(grads_ptr, grads_ptr, flat_grads.size,
                                 grad_type, nccl.NCCL_SUM, 0, stream.ptr)
                scatter_grads(model, flat_grads)
                del flat_grads

            optimizer.update()

            # The MSE is only reported, so keep it out of the graph.
            with chainer.no_backprop_mode():
                mean_squared_error = cf.mean_squared_error(
                    query_images, pixel_mean)
            observation = {
                'loss': float(loss.data),
                'bits_per_pixel': float(bits_per_pixel.data),
                'NLL': float(negative_log_likelihood.data),
                'MSE': float(mean_squared_error.data),
            }
            reporter.report(observation, model)

            # NCCL: broadcast the updated parameters from root 0.
            if self.comm is not None:
                flat_params = gather_params(model)
                param_type = _get_nccl_data_type(flat_params.dtype)
                self.comm.bcast(flat_params.data.ptr, flat_params.size,
                                param_type, 0, stream.ptr)