Example #1
    def setup(self, params_grads):
        """Prepares states for all given parameter/gradient pairs.

        Args:
            params_grads: FunctionSet or a tuple (pair) of two tuples. For a
                tuple, the first element is a tuple of parameter arrays, and
                the second is a tuple of corresponding gradient arrays.
        """
        if hasattr(params_grads, 'parameters') and \
           hasattr(params_grads, 'gradients'):
            params = getattr(params_grads, 'parameters')
            grads = getattr(params_grads, 'gradients')

        elif isinstance(params_grads, tuple):
            params = params_grads[0]
            grads = params_grads[1]
        else:
            msg = ("'params_grads' must have 'parameters' and 'gradients'"
                   " attributes or tuples, {0} is given")
            raise ValueError(msg)

        self.t = 0
        self.tuples = []
        for p, g in zip(params, grads):
            with cuda.using_device(p):
                state = self.init_state(p, g)
                self.tuples.append((p, g, state))
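A hedged usage sketch for this `setup` variant, assuming the early pycuda-era Chainer API these snippets come from; `FunctionSet`, `F.Linear`, `optimizers.SGD` and `collect_parameters` are assumptions from that era and may differ in other versions.

import chainer.functions as F
from chainer import FunctionSet, optimizers

model = FunctionSet(l1=F.Linear(4, 3))        # parameters live in the FunctionSet
optimizer = optimizers.SGD()

# Either form is accepted by this setup() variant:
optimizer.setup(model)                        # object with .parameters/.gradients
optimizer.setup(model.collect_parameters())   # explicit (params, grads) pair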
Example #2
    def update(self):
        """Updates all parameters and states using corresponding gradients.

        This method iteratively calls :meth:`update_one` for each parameter/
        gradient/state tuple. Beforehand, the :attr:`t` attribute is
        incremented.

        """
        self.t += 1
        for p, g, s in self.tuples:
            with cuda.using_device(p):
                self.update_one(p, g, s)
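A hedged sketch of where `update` fits in a training step, reusing the hypothetical `model` and `optimizer` names from the sketch above; `forward(x, t)` stands for any user-defined forward pass that returns a scalar loss Variable.

optimizer.zero_grads()   # gradients are accumulated in place, so clear them
loss = forward(x, t)     # hypothetical forward pass producing a scalar loss
loss.backward()          # fills the gradient arrays (see backward() below)
optimizer.update()       # increments t and calls update_one for every tuple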
Example #3
    def zero_grads(self):
        """Fills all gradient arrays by zeros.

        This method should be call before backprop takes place, since
        gradients are accumulated on backprop.

        """
        for _, g, _ in self.tuples:
            if isinstance(g, cuda.GPUArray):
                with cuda.using_device(g):
                    g.fill(0)
            else:
                g.fill(0)
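A NumPy-only sketch of why this matters: gradient arrays are updated in place and accumulated, so values left over from the previous iteration would otherwise leak into the next one.

import numpy as np

g = np.array([0.5, -1.0])    # gradient left over from the previous iteration
g.fill(0)                    # what zero_grads() does to each gradient array
g += np.array([0.1, 0.2])    # backprop then accumulates only fresh values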
Example #4
    def weight_decay(self, decay):
        """Applies weight decay to the parameter/gradient pairs.

        Args:
            decay (float): Coefficient of weight decay

        """
        for p, g, _ in self.tuples:
            if isinstance(p, cuda.GPUArray):
                with cuda.using_device(p):
                    cuda.elementwise('float* g, const float* p, float decay',
                                     'g[i] += decay * p[i]',
                                     'weight_decay')(g, p, decay)
            else:
                g += decay * p
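A CPU-only sketch of what the kernel above computes: adding `decay * p` to the gradient is the derivative of an L2 penalty `0.5 * decay * ||p||^2`, so the GPU kernel and the `else` branch do the same thing.

import numpy as np

p = np.array([1.0, -2.0, 3.0], dtype=np.float32)   # a parameter array
g = np.zeros_like(p)                               # its gradient array
decay = 0.01

g += decay * p     # CPU equivalent of the `g[i] += decay * p[i]` kernel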
Example #5
    def setup(self, params_grads):
        """Prepares states for all given parameter/gradient pairs.

        Args:
            params_grads: Tuple (pair) of two tuples. The first element is a
                tuple of parameter arrays, and the second is a tuple of
                corresponding gradient arrays.
                The return value of the :meth:`FunctionSet.collect_parameters`
                method can be used.

        """
        self.t = 0
        self.tuples = []
        for p, g in zip(*params_grads):
            with cuda.using_device(p):
                state = self.init_state(p, g)
                self.tuples.append((p, g, state))
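A NumPy-only sketch of what `zip(*params_grads)` iterates over when the pair of tuples is built by hand.

import numpy as np

params = (np.zeros(3, dtype=np.float32), np.zeros((2, 2), dtype=np.float32))
grads = (np.zeros(3, dtype=np.float32), np.zeros((2, 2), dtype=np.float32))
params_grads = (params, grads)

for p, g in zip(*params_grads):    # pairs each parameter with its gradient
    assert p.shape == g.shape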
Example #6
    def clip_grads(self, maxnorm):
        """Clips the norm of whole gradients up to given threshold.

        Args:
            maxnorm (float): Threshold of gradient L2 norm.

        .. seealso::

            :meth:`compute_grads_norm`
                Used by this method to compute the gradient norm to be
                clipped.

        """
        norm = self.compute_grads_norm()
        if norm > maxnorm:
            ratio = maxnorm / norm
            for _, g, _ in self.tuples:
                with cuda.using_device(g):
                    g *= ratio
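A CPU-only sketch of the same global clipping: every gradient array is scaled by one common ratio, so the combined L2 norm lands exactly on `maxnorm`.

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([12.0])]
maxnorm = 5.0

norm = np.sqrt(sum(float(g.ravel().dot(g.ravel())) for g in grads))  # 13.0
if norm > maxnorm:
    ratio = maxnorm / norm
    for g in grads:
        g *= ratio    # scaled in place, like the GPU branch above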
Example #7
    def to_gpu(self, device=None):
        """Migrates the function to GPU and returns self.

        The default implementation moves all fields of type
        :class:`~numpy.ndarray` onto GPU.

        Args:
            device (int or :class:`pycuda.driver.Device` or ``None``): Device
                ID of the GPU to which the function will be migrated. If this
                is ``None``, the current device is used.

        Returns:
            self.

        """
        with cuda.using_device(device):
            for k, v in six.iteritems(self.__dict__):
                if isinstance(v, numpy.ndarray):
                    setattr(self, k, cuda.to_gpu(v))
                elif (isinstance(v, cuda.GPUArray)
                      and v.gpudata.device != device):
                    setattr(self, k, cuda.copy(v, out_device=device))
        return self
Example #8
    def to_gpu(self, device=None):
        """Migrates the function to GPU and returns self.

        The default implementation moves all fields of type
        :class:`~numpy.ndarray` onto GPU.

        Args:
            device (int or :class:`pycuda.driver.Device` or ``None``): Device
                ID of the GPU to which the function will be migrated. If this
                is ``None``, the current device is used.

        Returns:
            self.

        """
        with cuda.using_device(device):
            for k, v in six.iteritems(self.__dict__):
                if isinstance(v, numpy.ndarray):
                    setattr(self, k, cuda.to_gpu(v))
                elif (isinstance(v, cuda.GPUArray) and
                      v.gpudata.device != device):
                    setattr(self, k, cuda.copy(v, out_device=device))
        return self
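A hedged usage sketch of the `to_gpu` method shown in the two examples above, reusing the hypothetical `model` and `optimizer` names from earlier; moving the model before `setup` lets `init_state` allocate the optimizer state on the GPU as well.

model.to_gpu(0)                               # returns self, so calls can chain
optimizer.setup(model.collect_parameters())   # optimizer state follows the data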
Example #9
    def accumulate_grads(self, grads):
        """Accumulates gradients from other source.

        This method just adds given gradient arrays to gradients that this
        optimizer holds. It is typically used in data-parallel optimization,
        where gradients for different shards are computed in parallel and
        aggregated by this method. This method correctly treats multiple GPU
        devices.

        Args:
            grads (Iterable): Iterable of gradient arrays to be accumulated.

        """
        for (_, g_dst, _), g_src in zip(self.tuples, grads):
            if isinstance(g_dst, numpy.ndarray):
                g_dst += cuda.to_cpu(g_src)
                continue

            with cuda.using_device(g_dst):
                if (isinstance(g_src, cuda.GPUArray)
                        and g_dst.gpudata.device != g_src.gpudata.device):
                    # Copy onto g_dst's device before accumulating.
                    g_dst += cuda.copy(g_src, out_device=g_dst.gpudata.device)
                else:
                    g_dst += cuda.to_gpu(g_src)
Example #10
    def accumulate_grads(self, grads):
        """Accumulates gradients from other source.

        This method just adds given gradient arrays to gradients that this
        optimizer holds. It is typically used in data-parallel optimization,
        where gradients for different shards are computed in parallel and
        aggregated by this method. This method correctly treats multiple GPU
        devices.

        Args:
            grads (Iterable): Iterable of gradient arrays to be accumulated.

        """
        for (_, g_dst, _), g_src in zip(self.tuples, grads):
            if isinstance(g_dst, numpy.ndarray):
                g_dst += cuda.to_cpu(g_src)
                continue

            with cuda.using_device(g_dst):
                if (isinstance(g_src, cuda.GPUArray) and
                        g_dst.gpudata.device != g_src.gpudata.device):
                    # Copy onto g_dst's device before accumulating.
                    g_dst += cuda.copy(g_src, out_device=g_dst.gpudata.device)
                else:
                    g_dst += cuda.to_gpu(g_src)
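A hedged data-parallel sketch under the same assumptions: `model_0` and `model_1` are hypothetical replicas on two GPUs, `forward` is a hypothetical per-shard forward pass, and `optimizer` was set up on `model_0`.

loss_0 = forward(model_0, x_shard_0)
loss_0.backward()
loss_1 = forward(model_1, x_shard_1)
loss_1.backward()

optimizer.accumulate_grads(model_1.gradients)   # add replica 1's gradients
optimizer.update()                              # update replica 0's parameters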
Example #11
    def __call__(self, *inputs):
        """Applies forward propagation with chaining backward references.

        The basic behavior is also described in the documentation of the
        :class:`Function` class. This method first copies the function itself
        to avoid conflicts over multiple invocations.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           reside on a GPU device, the appropriate device is selected before
           the :meth:`forward` method is called, so in most cases the
           implementor does not need to take care of device selection.

        Args:
            inputs: Tuple of input :class:`Variable` objects. All input
                variables must have the same volatile flag.

        Returns:
            One :class:`Variable` object or a tuple of multiple
            :class:`Variable` objects.

        """
        # First copy itself to avoid duplication within the graph.
        self = copy.copy(self)

        if any(x.volatile for x in inputs):  # do not build a graph
            # do not mix multiple volatility
            assert all(x.volatile for x in inputs)

            in_data = tuple(x.data for x in inputs)
            self._check_data_type_forward(in_data)
            with cuda.using_device(*in_data):
                out_data = self.forward(in_data)
            assert type(out_data) == tuple

            outputs = list(
                variable.Variable(y, volatile=True) for y in out_data)
            if len(outputs) == 1:
                return outputs[0]
            return outputs

        # Build graph
        # Be careful that forward references must be weak
        self.inputs = []
        for x in inputs:
            splitter = x.splitter()
            if splitter is None:
                splitter = Split(x)
                x.splitter = weakref.ref(splitter)
            self.inputs.append(splitter.add_branch())

        if self.inputs:
            self.rank = max(x.rank for x in self.inputs)
        else:
            self.rank = 0

        in_data = tuple(x.data for x in self.inputs)
        self._check_data_type_forward(in_data)
        with cuda.using_device(*in_data):
            outputs = self.forward(in_data)
        assert type(outputs) == tuple

        ret = tuple(variable.Variable(y) for y in outputs)
        for y in ret:
            y.set_creator(self)

        # Make forward references weak
        self.outputs = tuple(weakref.ref(y) for y in ret)

        if len(ret) == 1:
            return ret[0]
        return ret
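A hedged sketch of a minimal function consistent with the protocol visible in `__call__` above: `forward` receives a tuple of arrays and must return a tuple of arrays. The import path and the `AddConstant` name are assumptions; a complete implementation would also define `backward`.

from chainer import Function    # assumed import path for this era of the API


class AddConstant(Function):
    """Adds a constant to its single input array."""

    def __init__(self, c):
        self.c = c

    def forward(self, inputs):
        x, = inputs             # inputs is always a tuple of arrays
        return x + self.c,      # trailing comma: forward must return a tuple

# y = AddConstant(2.0)(x)       # x is a Variable; y is a new Variable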
Example #12
def _sqnorm(x):
    if isinstance(x, cuda.GPUArray):
        with cuda.using_device(x):
            return float(cuda.gpuarray.dot(x, x).get())
    x = x.ravel()
    return float(x.dot(x))
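A hedged sketch of how `compute_grads_norm` (referenced by `clip_grads` above) can be expressed in terms of `_sqnorm`: sum the squared norms of every gradient array and take the square root. The body shown here is an assumption, not the library's exact implementation.

import math


def compute_grads_norm(grad_arrays):
    # L2 norm of all gradients viewed as one flat vector.
    return math.sqrt(sum(_sqnorm(g) for g in grad_arrays))

# compute_grads_norm([numpy.array([3.0, 4.0])])   # -> 5.0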
Example #13
    def __call__(self, *inputs):
        """Applies forward propagation with chaining backward references.

        The basic behavior is also described in the documentation of the
        :class:`Function` class. This method first copies the function itself
        to avoid conflicts over multiple invocations.

        .. note::

           If the :data:`~Variable.data` attributes of the input variables
           reside on a GPU device, the appropriate device is selected before
           the :meth:`forward` method is called, so in most cases the
           implementor does not need to take care of device selection.

        Args:
            inputs: Tuple of input :class:`Variable` objects. All input
                variables must have the same volatile flag.

        Returns:
            One :class:`Variable` object or a tuple of multiple
            :class:`Variable` objects.

        """
        # First copy itself to avoid duplication within the graph.
        self = copy.copy(self)

        if any(x.volatile for x in inputs):  # do not build a graph
            # do not mix multiple volatility
            assert all(x.volatile for x in inputs)

            in_data = tuple(x.data for x in inputs)
            self._check_data_type_forward(in_data)
            with cuda.using_device(*in_data):
                out_data = self.forward(in_data)
            assert type(out_data) == tuple

            outputs = list(variable.Variable(y, volatile=True)
                           for y in out_data)
            if len(outputs) == 1:
                return outputs[0]
            return outputs

        # Build graph
        # Be careful that forward references must be weak
        self.inputs = []
        for x in inputs:
            splitter = x.splitter()
            if splitter is None:
                splitter = Split(x)
                x.splitter = weakref.ref(splitter)
            self.inputs.append(splitter.add_branch())

        if self.inputs:
            self.rank = max(x.rank for x in self.inputs)
        else:
            self.rank = 0

        in_data = tuple(x.data for x in self.inputs)
        self._check_data_type_forward(in_data)
        with cuda.using_device(*in_data):
            outputs = self.forward(in_data)
        assert type(outputs) == tuple

        ret = tuple(variable.Variable(y) for y in outputs)
        for y in ret:
            y.set_creator(self)

        # Make forward references weak
        self.outputs = tuple(weakref.ref(y) for y in ret)

        if len(ret) == 1:
            return ret[0]
        return ret
Example #14
    def backward(self, retain_grad=False):
        """Runs error backpropagation (a.k.a. backprop) from this variable.

        On backprop, :meth:`Function.backward` is called on each
        :class:`Function` object appearing in the backward graph starting from
        this variable. The backward graph is represented by backward references
        from variables to their creators, and from functions to their inputs.
        The backprop stops at all root variables. Some functions set ``None``
        as gradients of some inputs, where further backprop does not take place
        at such input variables.

        This method uses :data:`grad` as the initial error array. The user can
        manually set a gradient array before calling this method. If
        :data:`data` contains only one element (i.e., it is a scalar) and
        :data:`grad` is ``None``, then this method automatically supplies 1.0
        as the initial error. This is useful when starting backprop from some
        scalar loss value.

        Args:
            retain_grad (bool): If ``True``, the gradient arrays of all
                intermediate variables are kept. Otherwise, :data:`grad` of
                the intermediate variables is set to ``None`` at an
                appropriate time, which may reduce the maximum memory
                consumption.

                In most cases of training a model, the purpose of backprop is
                to compute gradients of parameters, not of variables, so it is
                recommended to set this flag to ``False``.

        """
        if self.creator is None:
            return

        cand_funcs = []
        seen_set = set()

        # Initialize the error with 1 if this is a loss variable
        if self.data.size == 1 and self.grad is None:
            with cuda.using_device(self.data) as user:
                if user.is_active:
                    self.grad = cuda.ones_like(self.data)
                else:
                    self.grad = numpy.ones_like(self.data)

        def add_cand(cand):
            if cand is not None and cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator)

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            outputs = tuple(y() for y in func.outputs)  # access via weak ref

            in_data = tuple(x.data for x in func.inputs)
            out_grad = tuple(y and y.grad for y in outputs)
            with cuda.using_device(*(in_data + out_grad)):
                gxs = func.backward(in_data, out_grad)
            assert len(gxs) == len(in_data)

            if not retain_grad:
                for y in outputs:
                    if y is not None and y != self:
                        y.grad = None
            for x, gx in zip(func.inputs, gxs):
                x.grad = gx
                if gx is not None:  # skip if gradient does not flow
                    add_cand(x.creator)
Example #15
    def backward(self, retain_grad=False):
        """Runs error backpropagation (a.k.a. backprop) from this variable.

        On backprop, :meth:`Function.backward` is called on each
        :class:`Function` object appearing in the backward graph starting from
        this variable. The backward graph is represented by backward references
        from variables to their creators, and from functions to their inputs.
        The backprop stops at all root variables. Some functions set ``None``
        as gradients of some inputs, where further backprop does not take place
        at such input variables.

        This method uses :data:`grad` as the initial error array. The user can
        manually set a gradient array before calling this method. If
        :data:`data` contains only one element (i.e., it is a scalar) and
        :data:`grad` is ``None``, then this method automatically supplies 1.0
        as the initial error. This is useful when starting backprop from some
        scalar loss value.

        Args:
            retain_grad (bool): If ``True``, the gradient arrays of all
                intermediate variables are kept. Otherwise, :data:`grad` of
                the intermediate variables is set to ``None`` at an
                appropriate time, which may reduce the maximum memory
                consumption.

                In most cases of training a model, the purpose of backprop is
                to compute gradients of parameters, not of variables, so it is
                recommended to set this flag to ``False``.

        """
        if self.creator is None:
            return

        cand_funcs = []
        seen_set = set()

        # Initialize the error with 1 if this is a loss variable
        if self.data.size == 1 and self.grad is None:
            with cuda.using_device(self.data) as user:
                if user.is_active:
                    self.grad = cuda.ones_like(self.data)
                else:
                    self.grad = numpy.ones_like(self.data)

        def add_cand(cand):
            if cand is not None and cand not in seen_set:
                # Negate since heapq is min-heap
                heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
                seen_set.add(cand)

        add_cand(self.creator)

        while cand_funcs:
            _, _, func = heapq.heappop(cand_funcs)
            outputs = tuple(y() for y in func.outputs)  # access via weak ref

            in_data = tuple(x.data for x in func.inputs)
            out_grad = tuple(y and y.grad for y in outputs)
            func._check_data_type_backward(in_data, out_grad)
            with cuda.using_device(*(in_data + out_grad)):
                gxs = func.backward(in_data, out_grad)
            assert len(gxs) == len(in_data)

            if not retain_grad:
                for y in outputs:
                    if y is not None and y != self:
                        y.grad = None
            for x, gx in zip(func.inputs, gxs):
                x.grad = gx
                if gx is not None:  # skip if gradient does not flow
                    add_cand(x.creator)
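A hedged end-to-end sketch tying the methods on this page together, again assuming the early pycuda-era Chainer API; the model, the data shapes, and `optimizers.SGD` are illustrative assumptions.

import numpy as np
import chainer.functions as F
from chainer import FunctionSet, Variable, optimizers

x = Variable(np.random.randn(2, 4).astype(np.float32))   # hypothetical inputs
t = Variable(np.random.randn(2, 3).astype(np.float32))   # hypothetical targets

model = FunctionSet(l1=F.Linear(4, 3))
optimizer = optimizers.SGD()
optimizer.setup(model.collect_parameters())

optimizer.zero_grads()
loss = F.mean_squared_error(model.l1(x), t)   # scalar loss Variable
loss.backward()    # grad is auto-initialized to 1 because loss.data.size == 1
optimizer.update()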