def setup(self, params_grads):
    """Prepares states for all given parameter/gradient pairs.

    Args:
        params_grads: :class:`FunctionSet` object, or a pair of tuples.
            If a pair is given, the first element is a tuple of parameter
            arrays and the second is a tuple of corresponding gradient
            arrays.

    """
    if hasattr(params_grads, 'parameters') and \
       hasattr(params_grads, 'gradients'):
        params = getattr(params_grads, 'parameters')
        grads = getattr(params_grads, 'gradients')
    elif isinstance(params_grads, tuple):
        params = params_grads[0]
        grads = params_grads[1]
    else:
        msg = ("'params_grads' must have 'parameters' and 'gradients'"
               " attributes or be a pair of tuples; {0} was given")
        raise ValueError(msg.format(type(params_grads)))

    self.t = 0
    self.tuples = []
    for p, g in zip(params, grads):
        with cuda.using_device(p):
            state = self.init_state(p, g)
            self.tuples.append((p, g, state))
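# Illustrative usage sketch (not from the source): `SGD` stands for any
# concrete Optimizer subclass and `model` for a FunctionSet-like object
# exposing `parameters` and `gradients` attributes.
optimizer = SGD(lr=0.01)

# Either pass the FunctionSet itself ...
optimizer.setup(model)

# ... or an explicit (parameters, gradients) pair of tuples.
optimizer.setup((model.parameters, model.gradients))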
def update(self):
    """Updates all parameters and states using corresponding gradients.

    This method iteratively calls :meth:`update_one` for each
    parameter/gradient/state tuple. Beforehand, the :attr:`t` attribute is
    incremented.

    """
    self.t += 1
    for p, g, s in self.tuples:
        with cuda.using_device(p):
            self.update_one(p, g, s)
def zero_grads(self):
    """Fills all gradient arrays with zeros.

    This method should be called before backprop takes place, since
    gradients are accumulated during backprop.

    """
    for _, g, _ in self.tuples:
        if isinstance(g, cuda.GPUArray):
            with cuda.using_device(g):
                g.fill(0)
        else:
            g.fill(0)
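# Illustrative sketch of one training iteration with the methods above;
# `forward`, `x`, and `t` are placeholders for a model-specific loss
# computation returning a Variable.
optimizer.zero_grads()   # clear gradients accumulated by previous backprops
loss = forward(x, t)     # run the forward computation
loss.backward()          # accumulate gradients into the parameter arrays
optimizer.update()       # increment t and call update_one for each tuple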
def weight_decay(self, decay):
    """Applies weight decay to the parameter/gradient pairs.

    Args:
        decay (float): Coefficient of weight decay.

    """
    for p, g, _ in self.tuples:
        if isinstance(p, cuda.GPUArray):
            with cuda.using_device(p):
                cuda.elementwise('float* g, const float* p, float decay',
                                 'g[i] += decay * p[i]',
                                 'weight_decay')(g, p, decay)
        else:
            g += decay * p
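# For reference, a pure-NumPy illustration of the decay term that the
# elementwise kernel adds on GPU: each gradient gains decay * parameter.
import numpy

p = numpy.array([1.0, -2.0, 3.0], dtype=numpy.float32)
g = numpy.zeros_like(p)
decay = 0.0005
g += decay * p  # same per-element update as the 'weight_decay' kernel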
def setup(self, params_grads):
    """Prepares states for all given parameter/gradient pairs.

    Args:
        params_grads: Tuple (pair) of two tuples. The first element is a
            tuple of parameter arrays, and the second is a tuple of
            corresponding gradient arrays.
            Return value of the :meth:`FunctionSet.collect_parameters`
            method can be used.

    """
    self.t = 0
    self.tuples = []
    for p, g in zip(*params_grads):
        with cuda.using_device(p):
            state = self.init_state(p, g)
            self.tuples.append((p, g, state))
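# Illustrative usage for this variant: collect_parameters() (mentioned in
# the docstring) returns the (parameters, gradients) pair this setup expects;
# `model` is a placeholder FunctionSet.
optimizer.setup(model.collect_parameters())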
def clip_grads(self, maxnorm):
    """Clips the norm of whole gradients up to given threshold.

    Args:
        maxnorm (float): Threshold of gradient L2 norm.

    .. seealso::

       :meth:`compute_grads_norm`
          It uses this method to compute the gradient norm to be clipped.

    """
    norm = self.compute_grads_norm()
    if norm > maxnorm:
        ratio = maxnorm / norm
        for _, g, _ in self.tuples:
            with cuda.using_device(g):
                g *= ratio
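# clip_grads relies on compute_grads_norm, which is not shown in this
# excerpt; a plausible sketch of that method (an assumption), built on the
# module-level _sqnorm defined later in this section.
import math

def compute_grads_norm(self):
    sqnorm = 0
    for _, g, _ in self.tuples:
        sqnorm += _sqnorm(g)  # squared L2 norm of each gradient array
    return math.sqrt(sqnorm)  # overall L2 norm used by clip_grads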
def to_gpu(self, device=None):
    """Migrates the function to GPU and returns self.

    The default implementation moves all fields of type
    :class:`~numpy.ndarray` onto GPU.

    Args:
        device (int or :class:`pycuda.driver.Device` or ``None``): Device
            ID of the GPU to which the function will be migrated. If this
            is ``None``, the current device is used.

    Returns:
        self.

    """
    with cuda.using_device(device):
        for k, v in six.iteritems(self.__dict__):
            if isinstance(v, numpy.ndarray):
                setattr(self, k, cuda.to_gpu(v))
            elif (isinstance(v, cuda.GPUArray) and
                  v.gpudata.device != device):
                setattr(self, k, cuda.copy(v, out_device=device))
    return self
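# Illustrative usage: move every ndarray field of a function onto GPU 1
# (the device id is arbitrary), or onto the current device with no argument.
func.to_gpu(1)
func.to_gpu()  # device=None: use the current device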
def accumulate_grads(self, grads):
    """Accumulates gradients from other source.

    This method just adds given gradient arrays to gradients that this
    optimizer holds. It is typically used in data-parallel optimization,
    where gradients for different shards are computed in parallel and
    aggregated by this method. This method correctly treats multiple GPU
    devices.

    Args:
        grads (Iterable): Iterable of gradient arrays to be accumulated.

    """
    for (_, g_dst, _), g_src in zip(self.tuples, grads):
        if isinstance(g_dst, numpy.ndarray):
            g_dst += cuda.to_cpu(g_src)
            continue

        with cuda.using_device(g_dst):
            if (isinstance(g_src, cuda.GPUArray) and
                    g_dst.gpudata.device != g_src.gpudata.device):
                # copy the source gradient onto the destination's device
                g_dst += cuda.copy(g_src, out_device=g_dst.gpudata.device)
            else:
                g_dst += cuda.to_gpu(g_src)
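# Rough data-parallel sketch (all names are placeholders): gradients are
# computed on two replicas for different shards, then the second replica's
# gradients are folded into the arrays this optimizer holds before a single
# update.
loss0 = forward(model0, shard0)
loss0.backward()
loss1 = forward(model1, shard1)
loss1.backward()

optimizer.accumulate_grads(model1.gradients)  # add replica 1's gradients
optimizer.update()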
def __call__(self, *inputs):
    """Applies forward propagation with chaining backward references.

    Basic behavior is also expressed in the documentation of the
    :class:`Function` class. This function first copies itself to avoid
    conflicts over multiple invocations.

    .. note::

       If the :data:`~Variable.data` attributes of the input variables
       reside on a GPU device, the appropriate device is selected before
       the :meth:`forward` method is called, so in most cases the
       implementor does not need to take care of device selection.

    Args:
        inputs: Tuple of input :class:`Variable` objects. All input
            variables must have the same volatile flag.

    Returns:
        One :class:`Variable` object or a tuple of multiple
        :class:`Variable` objects.

    """
    # First copy itself to avoid duplication within the graph.
    self = copy.copy(self)

    if any(x.volatile for x in inputs):  # not build graph
        # do not mix multiple volatility
        assert all(x.volatile for x in inputs)

        in_data = tuple(x.data for x in inputs)
        self._check_data_type_forward(in_data)
        with cuda.using_device(*in_data):
            out_data = self.forward(in_data)
        assert type(out_data) == tuple

        outputs = list(
            variable.Variable(y, volatile=True) for y in out_data)
        if len(outputs) == 1:
            return outputs[0]
        return outputs

    # Build graph
    # Be careful that forward references must be weak
    self.inputs = []
    for x in inputs:
        splitter = x.splitter()
        if splitter is None:
            splitter = Split(x)
            x.splitter = weakref.ref(splitter)
        self.inputs.append(splitter.add_branch())

    if self.inputs:
        self.rank = max(x.rank for x in self.inputs)
    else:
        self.rank = 0

    in_data = tuple(x.data for x in self.inputs)
    self._check_data_type_forward(in_data)
    with cuda.using_device(*in_data):
        outputs = self.forward(in_data)
    assert type(outputs) == tuple

    ret = tuple(variable.Variable(y) for y in outputs)
    for y in ret:
        y.set_creator(self)

    # Make forward references weak
    self.outputs = tuple(weakref.ref(y) for y in ret)

    if len(ret) == 1:
        return ret[0]
    return ret
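# Illustrative call pattern, assuming `f` is some concrete Function subclass.
import numpy

x = variable.Variable(numpy.zeros((1, 3), dtype=numpy.float32))
xv = variable.Variable(numpy.zeros((1, 3), dtype=numpy.float32), volatile=True)

y = f(x)    # copies f, builds weak backward references, returns Variable(s)
yv = f(xv)  # volatile path: forward only, no graph is built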
def _sqnorm(x):
    if isinstance(x, cuda.GPUArray):
        with cuda.using_device(x):
            return float(cuda.gpuarray.dot(x, x).get())
    x = x.ravel()
    return float(x.dot(x))
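# Tiny check of the CPU branch: _sqnorm returns the squared L2 norm of the
# flattened array (the GPU branch computes the same dot product on device).
import numpy

x = numpy.array([[3.0, 4.0]], dtype=numpy.float32)
assert _sqnorm(x) == 25.0  # 3^2 + 4^2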
def backward(self, retain_grad=False):
    """Runs error backpropagation (a.k.a. backprop) from this variable.

    On backprop, :meth:`Function.backward` is called on each
    :class:`Function` object appearing in the backward graph starting from
    this variable. The backward graph is represented by backward references
    from variables to their creators, and from functions to their inputs.
    The backprop stops at all root variables. Some functions set ``None``
    as gradients of some inputs, where further backprop does not take place
    at such input variables.

    This method uses :data:`grad` as the initial error array. The user can
    manually set a gradient array before calling this method. If
    :data:`data` contains only one element (i.e., it is a scalar) and
    :data:`grad` is ``None``, then this method automatically uses 1.0 as
    the initial error. This is useful when starting backprop from a scalar
    loss value.

    Args:
        retain_grad (bool): If ``True``, the gradient arrays of all
            intermediate variables are kept. Otherwise, :data:`grad` of the
            intermediate variables is set to ``None`` at the appropriate
            time, which may reduce the maximum memory consumption.

            In most cases of training some model, the purpose of backprop
            is to compute gradients of parameters, not of variables, so it
            is recommended to set this flag ``False``.

    """
    if self.creator is None:
        return

    cand_funcs = []
    seen_set = set()

    # Initialize the error with ones if this is a loss variable
    if self.data.size == 1 and self.grad is None:
        with cuda.using_device(self.data) as user:
            if user.is_active:
                self.grad = cuda.ones_like(self.data)
            else:
                self.grad = numpy.ones_like(self.data)

    def add_cand(cand):
        if cand is not None and cand not in seen_set:
            # Negate since heapq is a min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator)

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        outputs = tuple(y() for y in func.outputs)  # access via weak ref

        in_data = tuple(x.data for x in func.inputs)
        out_grad = tuple(y and y.grad for y in outputs)
        with cuda.using_device(*(in_data + out_grad)):
            gxs = func.backward(in_data, out_grad)
        assert len(gxs) == len(in_data)

        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self:
                    y.grad = None
        for x, gx in zip(func.inputs, gxs):
            x.grad = gx
            if gx is not None:  # skip if gradient does not flow
                add_cand(x.creator)
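# Minimal usage sketch: `loss` is a scalar Variable from some forward pass.
# Since loss.grad is None and loss.data has one element, backward() seeds
# the initial error with ones automatically.
loss.backward()

# Keep gradients of intermediate variables too (higher memory use).
loss.backward(retain_grad=True)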
def backward(self, retain_grad=False):
    """Runs error backpropagation (a.k.a. backprop) from this variable.

    On backprop, :meth:`Function.backward` is called on each
    :class:`Function` object appearing in the backward graph starting from
    this variable. The backward graph is represented by backward references
    from variables to their creators, and from functions to their inputs.
    The backprop stops at all root variables. Some functions set ``None``
    as gradients of some inputs, where further backprop does not take place
    at such input variables.

    This method uses :data:`grad` as the initial error array. The user can
    manually set a gradient array before calling this method. If
    :data:`data` contains only one element (i.e., it is a scalar) and
    :data:`grad` is ``None``, then this method automatically uses 1.0 as
    the initial error. This is useful when starting backprop from a scalar
    loss value.

    Args:
        retain_grad (bool): If ``True``, the gradient arrays of all
            intermediate variables are kept. Otherwise, :data:`grad` of the
            intermediate variables is set to ``None`` at the appropriate
            time, which may reduce the maximum memory consumption.

            In most cases of training some model, the purpose of backprop
            is to compute gradients of parameters, not of variables, so it
            is recommended to set this flag ``False``.

    """
    if self.creator is None:
        return

    cand_funcs = []
    seen_set = set()

    # Initialize the error with ones if this is a loss variable
    if self.data.size == 1 and self.grad is None:
        with cuda.using_device(self.data) as user:
            if user.is_active:
                self.grad = cuda.ones_like(self.data)
            else:
                self.grad = numpy.ones_like(self.data)

    def add_cand(cand):
        if cand is not None and cand not in seen_set:
            # Negate since heapq is a min-heap
            heapq.heappush(cand_funcs, (-cand.rank, len(seen_set), cand))
            seen_set.add(cand)

    add_cand(self.creator)

    while cand_funcs:
        _, _, func = heapq.heappop(cand_funcs)
        outputs = tuple(y() for y in func.outputs)  # access via weak ref

        in_data = tuple(x.data for x in func.inputs)
        out_grad = tuple(y and y.grad for y in outputs)
        func._check_data_type_backward(in_data, out_grad)
        with cuda.using_device(*(in_data + out_grad)):
            gxs = func.backward(in_data, out_grad)
        assert len(gxs) == len(in_data)

        if not retain_grad:
            for y in outputs:
                if y is not None and y is not self:
                    y.grad = None
        for x, gx in zip(func.inputs, gxs):
            x.grad = gx
            if gx is not None:  # skip if gradient does not flow
                add_cand(x.creator)