def backward(self, grad):
    # We need to keep the information on which axis the sum was made
    # (to be broadcasting compatible).
    # We always reshape the gradient along the same axis for back-propagation.
    tensor, = self.tensors
    data_keepdims = tensor.sum(axis=self.axis, keepdims=True)
    grad = grad.reshape(data_keepdims.shape) + nets.zeros_like(tensor)
    return grad
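# Illustration only (plain NumPy, not this library's API): the reshape +
# zeros_like trick above broadcasts the reduced gradient back to the input
# shape, which is exactly the gradient of a sum.
import numpy as np

x = np.arange(6, dtype=float).reshape(2, 3)
upstream = np.ones(3)                                  # gradient w.r.t. x.sum(axis=0), shape (3,)
keepdims_shape = x.sum(axis=0, keepdims=True).shape    # (1, 3)
downstream = upstream.reshape(keepdims_shape) + np.zeros_like(x)
assert downstream.shape == x.shape                     # every input element receives the broadcast gradient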
def __init__(self, parameters, lr=1e-2, beta1=0.9, beta2=0.999, epsilon=1e-8):
    super().__init__(parameters)
    self.lr = lr
    self.beta1 = beta1
    self.beta2 = beta2
    self.epsilon = epsilon
    self._cache = {
        'velocity': [nets.zeros_like(p) for p in self.parameters],
        'momentum': [nets.zeros_like(p) for p in self.parameters],
        't': 0
    }
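# Sketch only (plain NumPy): the 'momentum', 'velocity' and 't' buffers are the
# usual Adam state; the step() method is not shown above, but it would
# typically apply an update of this form (hypothetical helper, not the
# library's API).
import numpy as np

def adam_step(p, g, m, v, t, lr=1e-2, beta1=0.9, beta2=0.999, epsilon=1e-8):
    t += 1
    m = beta1 * m + (1 - beta1) * g           # first-moment ('momentum') estimate
    v = beta2 * v + (1 - beta2) * g ** 2      # second-moment ('velocity') estimate
    m_hat = m / (1 - beta1 ** t)              # bias correction
    v_hat = v / (1 - beta2 ** t)
    p = p - lr * m_hat / (np.sqrt(v_hat) + epsilon)
    return p, m, v, t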
def relu_prime(x):
    # type: (Array) -> Array
    r"""First order derivative of the ``relu`` function.

    .. math::
        \text{relu'}(x) =
            \begin{cases}
              1, &\quad x \ge 0 \\
              0, &\quad x < 0.
            \end{cases}

    Shape:
        - input: x (numpy.array): input to compute the ``relu`` derivative on.
        - output: y (numpy.array): gradient of the input, with the same shape as ``x``.

    .. image:: images/functional_relu_prime.png

    Examples::

        >>> in_array = np.array([-5, 2, 6, -2, 4])
        >>> out_array = relu_prime(in_array)

    See :class:`~nets.nn.activation.ReLU` for the activation implementation.
    """
    return where(x >= 0, nets.ones_like(x), nets.zeros_like(x))
def backward(self, grad):
    tensor, = self.tensors
    bigger_grad = nets.zeros_like(tensor)
    if grad.shape != bigger_grad.shape:
        # Scatter the upstream gradient back into the sliced positions
        bigger_grad[self.indices] = grad
    else:
        bigger_grad = grad
    return bigger_grad
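# Illustration only (plain NumPy): slicing touches only part of the input, so
# its backward pass scatters the upstream gradient into a zero tensor of the
# input's shape, as bigger_grad[self.indices] = grad does above.
import numpy as np

x = np.arange(5, dtype=float)
upstream = np.array([10., 20., 30.])          # gradient w.r.t. x[1:4]
downstream = np.zeros_like(x)
downstream[1:4] = upstream                    # -> [0., 10., 20., 30., 0.]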
def __init__(self, parameters, lr=1e-2, decay=0.99, epsilon=1e-8):
    super().__init__(parameters)
    self.lr = lr
    self.decay = decay
    self.epsilon = epsilon
    self._cache = {
        'velocity': [nets.zeros_like(p) for p in self.parameters]
    }
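# Sketch only (plain NumPy): with a single 'velocity' buffer plus 'decay' and
# 'epsilon', the step() method (not shown above) would typically perform the
# standard RMSprop update (hypothetical helper, not the library's API).
import numpy as np

def rmsprop_step(p, g, v, lr=1e-2, decay=0.99, epsilon=1e-8):
    v = decay * v + (1 - decay) * g ** 2      # running average of squared gradients
    p = p - lr * g / (np.sqrt(v) + epsilon)
    return p, v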
def backward(self, grad):
    tensor, = self.tensors
    bigger_grad = nets.zeros_like(tensor)
    nc = numpy_or_cupy(grad)
    if self.axis is None:
        # If there is no axis, the argmax is the location of the maximum single element
        max_indices = nets.unravel_index(nets.argmax(tensor), tensor.shape)
        bigger_grad[max_indices] = grad
    else:
        # If there is an axis, we reconstruct the bigger matrix by 'rolling' on this axis
        max_indices = nets.argmax(tensor, axis=self.axis)
        for i, roll in enumerate(nets.rollaxis(bigger_grad, self.axis)):
            roll += (max_indices == i).astype(int) * grad
    return bigger_grad
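# Illustration only (plain NumPy): the backward pass of a max/argmax reduction
# routes the upstream gradient to the arg-max positions only; rolling over the
# reduced axis above builds the same one-hot mask shown here.
import numpy as np

x = np.array([[1., 5., 2.],
              [7., 0., 3.]])
upstream = np.array([10., 20.])               # gradient w.r.t. x.max(axis=1)
mask = (x == x.max(axis=1, keepdims=True)).astype(float)
downstream = mask * upstream[:, None]         # gradient lands on each row's maximum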
def backward(self, dout):
    """Computes the backward pass of a vanilla RNN and saves the parameter
    gradients in the ``_grads`` attribute.

    Args:
        dout (Tensor): upstream gradient.

    Returns:
        Tensor: downstream gradient.
    """
    # Initialize gradients as zero
    dw_ih = nets.zeros_like(self.weight_ih)
    dw_hh = nets.zeros_like(self.weight_hh)
    dw_ho = nets.zeros_like(self.weight_ho)
    db_h = nets.zeros_like(self.bias_h)
    db_o = nets.zeros_like(self.bias_o)
    # Get the cache
    hidden_states = self._cache['hidden_states']
    inputs = self._cache['x']
    # Keep track of the hidden state derivative
    dh_t = nets.zeros_like(hidden_states[0])
    # For each element in the output sequence.
    # NB: we iterate backwards, i.e. t = N, N-1, ..., 1, 0
    for t in reversed(range(dout.shape[0])):
        # Back-propagate into the output sigmoid
        do = nets.sigmoid_prime(dout[t])
        db_o += do
        # Back-propagate into weight_ho
        dw_ho += nets.dot(hidden_states[t].T, do)
        # Back-propagate into h_t
        dh = nets.dot(do, self.weight_ho.T) + dh_t
        # Back-propagate through the tanh non-linearity
        df = nets.tanh_prime(hidden_states[t]) * dh
        db_h += df
        # Back-propagate into weight_ih
        dw_ih += nets.dot(inputs[t].T, df)
        # Back-propagate into weight_hh
        dw_hh += nets.dot(hidden_states[t - 1].T, df)
        dh_t = nets.dot(df, self.weight_hh.T)
    # TODO: dx grad
    # dx = nets.dot(dout, self.weight_ih)
    # Save gradients
    self._grads["weight_ih"] = dw_ih
    self._grads["weight_hh"] = dw_hh
    self._grads["weight_ho"] = dw_ho
    self._grads["bias_h"] = db_h
    self._grads["bias_o"] = db_o
    return None
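# Sketch only (an assumption, since the forward pass is not shown above): the
# backward loop mirrors a forward recurrence of roughly this form, whose hidden
# states and inputs are what _cache stores (hypothetical helper, plain NumPy,
# not the library's API).
import numpy as np

def rnn_forward_step(x_t, h_prev, w_ih, w_hh, w_ho, b_h, b_o):
    h_t = np.tanh(x_t @ w_ih + h_prev @ w_hh + b_h)    # hidden state (tanh non-linearity)
    o_t = 1.0 / (1.0 + np.exp(-(h_t @ w_ho + b_o)))    # sigmoid output, matching sigmoid_prime above
    return h_t, o_t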
def relu(t):
    r"""``relu`` is a standard activation function, defined as:

    .. math::
        \text{relu}(t) = \max{(0, t)}

    Args:
        t (Tensor): input tensor.

    .. image:: /images/functional_relu.png

    Example:
        >>> import nets
        >>> tensor = nets.tensor([-5, 2, 6, -2, 4])
        >>> tensor = relu(tensor)

    See :class:`~nets.nn.activation.ReLU` for the activation implementation.
    """
    t = nets.to_tensor(t)
    return maximum(nets.zeros_like(t), t)
def relu(x):
    # type: (Array) -> Array
    r"""``relu`` is a standard activation function, defined as:

    .. math::
        \text{relu}(x) = \max{(0, x)}

    Shape:
        - input: x (numpy.array): input to compute the ``relu`` function on.
        - output: y (numpy.array): ``relu`` output, with the same shape as ``x``.

    .. image:: images/functional_relu.png

    Examples::

        >>> in_array = np.array([-5, 2, 6, -2, 4])
        >>> out_array = relu(in_array)

    See :class:`~nets.nn.activation.ReLU` for the activation implementation.
    """
    return maximum(nets.zeros_like(x), x)
def relu_prime(t):
    r"""First order derivative of the ``relu`` function.

    .. math::
        \text{relu'}(t) =
            \begin{cases}
              1, &\quad t \ge 0 \\
              0, &\quad t < 0.
            \end{cases}

    Args:
        t (Tensor): input tensor.

    .. image:: images/functional_relu_prime.png

    Example:
        >>> import nets
        >>> tensor = nets.tensor([-5, 2, 6, -2, 4])
        >>> relu_prime(tensor)

    See :class:`~nets.nn.activation.ReLU` for the activation implementation.
    """
    t = nets.to_tensor(t)
    return where(t >= 0, nets.ones_like(t), nets.zeros_like(t))
def __init__(self, parameters, lr=1e-2, momentum=0):
    super().__init__(parameters)
    self.lr = lr
    self.momentum = momentum
    self._cache = {'velocity': [nets.zeros_like(p) for p in self.parameters]}
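# Sketch only (plain NumPy): the 'velocity' buffer supports the classic
# SGD-with-momentum update; step() is not shown above, but it would typically
# look like this (hypothetical helper, not the library's API).
import numpy as np

def sgd_momentum_step(p, g, v, lr=1e-2, momentum=0.9):
    v = momentum * v + g                      # accumulate velocity
    p = p - lr * v
    return p, v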