def backward(self, grad, *, graph, **kwargs):
    """ Back-propagates the gradient through all of the operation's inputs.
    Constant tensors do not propagate a gradient.

    Parameters
    ----------
    grad : numpy.ndarray
        The back-propagated total derivative with respect to the present
        operation (`f`): d(out)/df

    graph : Set[Operation]"""
    for index, var in enumerate(self.variables):
        if not var.constant:
            if not var._ops:
                raise Exception(
                    "Invalid Backprop: part of the computational graph containing "
                    "this tensor was cleared prior to backprop"
                )

            if var.grad is None:
                tmp_grad = reduce_broadcast(
                    self.backward_var(grad, index, **kwargs), var.shape
                )
                var.grad = (
                    np.copy(tmp_grad)
                    if np.shares_memory(tmp_grad, grad)
                    else tmp_grad
                )
            else:
                var.grad += reduce_broadcast(
                    self.backward_var(grad, index, **kwargs), var.shape
                )

    for var in {
        i for i in self.variables if not i.constant and i.creator is not None
    }:
        var._accum_ops.add(self)
        var._backward(graph=graph)
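# `backward` and the tests below all rely on `reduce_broadcast`, which is not shown in
# this section. The following is a hypothetical, minimal re-implementation sketch,
# assuming the same `(grad, var_shape)` signature and only the behavior exercised by
# the tests: raise ValueError when `grad` has fewer dimensions than the variable,
# sum away leading axes introduced by broadcasting, and keepdim-sum axes that were
# stretched from size 1.
import numpy as np


def _reduce_broadcast_sketch(grad, var_shape):
    """Collapse a broadcast gradient back onto the shape of its variable (sketch)."""
    if grad.ndim < len(var_shape):
        raise ValueError("gradient has fewer dimensions than its associated variable")
    if grad.shape == tuple(var_shape):
        return grad
    # sum over the leading axes that broadcasting prepended
    if grad.ndim > len(var_shape):
        grad = grad.sum(axis=tuple(range(grad.ndim - len(var_shape))))
    # keepdim-sum over axes that were stretched from size 1
    keepdim_axes = tuple(
        n for n, (i, j) in enumerate(zip(var_shape, grad.shape)) if i == 1 and j > 1
    )
    if keepdim_axes:
        grad = grad.sum(axis=keepdim_axes, keepdims=True)
    return grad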
def test_bad_gradient_dimensionality(shapes: Tuple[Tuple[int, ...], Tuple[int, ...]]):
    """ test that grad.ndim < len(var_shape) raises ValueError"""
    var_shape = shapes[0]
    grad = np.empty(shapes[1])
    with raises(ValueError):
        reduce_broadcast(grad=grad, var_shape=var_shape)
def test_reduce_broadcast_shape_consistency(shapes: hnp.BroadcastableShapes):
    grad = np.zeros(shapes.result_shape)
    assert (
        reduce_broadcast(grad, var_shape=shapes.input_shapes[0]).shape
        == shapes.input_shapes[0]
    )
    assert (
        reduce_broadcast(grad, var_shape=shapes.input_shapes[1]).shape
        == shapes.input_shapes[1]
    )
def test_hybrid_broadcasting(grad):
    """ tests new-dim and keep-dim broadcasting: (3, 1, 2) -> (5, 3, 4, 2)"""
    var_shape = (3, 1, 2)
    reduced = reduce_broadcast(grad=grad, var_shape=var_shape)
    answer = grad.sum(axis=0).sum(axis=-2, keepdims=True)
    assert_allclose(actual=reduced, desired=answer)
def test_reduce_broadcast_nokeepdim(var_shape, data):
    """ example broadcasting: (2, 3) -> (5, 2, 3)"""
    grad = data.draw(
        hnp.arrays(
            dtype=float,
            shape=broadcastable_shape(
                shape=var_shape,
                min_dim=len(var_shape) + 1,
                max_dim=len(var_shape) + 3,
            ),
            elements=st.just(1.0),
        ),
        label="grad",
    )
    assume(1 not in grad.shape[-len(var_shape):])
    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    reduced_grad *= np.prod(var_shape) / grad.size  # scale reduced-grad so all elements are 1
    assert_allclose(actual=reduced_grad, desired=np.ones(var_shape))
def test_reduce_broadcast_nokeepdim(var_shape, data):
    """ example broadcasting: (2, 3) -> (5, 2, 3)"""
    grad_shape = data.draw(
        broadcastable_shapes(
            shape=var_shape,
            min_dims=len(var_shape) + 1,
            max_dims=len(var_shape) + 3,
            min_side=2,
        ),
        label="grad_shape",
    )
    grad = np.ones(grad_shape, dtype=float)
    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    reduced_grad *= np.prod(var_shape) / grad.size  # scale reduced-grad so all elements are 1
    assert_allclose(actual=reduced_grad, desired=np.ones(var_shape))
def test_reduce_broadcast_keepdim(var_shape, data):
    """ example broadcasting: (2, 1, 4) -> (2, 5, 4)"""
    grad = data.draw(
        hnp.arrays(
            dtype=float,
            shape=broadcastable_shape(
                shape=var_shape, min_dim=len(var_shape), max_dim=len(var_shape)
            ),
            elements=st.just(1.0),
        ),
        label="grad",
    )

    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    assert reduced_grad.shape == tuple(
        i if i < j else j for i, j in zip(var_shape, grad.shape)
    )
    assert all(i == 1 for i, j in zip(var_shape, grad.shape) if i < j)
    sum_axes = tuple(
        n for n, (i, j) in enumerate(zip(var_shape, grad.shape)) if i != j
    )
    assert_allclose(
        actual=reduced_grad, desired=grad.sum(axis=sum_axes, keepdims=True)
    )
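# Illustrative context for the keep-dim case above (not part of the test suite):
# for out = x * y with x.shape == (2, 1, 4) broadcast against y.shape == (2, 5, 4),
# d(out.sum())/dx is y summed over the stretched axis with keepdims=True, which is
# exactly the reduction that `reduce_broadcast` applies to the incoming gradient.
import numpy as np

x = np.random.rand(2, 1, 4)
y = np.random.rand(2, 5, 4)

grad_out = np.ones((2, 5, 4))                     # d(out.sum())/d(out)
grad_x_full = grad_out * y                        # chain rule before reducing broadcasting
grad_x = grad_x_full.sum(axis=1, keepdims=True)   # collapse the broadcast axis

assert grad_x.shape == x.shape
np.testing.assert_allclose(grad_x, y.sum(axis=1, keepdims=True))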
def backward_var(self, grad, index, **kwargs):
    """
    example
    -------
    fwd:           "ijk, k -> ji", x, y
    bkwd (var: 0): "ji, k -> ijk", grad, y
    bkwd (var: 1): "ji, ijk -> k", grad, x
    """
    # ijk, k
    in_lbls = copy(self.in_lbls)
    original_var_lbl = in_lbls.pop(index)
    var = self.variables[index]

    factor = self.cache[(var, original_var_lbl)]
    if factor == 0:
        # the gradient for the current tensor-label pair
        # has already been computed, scaled, and back-propped;
        # skip the gradient calculation.
        raise SkipGradient()

    numpy_arrays = tuple(i.data for i in self.variables)
    self.cache[(var, original_var_lbl)] = 0

    var_lbl = _unique_from_end(original_var_lbl)
    repeat_lbls = len(var_lbl) != len(original_var_lbl)

    if repeat_lbls:
        # example fwd-prop: einsum("iji -> ij", x)
        # "iji" becomes "ji"; later we will write along
        # the diagonal of an array to reinstate this axis that
        # we just removed
        mapping_gen = (
            {k: v for k, v in zip(lbl, arr.shape)}
            for lbl, arr in zip(self.in_lbls, numpy_arrays)
        )
        lbl_to_size = _merge_max_mappings(*mapping_gen)
        var_shape = tuple(lbl_to_size[lbl] for lbl in var_lbl)
    else:
        var_shape = self.variables[index].shape

    # ji
    grad_lbl = self.out_lbls

    # Catch indices over which an un-contracted sum was performed
    # for the given variable: e.g. for var-0 in "ijk, jk -> k",
    # i is summed over without contraction with another tensor
    #
    # Backpropping through this is illegal, as it requires the creation
    # of an axis; e.g. k, jk -> ijk
    # Broadcast the gradient along all such dimensions; e.g. k -> ik,
    # then proceed as usual; e.g. ik, jk -> ijk
    unique_in_lbls = set(chain.from_iterable(in_lbls)) | set(grad_lbl)
    if len(set(var_lbl) - unique_in_lbls) > 0:
        exp_dims = [slice(None) for i in range(grad.ndim)]
        grad_shape = list(grad.shape)
        for n, lbl in enumerate(var_lbl):
            if lbl not in unique_in_lbls:
                grad_lbl = grad_lbl[:n] + lbl + grad_lbl[n:]
                exp_dims.insert(n, np.newaxis)
                grad_shape.insert(n, var_shape[n])

        grad = np.broadcast_to(
            grad if not grad.ndim else grad[tuple(exp_dims)], grad_shape
        )

    # "ji, k -> ijk"
    back_prop_lbls = ",".join([grad_lbl] + in_lbls) + "->" + var_lbl

    # (grad, y)
    operands = (grad,) + numpy_arrays[:index] + numpy_arrays[index + 1:]

    if not repeat_lbls:
        # dfdx: einsum("ji, k -> ijk", grad, y)
        outshape = self.variables[index].shape
        dfdx = reduce_broadcast(
            np.einsum(back_prop_lbls, *operands, optimize=self.optimize), outshape
        )
        if var_shape != dfdx.shape:
            # if y was broadcast over x, the gradient needs to
            # be broadcast to x's shape: dfdx-shape (i,j,1) -> (i,j,k)
            dfdx = np.broadcast_to(dfdx, var_shape)
        if factor > 1:
            # This tensor-label pair appears several times as
            # input to einsum. Scale the gradient accordingly
            # such that the full contribution of the tensor-label
            # pair is accounted for.
            dfdx *= factor
        return dfdx

    # Accommodate trace by writing to a strided view on an array of zeros.
    # For example:
    #
    #   fwd:  einsum('ijkji, k -> jk', x, y)
    #   dfdx: einsum('jk, k -> kji', grad, y, out=view_of_x)
    #
    # writing to `view_of_x`, which is a view along the appropriate
    # diagonals of x, is equivalent to:
    #
    #   dfdx: einsum('jk, k -> ijkji', grad, y)
    #
    # which is formally correct but not supported by einsum.
    dfdx = np.zeros(tuple(lbl_to_size[i] for i in original_var_lbl))
    out_view_shape = tuple(lbl_to_size[i] for i in var_lbl)

    # compute strides required to traverse the appropriate diagonals of
    # the output tensor.
    strides = tuple(
        sum(dfdx.strides[ind] for ind in _get_indices(lbl, original_var_lbl))
        for lbl in var_lbl
    )
    out_view = as_strided(dfdx, shape=out_view_shape, strides=strides)
    np.einsum(back_prop_lbls, *operands, out=out_view, optimize=self.optimize)
    if factor > 1:
        # This tensor-label pair appears several times as
        # input to einsum. Scale the gradient accordingly
        # such that the full contribution of the tensor-label
        # pair is accounted for.
        dfdx *= factor
    return dfdx
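# A standalone numpy check of the label-swap pattern documented in `backward_var`'s
# docstring (illustrative only; it does not exercise the Operation machinery above).
# For fwd einsum("ijk,k->ji", x, y) with upstream gradient G of shape (j, i):
#   dL/dx = einsum("ji,k->ijk", G, y)   and   dL/dy = einsum("ji,ijk->k", G, x)
# The dL/dy result is verified here against a complex-step derivative of
# L = sum(G * einsum("ijk,k->ji", x, y)).
import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(2, 3, 4))
y = rng.normal(size=4)
G = rng.normal(size=(3, 2))  # gradient w.r.t. the (j, i)-shaped output

dfdx = np.einsum("ji,k->ijk", G, y)
dfdy = np.einsum("ji,ijk->k", G, x)

h = 1e-20
num_dfdy = np.array(
    [
        (G * np.einsum("ijk,k->ji", x, y + h * 1j * np.eye(4)[k])).sum().imag / h
        for k in range(4)
    ]
)
np.testing.assert_allclose(dfdy, num_dfdy)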
def test_reduce_broadcast_same_shape(grad):
    """ test when no broadcasting occurred"""
    var_shape = grad.shape
    reduced_grad = reduce_broadcast(grad=grad, var_shape=var_shape)
    assert_allclose(actual=reduced_grad, desired=grad)
def test_broadcast_scalar(grad):
    """ test when grad was broadcast from a scalar"""
    assert_allclose(reduce_broadcast(grad, tuple()), grad.sum())
def finite_difference(
    f,
    *args,
    back_grad,
    vary_ind=None,
    h=Decimal(1) / Decimal(int(1e8)),
    as_decimal=False,
    kwargs=None,
):
    """ Computes numerical partial derivatives of f(x0, x1, ...) in each of its
    variables, using the central difference method.

    This is a "fast" method - it varies entire arrays at once. Thus it is only
    appropriate for trivial vectorized functions that map across entries of
    arrays (like add or multiply). E.g. matrix multiplication is *not* suited
    for this style of gradient.

    Parameters
    ----------
    f : Callable[[numpy.ndarray, ...], numpy.ndarray]
        f(x, ...) -> numpy.ndarray

    *args : Tuple[numpy.ndarray, ...]
        The input arguments to be fed to f.

    back_grad : numpy.ndarray
        The gradient being back-propagated to x and y, via f

    vary_ind : Optional[Tuple[int, ...]]
        If `None`, the partials of f with respect to all the inputs are
        computed. Otherwise you can specify a sequence of the indices of the
        variables whose partials are to be computed: 0 -> w.r.t. x only,
        1 -> w.r.t. y only, etc.

    h : float, optional, (default=Decimal(1E-8))
        Approximating infinitesimal.

    as_decimal : bool, optional (default=False)
        If True, f's arguments are passed as Decimal-type arrays. This
        improves numerical precision, but is not permitted by some functions.

    kwargs : Optional[Dict]

    Returns
    -------
    Tuple[Union[NoneType, numpy.ndarray], ...]
        df/dx0, df/dx1, ... - evaluated at (`x0`, `x1`, ... ).
    """

    def to_decimal_array(arr):
        """ Convert numpy ND-array to Decimal-type object array of the same shape.
        Used for facilitating high-precision arithmetic.

        Parameters
        ----------
        arr : Union[float, numpy.ndarray]

        Returns
        -------
        numpy.ndarray
            Decimal-type object array"""
        arr = np.asarray(arr)
        if arr.dtype.kind == "O":
            return arr
        return np.array(
            tuple(Decimal(float(i)) for i in arr.flat), dtype=Decimal
        ).reshape(arr.shape)

    if kwargs is None:
        kwargs = {}

    if not args:
        raise ValueError("At least one value must be passed to `args`")

    h = Decimal(h) if as_decimal else float(h)
    two_h = Decimal(2) * h if as_decimal else 2 * h

    args = tuple(to_decimal_array(i) if as_decimal else i for i in args)
    grads = [None] * len(args)

    def gen_fwd_diff(i):
        # x1, ..., x_i + h, ..., xn
        return ((var if j != i else var + h) for j, var in enumerate(args))

    def gen_bkwd_diff(i):
        # x1, ..., x_i - h, ..., xn
        return ((var if j != i else var - h) for j, var in enumerate(args))

    for n in range(len(args)):
        if vary_ind is not None and n not in vary_ind:
            continue
        # central difference in variable n
        dvar = (
            f(*gen_fwd_diff(n), **kwargs) - f(*gen_bkwd_diff(n), **kwargs)
        ) / two_h
        grads[n] = reduce_broadcast(back_grad * dvar.astype(float), args[n].shape)

    return grads
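# A minimal usage sketch for `finite_difference` (the values here are illustrative,
# not taken from the test suite): the central-difference estimates of d(x*y)/dx and
# d(x*y)/dy should match y and x respectively when back_grad is all ones.
import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([4.0, 5.0, 6.0])

dx, dy = finite_difference(lambda a, b: a * b, x, y, back_grad=np.ones_like(x))

np.testing.assert_allclose(dx, y, atol=1e-5)
np.testing.assert_allclose(dy, x, atol=1e-5)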
def numerical_gradient(f, *args, back_grad, vary_ind=None, h=1e-20, kwargs=None):
    """ Computes numerical partial derivatives of f(x0, x1, ...) in each of its
    variables, using the complex-step method.

    This is a "fast" method - it varies entire arrays at once. Thus it is only
    appropriate for trivial vectorized functions that map across entries of
    arrays (like add or multiply). E.g. matrix multiplication is *not* suited
    for this style of gradient.

    Parameters
    ----------
    f : Callable[[numpy.ndarray, ...], numpy.ndarray]
        f(x, ...) -> numpy.ndarray

    *args : Tuple[numpy.ndarray, ...]
        The input arguments to be fed to f.

    back_grad : numpy.ndarray
        The gradient being back-propagated to x and y, via f

    vary_ind : Optional[Tuple[int, ...]]
        If `None`, the partials of f with respect to all the inputs are
        computed. Otherwise you can specify a sequence of the indices of the
        variables whose partials are to be computed: 0 -> w.r.t. x only,
        1 -> w.r.t. y only, etc.

    h : float, optional, (default=1e-20)
        Approximating infinitesimal.

    kwargs : Optional[Dict]

    Returns
    -------
    Tuple[Union[NoneType, numpy.ndarray], ...]
        df/dx0, df/dx1, ... - evaluated at (`x0`, `x1`, ... ).
    """
    if kwargs is None:
        kwargs = {}

    if not args:
        raise ValueError("At least one value must be passed to `args`")

    args = tuple(i.astype(np.complex128) for i in args)
    grads = [None] * len(args)

    def gen_fwd_diff(i):
        # x1, ..., x_i + h*1j, ..., xn
        return ((var if j != i else var + h * 1j) for j, var in enumerate(args))

    for n in range(len(args)):
        if vary_ind is not None and n not in vary_ind:
            continue
        # complex-step derivative in variable n
        dvar = f(*gen_fwd_diff(n), **kwargs).imag / h
        grads[n] = reduce_broadcast(back_grad * dvar, args[n].shape)

    return grads
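# Companion sketch for `numerical_gradient` (illustrative values, not from the test
# suite): the complex-step method recovers near machine-precision partials for a
# smooth elementwise function such as x * sin(y), with no finite-difference
# cancellation error.
import numpy as np

x = np.array([1.0, 2.0, 3.0])
y = np.array([0.1, 0.2, 0.3])

dx, dy = numerical_gradient(
    lambda a, b: a * np.sin(b), x, y, back_grad=np.ones_like(x)
)

np.testing.assert_allclose(dx, np.sin(y))
np.testing.assert_allclose(dy, x * np.cos(y))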
def test_bad_gradient_dimensionality():
    """ test that grad.ndim < len(var_shape) raises ValueError"""
    var_shape = (1, 2, 3)
    grad = np.empty((1, 2))
    with raises(ValueError):
        reduce_broadcast(grad=grad, var_shape=var_shape)