def huber_loss(x, y, *, delta=1):
    r"""
    Returns the Huber loss (smooth L1).

    Parameters
    ----------
    x : mygrad.Tensor, shape=(N, any)
        The output for each of the N pieces of data.

    y : Union[mygrad.Tensor, numpy.ndarray], shape=(N, any)
        The target for each datum.

    delta : Real > 0, optional (default=1)
        The value under which to use a squared error.

    Returns
    -------
    mygrad.Tensor, shape=()
        The average Huber loss.

    Notes
    -----
    The Huber loss is given by

    .. math::
        L_\delta(x, y) = \frac{1}{N}\sum\limits_1^N \bigl\{ \begin{array}{l l}
            \frac{(x_i - y_i)^2}{2} & |x_i - y_i| \leq \delta\\
            \delta|x_i - y_i| - \frac{\delta}{2} & |x_i - y_i| > \delta\end{array}
    """
    # NOTE(review): the textbook Huber loss uses delta**2 / 2 as the constant
    # in the linear branch; the documented form only agrees with it for
    # delta == 1 -- confirm against the HuberLoss operation.

    # `y` and `delta` are forwarded through `op_args`; `x` is the only
    # positional operand handed to `Tensor._op`.
    return Tensor._op(HuberLoss, x, op_args=(y, delta))
def softmax_focal_loss(x, y, *, alpha=1, gamma=0, constant=False):
    """
    Computes the focal loss from class scores via the ``SoftmaxFocalLoss``
    operation.

    Parameters
    ----------
    x : mygrad.Tensor, shape=(N, C)
        The C class scores for each of the N pieces of data.

    y : array_like, shape=(N,)
        The correct class indices, in [0, C), for each datum.

    alpha : Real, optional (default=1)
        The ɑ weighting factor in the loss formulation.

    gamma : Real, optional (default=0)
        The ɣ focusing parameter. Note that for Ɣ=0 and ɑ=1, this is
        cross-entropy loss.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor, shape=(N,)
        The per-datum focal loss.
    """
    # the targets and both loss hyper-parameters travel together as `op_args`
    loss_args = (y, alpha, gamma)
    return Tensor._op(SoftmaxFocalLoss, x, op_args=loss_args, constant=constant)
def batchnorm(x, *, gamma=None, beta=None, eps, constant=False):
    """
    Performs batch normalization on ``x``::

                 y(x) = (x - E[x]) / sqrt(Var[x] + eps)
         batchnorm(x) = gamma * y(x) + beta

    Where :math:`E[x]` and :math:`Var[x]` represent the mean and variance,
    respectively, over axis-1 of ``x``. The subsequent affine transformation
    on ``y`` is optional.

    Parameters
    ----------
    x : array_like, shape=(N, C, ...)
        The batch to be normalized within each entry of C

    gamma : Optional[array_like], shape=(C,)
        Optional per-channel scaling factors to be applied after the
        normalization step.

    beta : Optional[array_like], shape=(C,)
        Optional per-channel scaling bias factors to be applied after the
        normalization step.

    eps : Real
        A small non-negative number.

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    mygrad.Tensor
        The batch-normalized data.

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet import batchnorm
    >>> x = mg.Tensor([1., 4., 1.]).reshape(3, 1)
    >>> batchnorm(x, eps=0)
    Tensor([[-0.70710678],
            [ 1.41421356],
            [-0.70710678]])
    """
    # an omitted affine parameter is represented by an empty array
    gamma = np.array([]) if gamma is None else gamma
    beta = np.array([]) if beta is None else beta
    return Tensor._op(
        BatchNorm, x, gamma, beta, op_kwargs={"eps": eps}, constant=constant
    )
def focal_loss(class_probs, targets, *, alpha=1, gamma=0, constant=False):
    r"""
    Return the per-datum focal loss.

    Parameters
    ----------
    class_probs : mygrad.Tensor, shape=(N, C)
        The C class probabilities for each of the N pieces of data.
        Each value is expected to lie on (0, 1]

    targets : Sequence[int], shape=(N,)
        The correct class indices, in [0, C), for each datum.

    alpha : Real, optional (default=1)
        The ɑ weighting factor in the loss formulation.

    gamma : Real, optional (default=0)
        The ɣ focusing parameter. Note that for Ɣ=0 and ɑ=1, this is
        cross-entropy loss. Must be a non-negative value.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor, shape=(N,)
        The per-datum focal loss.

    Raises
    ------
    ValueError
        If ``gamma`` is not a real number or is negative.

    Notes
    -----
    The formulation for the focal loss introduced in
    https://arxiv.org/abs/1708.02002. It is given by -ɑ(1-p)ˠlog(p).

    The focal loss for datum-:math:`i` is given by

    .. math::
        -\alpha \hat{y}_i(1-p_i)^\gamma\log(p_i)

    where :math:`\hat{y}_i` is one in correspondence to the label associated
    with the datum and 0 elsewhere. That is, if the label :math:`y_k` is 2 and
    there are four possible label values, then :math:`\hat{y}_k = (0, 0, 1, 0)`.

    It is recommended in the paper that you normalize by the number of
    foreground samples.
    """
    # the comparison is only attempted once `gamma` is known to be a Real
    gamma_is_valid = isinstance(gamma, Real) and not (gamma < 0)
    if not gamma_is_valid:
        raise ValueError(f"`gamma` must be a non-negative number, got: {gamma}")

    loss_args = (targets, alpha, gamma)
    return Tensor._op(FocalLoss, class_probs, op_args=loss_args, constant=constant)
def softmax_focal_loss(scores, targets, *, alpha=1, gamma=0, constant=False):
    r"""
    Applies the softmax normalization to the input scores before computing the
    per-datum focal loss.

    Parameters
    ----------
    scores : mygrad.Tensor, shape=(N, C)
        The C class scores for each of the N pieces of data.

    targets : array_like, shape=(N,)
        The correct class indices, in [0, C), for each datum.

    alpha : Real, optional (default=1)
        The ɑ weighting factor in the loss formulation.

    gamma : Real, optional (default=0)
        The ɣ focusing parameter. Note that for Ɣ=0 and ɑ=1, this is
        cross-entropy loss.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor, shape=(N,)
        The per-datum focal loss.

    Notes
    -----
    The formulation for the focal loss introduced in
    https://arxiv.org/abs/1708.02002. It is given by -ɑ(1-p)ˠlog(p).

    The focal loss for datum-:math:`i` is given by

    .. math::
        -\alpha \hat{y}_i(1-p_i)^\gamma\log(p_i)

    where :math:`\hat{y}_i` is one in correspondence to the label associated
    with the datum and 0 elsewhere. That is, if the label :math:`y_k` is 2 and
    there are four possible label values, then :math:`\hat{y}_k = (0, 0, 1, 0)`.

    It is recommended in the paper that you normalize by the number of
    foreground samples.
    """
    # everything except the scores is forwarded to the op through `op_args`
    loss_args = (targets, alpha, gamma)
    return Tensor._op(SoftmaxFocalLoss, scores, op_args=loss_args, constant=constant)
def elu(x, alpha, constant=False):
    """
    Returns the exponential linear activation (ELU) elementwise along x.

    The ELU is given by `ɑ(exp(x) - 1) for x < 0 and x for x ≥ 0`.

    Parameters
    ----------
    x : mygrad.Tensor
        Input data.

    alpha : Real
        The multiplicative factor on the negative activation. A NumPy array
        or Tensor is reduced to a Python scalar with ``.item()``.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor
        The ELU function applied to `x` elementwise.

    Raises
    ------
    TypeError
        If ``alpha`` is not a real-valued scalar.

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet.activations import elu
    >>> x = mg.arange(-5, 6)
    >>> x
    Tensor([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])
    >>> y = elu(x, alpha=0.1); y
    Tensor([-0.09932621, -0.09816844, -0.09502129, -0.08646647, -0.06321206,
             0.        ,  1.        ,  2.        ,  3.        ,  4.        ,
             5.        ])
    >>> y.backward()
    >>> x.grad
    array([6.73794700e-04, 1.83156389e-03, 4.97870684e-03, 1.35335283e-02,
           3.67879441e-02, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
           1.00000000e+00, 1.00000000e+00, 1.00000000e+00])
    """
    # unwrap array/tensor-valued `alpha` into a plain Python scalar
    if isinstance(alpha, (np.ndarray, Tensor)):
        alpha = alpha.item()

    if isinstance(alpha, Real):
        return Tensor._op(ELU, x, op_args=(alpha,), constant=constant)

    raise TypeError(
        f"`alpha` must be a real-valued scalar, got {alpha} (type {type(alpha)})"
    )
def elu(x, alpha):
    """
    Returns the exponential linear activation (ELU) elementwise along x.

    The ELU is given by ɑ(exp(x) - 1) for x < 0 and x for x ≥ 0.

    Parameters
    ----------
    x : mygrad.Tensor
        Input data.

    alpha : Real
        The multiplicative factor on the negative activation.

    Returns
    -------
    mygrad.Tensor
        The ELU function applied to `x` elementwise.
    """
    # `alpha` is forwarded unchanged as the op's single extra argument
    elu_args = (alpha,)
    return Tensor._op(ELU, x, op_args=elu_args)
def softmax_focal_loss(x, y, *, alpha=1, gamma=0):
    """
    Computes the focal loss from class scores via the ``SoftmaxFocalLoss``
    operation.

    Parameters
    ----------
    x : mygrad.Tensor, shape=(N, C)
        The C class scores for each of the N pieces of data.

    y : Sequence[int], shape=(N,)
        The correct class indices, in [0, C), for each datum.

    alpha : Real, optional (default=1)
        The ɑ weighting factor in the loss formulation.

    gamma : Real, optional (default=0)
        The ɣ focusing parameter. Note that for Ɣ=0 and ɑ=1, this is
        cross-entropy loss.

    Returns
    -------
    mygrad.Tensor
        The average focal loss.
    """
    # NOTE(review): any reduction happens inside SoftmaxFocalLoss and is not
    # visible here -- confirm that the result really is averaged.
    loss_args = (y, alpha, gamma)
    return Tensor._op(SoftmaxFocalLoss, x, op_args=loss_args)
def selu(x):
    """
    Returns the scaled exponential linear activation (SELU) elementwise along x.

    The SELU is given by λɑ(exp(x) - 1) for x < 0 and λx for x ≥ 0.

    Parameters
    ----------
    x : mygrad.Tensor
        Input data.

    Returns
    -------
    mygrad.Tensor
        The SELU function applied to `x` elementwise.

    Notes
    -----
    The SELU activation was proposed in the paper
        Self-Normalizing Neural Networks
        Günter Klambauer, Thomas Unterthiner, Andreas Mayr, Sepp Hochreiter
    at https://arxiv.org/abs/1706.02515
    """
    activated = Tensor._op(SELU, x)
    return activated
def selu(x, constant=False):
    """
    Returns the scaled exponential linear activation (SELU) elementwise along x.

    The SELU is given by λɑ(exp(x) - 1) for x < 0 and λx for x ≥ 0.

    Parameters
    ----------
    x : mygrad.Tensor
        Input data.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    mygrad.Tensor
        The SELU function applied to `x` elementwise.

    Notes
    -----
    The SELU activation was proposed in the paper
        Self-Normalizing Neural Networks
        Günter Klambauer, Thomas Unterthiner, Andreas Mayr, Sepp Hochreiter
    at https://arxiv.org/abs/1706.02515

    Examples
    --------
    >>> import mygrad as mg
    >>> from mygrad.nnet.activations import selu
    >>> x = mg.arange(-5, 6)
    >>> x
    Tensor([-5, -4, -3, -2, -1,  0,  1,  2,  3,  4,  5])
    >>> y = selu(x); y
    Tensor([-1.74625336, -1.72589863, -1.67056873, -1.52016647, -1.11133074,
             0.        ,  1.05070099,  2.10140197,  3.15210296,  4.20280395,
             5.25350494])
    """
    return Tensor._op(SELU, x, constant=constant)
def batchnorm(x, *, gamma=None, beta=None, eps, constant=False):
    """
    Performs batch normalization on ``x``::

                 y(x) = (x - E[x]) / sqrt(Var[x] + eps)
         batchnorm(x) = gamma * y(x) + beta

    Where E[x] and Var[x] represent the mean and variance, respectively,
    over axis-1 of ``x``. The subsequent affine transformation on ``y``
    is optional.

    Parameters
    ----------
    x : array_like, shape=(N, C, ...)
        The batch to be normalized within each entry of C

    gamma : Optional[array_like], shape=(C,)
        The ``gamma`` factor of the affine step; replaced by an empty array
        when omitted.

    beta : Optional[array_like], shape=(C,)
        The ``beta`` term of the affine step; replaced by an empty array
        when omitted.

    eps : Real
        A small non-negative number.

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    mygrad.Tensor
        The batch-normalized data.
    """
    # omitted affine parameters are passed along as empty arrays
    scale, shift = (
        np.array([]) if param is None else param for param in (gamma, beta)
    )
    return Tensor._op(
        BatchNorm, x, scale, shift, op_kwargs={"eps": eps}, constant=constant
    )
def einsum(*operands, optimize=False, constant=False):
    r"""
    einsum(subscripts, *operands)

    Evaluates the Einstein summation convention on the operands. This implementation
    exactly mirrors that of ``numpy.einsum`` and supports back-propagation through
    all variety of tensor-products, sums, traces, and views that it can perform.

    The following docstring was adapted from the documentation for ``numpy.einsum``

    Using the Einstein summation convention, many common multi-dimensional
    array operations can be represented in a simple fashion.  This function
    provides a way to compute such summations. The best way to understand this
    function is to try the examples below, which show how many common NumPy/MyGrad
    functions can be implemented as calls to ``einsum``.

    Back-propagation via ``einsum`` is optimized such that any tensor that occurs
    redundantly within the summation will only have its gradient computed once.
    This optimization accommodates all number and combination of redundancies that can
    be encountered.

    E.g. back-propping through ``einsum('...,...->', x, x)`` will only incur a single
    computation/accumulation for ``x.grad`` rather than two. This permits users to
    leverage the efficiency of sum-reduction, where ``(x ** 2).sum()`` is sub-optimal,
    without being penalized during back-propagation.

    Parameters
    ----------
    subscripts : str
        Specifies the subscripts for summation.

    operands : array_like
        The tensors used in the summation.

    optimize : {False, True, 'greedy', 'optimal'}, optional (default=False)
        Controls if intermediate optimization should occur; also enables
        the use of BLAS where possible. This can produce significant speedups for
        computations like matrix multiplication.

        No optimization will occur if False and True will default to the 'greedy'
        algorithm. Also accepts an explicit contraction list from the
        ``np.einsum_path`` function. See ``np.einsum_path`` for more details.

    constant : bool, optional (default=False)
        If True, the resulting Tensor is a constant.

    Returns
    -------
    output : mygrad.Tensor
        The calculation based on the Einstein summation convention.

    Raises
    ------
    ValueError
        If no operands are provided.

    Notes
    -----
    The subscripts string is a comma-separated list of subscript labels,
    where each label refers to a dimension of the corresponding operand.
    Repeated subscripts labels in one operand take the diagonal.  For example,
    ``einsum('ii', a)`` is equivalent to ``np.trace(a)`` (however, the former
    supports back-propagation).

    Whenever a label is repeated, it is summed, so ``einsum('i, i', a, b)``
    is equivalent to ``np.inner(a, b)``.  If a label appears only once,
    it is not summed, so ``einsum('i', a)`` produces a view of ``a``
    with no changes.

    The order of labels in the output is by default alphabetical.  This
    means that ``np.einsum('ij', a)`` doesn't affect a 2D tensor, while
    ``einsum('ji', a)`` takes its transpose.

    The output can be controlled by specifying output subscript labels
    as well.  This specifies the label order, and allows summing to
    be disallowed or forced when desired.  The call ``einsum('i->', a)``
    is like ``np.sum(a, axis=-1)``, and ``einsum('ii->i', a)``
    is like ``np.diag(a)``.  The difference is that `einsum` does not
    allow broadcasting by default.

    To enable and control broadcasting, use an ellipsis.  Default
    NumPy-style broadcasting is done by adding an ellipsis
    to the left of each term, like ``einsum('...ii->...i', a)``.
    To take the trace along the first and last axes,
    you can do ``einsum('i...i', a)``, or to do a matrix-matrix
    product with the left-most indices instead of rightmost, you can do
    ``einsum('ij...,jk...->ik...', a, b)``.

    When there is only one operand, no axes are summed, and no output
    parameter is provided, a view into the operand is returned instead
    of a new tensor.  Thus, taking the diagonal as ``einsum('ii->i', a)``
    produces a view.

    An alternative way to provide the subscripts and operands is as
    ``einsum(op0, sublist0, op1, sublist1, ..., [sublistout])``. The examples
    below have corresponding `einsum` calls with the two parameter methods.

    Examples
    --------
    >>> import mygrad as mg
    >>> import numpy as np
    >>> a = mg.arange(25).reshape(5,5)
    >>> b = mg.arange(5)
    >>> c = mg.arange(6).reshape(2,3)

    Compute the trace of ``a``, :math:`\sum_{i}{A_{ii}} = f`:

    >>> einsum('ii', a)
    Tensor(60)
    >>> einsum(a, [0, 0])
    Tensor(60)
    >>> np.trace(a.data)
    array(60)

    Return a view along the diagonal of ``a``, :math:`A_{ii} = F_{i}`:

    >>> einsum('ii->i', a)
    Tensor([ 0,  6, 12, 18, 24])
    >>> einsum(a, [0,0], [0])
    Tensor([ 0,  6, 12, 18, 24])
    >>> np.diag(a.data)
    array([ 0,  6, 12, 18, 24])

    Compute the matrix-vector product of ``a`` with ``b``,
    :math:`\sum_{j}{A_{ij} B_{j}} = F_{i}`:

    >>> einsum('ij,j', a, b)
    Tensor([ 30,  80, 130, 180, 230])
    >>> einsum(a, [0,1], b, [1])
    Tensor([ 30,  80, 130, 180, 230])
    >>> mg.matmul(a, b)
    Tensor([ 30,  80, 130, 180, 230])
    >>> einsum('...j,j', a, b)
    Tensor([ 30,  80, 130, 180, 230])

    Take the transpose of ``c``, :math:`C_{ji} = F_{ij}`:

    >>> einsum('ji', c)
    Tensor([[0, 3],
            [1, 4],
            [2, 5]])
    >>> einsum(c, [1, 0])
    Tensor([[0, 3],
            [1, 4],
            [2, 5]])
    >>> c.T
    Tensor([[0, 3],
            [1, 4],
            [2, 5]])

    Compute ``3 * c``:

    >>> einsum('..., ...', 3, c)
    Tensor([[ 0,  3,  6],
            [ 9, 12, 15]])
    >>> einsum(',ij', 3, c)
    Tensor([[ 0,  3,  6],
            [ 9, 12, 15]])
    >>> einsum(3, [Ellipsis], c, [Ellipsis])
    Tensor([[ 0,  3,  6],
            [ 9, 12, 15]])
    >>> 3 * c
    Tensor([[ 0,  3,  6],
            [ 9, 12, 15]])

    Compute the inner product of ``b`` with itself,
    :math:`\sum_{i}{B_{i} B_{i}} = f`:

    >>> einsum('i,i', b, b)
    Tensor(30)
    >>> einsum(b, [0], b, [0])
    Tensor(30)
    >>> np.inner(b.data, b.data)
    30

    Compute the outer product of ``array([1, 2])`` with ``b``,
    :math:`A_{i}B_{j} = F_{ij}`:

    >>> einsum('i,j', np.arange(2)+1, b)
    Tensor([[0, 1, 2, 3, 4],
            [0, 2, 4, 6, 8]])
    >>> einsum(np.arange(2)+1, [0], b, [1])
    Tensor([[0, 1, 2, 3, 4],
            [0, 2, 4, 6, 8]])
    >>> np.outer(np.arange(2)+1, b)
    array([[0, 1, 2, 3, 4],
           [0, 2, 4, 6, 8]])

    Sum over the leading axis of ``a``:

    >>> einsum('i...->...', a)
    Tensor([50, 55, 60, 65, 70])
    >>> einsum(a, [0,Ellipsis], [Ellipsis])
    Tensor([50, 55, 60, 65, 70])
    >>> np.sum(a, axis=0)
    array([50, 55, 60, 65, 70])

    Compute the tensor product :math:`\sum_{ij}{A_{ijk} B_{jil}} = F_{kl}`

    >>> a = mg.arange(60.).reshape(3,4,5)
    >>> b = mg.arange(24.).reshape(4,3,2)
    >>> einsum('ijk,jil->kl', a, b)
    Tensor([[ 4400.,  4730.],
            [ 4532.,  4874.],
            [ 4664.,  5018.],
            [ 4796.,  5162.],
            [ 4928.,  5306.]])
    >>> einsum(a, [0,1,2], b, [1,0,3], [2,3])
    Tensor([[ 4400.,  4730.],
            [ 4532.,  4874.],
            [ 4664.,  5018.],
            [ 4796.,  5162.],
            [ 4928.,  5306.]])
    >>> np.tensordot(a,b, axes=([1,0],[0,1]))
    array([[ 4400.,  4730.],
           [ 4532.,  4874.],
           [ 4664.,  5018.],
           [ 4796.,  5162.],
           [ 4928.,  5306.]])

    Matrix multiply ``a.T`` with ``b.T``, :math:`\sum_{k}{A_{ki} B_{jk}} = F_{ij}`

    >>> a = mg.arange(6).reshape((3,2))
    >>> b = mg.arange(12).reshape((4,3))
    >>> einsum('ki,jk->ij', a, b)
    Tensor([[10, 28, 46, 64],
            [13, 40, 67, 94]])
    >>> einsum('ki,...k->i...', a, b)
    Tensor([[10, 28, 46, 64],
            [13, 40, 67, 94]])
    >>> einsum('k...,jk', a, b)
    Tensor([[10, 28, 46, 64],
            [13, 40, 67, 94]])

    Make an assignment to a view along the diagonal of ``a``:

    >>> a = mg.zeros((3, 3))
    >>> einsum('ii->i', a).data[:] = 1
    >>> a
    Tensor([[ 1.,  0.,  0.],
            [ 0.,  1.,  0.],
            [ 0.,  0.,  1.]])
    """
    # TODO: normalize error handling for invalid inputs
    operands = list(operands)
    if not operands:
        # fail with a clear message instead of an IndexError on `operands[0]`
        raise ValueError(
            "einsum requires a subscripts string and at least one operand, "
            "or at least one operand and its subscripts list"
        )

    if isinstance(operands[0], str):
        # operands form: "ijk, ijk", x, y
        tensor_slots = slice(1, None)
    else:
        # operands form: op0, sublist0, op1, sublist1, ..., [sublistout]
        # an odd length means the trailing `sublistout` is present; skip it
        stop = -1 if len(operands) % 2 else None
        tensor_slots = slice(None, stop, 2)

    # the op itself receives the caller's objects (Tensors included) ...
    variables = operands[tensor_slots]
    # ... while the label parser sees each Tensor replaced by its `.data`
    operands[tensor_slots] = [
        var.data if isinstance(var, Tensor) else var for var in variables
    ]

    in_lbls, out_lbls, _ = _parse_einsum_input(operands)
    return Tensor._op(
        EinSum,
        *variables,
        op_kwargs=dict(in_lbls=in_lbls, out_lbls=out_lbls, optimize=optimize),
        constant=constant,
    )
def matmul(a, b, constant=False):
    r"""
    Matrix product of two tensors:

    ``matmul(x, y)`` is equivalent to ``x @ y``.

    This documentation was adapted from ``numpy.matmul``

    The behavior depends on the arguments in the following way.

    - If both arguments are 2-D they are multiplied like conventional
      matrices.
    - If either argument is N-D, N > 2, it is treated as a stack of
      matrices residing in the last two indexes and broadcast accordingly.
    - If the first argument is 1-D, it is promoted to a matrix by
      prepending a 1 to its dimensions. After matrix multiplication
      the prepended 1 is removed.
    - If the second argument is 1-D, it is promoted to a matrix by
      appending a 1 to its dimensions. After matrix multiplication
      the appended 1 is removed.

    Multiplication by a scalar is not allowed, use ``*`` instead. Note that
    multiplying a stack of matrices with a vector will result in a stack of
    vectors, but matmul will not recognize it as such.

    ``matmul`` differs from ``numpy.dot`` in two important ways.

    - Multiplication by scalars is not allowed.
    - Stacks of matrices are broadcast together as if the matrices
      were elements.

    Parameters
    ----------
    a : array_like

    b : array_like

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    output : mygrad.Tensor
        Returns the matrix product of `a` and `b`.  If `a` and `b` are both
        1-D arrays then a scalar is returned; otherwise an array is
        returned.

    Raises
    ------
    ValueError
        If the last dimension of `a` is not the same size as
        the second-to-last dimension of `b`.

        If scalar value is passed.

    Notes
    -----
    The matmul function implements the semantics of the `@` operator introduced
    in Python 3.5 following PEP465.

    Examples
    --------
    For two 2D tensors, ``matmul(a, b)`` is the matrix product
    :math:`\sum_{j}{A_{ij} B_{jk}} = F_{ik}`:

    >>> import mygrad as mg
    >>> a = [[1, 0], [0, 1]]
    >>> b = [[4, 1], [2, 2]]
    >>> mg.matmul(a, b)
    Tensor([[4, 1],
            [2, 2]])

    For 2-D mixed with 1-D, the result is the matrix-vector product,
    :math:`\sum_{j}{A_{ij} B_{j}} = F_{i}`:

    >>> a = [[1, 0], [0, 1]]
    >>> b = [1, 2]
    >>> mg.matmul(a, b)
    Tensor([1, 2])

    Broadcasting is conventional for stacks of arrays. Here ``a`` is treated
    like a stack of three 5x6 matrices, and the 6x4 matrix ``b`` is broadcast
    matrix-multiplied against each one. This produces a shape-(3, 5, 4) tensor
    as a result.

    >>> a = mg.arange(3*5*6).reshape((3,5,6))
    >>> b = mg.arange(6*4).reshape((6,4))
    >>> mg.matmul(a,b).shape
    (3, 5, 4)

    Scalar multiplication raises an error.

    >>> mg.matmul(a, 3)
    Traceback (most recent call last):
    ...
    ValueError: Scalar operands are not allowed, use '*' instead"""
    # both operands are handed to the op positionally, left factor first
    factors = (a, b)
    return Tensor._op(MatMul, *factors, constant=constant)
def old_op(a):
    """Apply ``OldOperation`` to ``a`` through ``Tensor._op`` and return the result."""
    result = Tensor._op(OldOperation, a)
    return result
def where(condition, x=not_set, y=not_set, constant=False):
    """
    where(condition, [x, y])

    Return elements chosen from `x` or `y` depending on `condition`.

    .. note::
        When only ``condition`` is provided, this function is a shorthand for
        ``np.asarray(condition).nonzero()``. The rest of this documentation
        covers only the case where all three arguments are provided.

    This docstring was adapted from that of ``numpy.where``.

    Parameters
    ----------
    condition : array_like, bool
        Where True, yield `x`, otherwise yield ``y``. ``x``, ``y`` and
        `condition` need to be broadcastable to some shape.

    x : array_like
        Values from which to choose where ``condition`` is ``True``.

    y : array_like
        Values from which to choose where ``condition`` is ``False``.

    constant : bool, optional(default=False)
        If ``True``, the returned tensor is a constant (it
        does not back-propagate a gradient)

    Returns
    -------
    out : mygrad.Tensor
        A tensor with elements from `x` where `condition` is True, and elements
        from `y` elsewhere.

    Raises
    ------
    ValueError
        If exactly one of ``x`` and ``y`` is provided.

    Examples
    --------
    >>> import mygrad as mg
    >>> a = mg.arange(10)
    >>> a
    Tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    >>> mg.where(a < 5, a, 10*a)
    Tensor([ 0,  1,  2,  3,  4, 50, 60, 70, 80, 90])

    This can be used on multidimensional tensors too:

    >>> mg.where([[True, False], [True, True]],
    ...          [[1, 2], [3, 4]],
    ...          [[9, 8], [7, 6]])
    Tensor([[1, 8],
            [3, 4]])

    The shapes of x, y, and the condition are broadcast together:

    >>> x, y = np.ogrid[:3, :4]
    >>> mg.where(x < y, x, 10 + y)  # both x and 10+y are broadcast
    Tensor([[10,  0,  0,  0],
            [10, 11,  1,  1],
            [10, 11, 12,  2]])

    >>> a = mg.Tensor([[0, 1, 2],
    ...                [0, 2, 4],
    ...                [0, 3, 6]])
    >>> mg.where(a < 4, a, -1)  # -1 is broadcast
    Tensor([[ 0,  1,  2],
            [ 0,  2, -1],
            [ 0,  3, -1]])
    """
    x_missing = x is not_set
    y_missing = y is not_set

    # supplying exactly one of the two value arguments is an error
    if x_missing != y_missing:
        raise ValueError("either both or neither of x and y should be given")

    # condition-only form: delegate to numpy's single-argument `where`
    if x_missing:
        return np.where(asarray(condition))

    return Tensor._op(
        Where, x, y, op_kwargs={"condition": condition}, constant=constant
    )