def tensor_matmul(t1: Tensor, t2: Tensor) -> Tensor:
    """Matrix product of two tensors with autograd support.

    If t3 = t1 @ t2 with t1 of shape (n1, m1) and t2 of shape (m1, m2),
    then t3 — and any gradient grad3 of a function wrt t3 — has shape
    (n1, m2), and the operand gradients are:
        grad1 = grad3 @ t2.T
        grad2 = t1.T @ grad3
    """
    out = t1.data @ t2.data
    deps: List[Dependency] = []

    if t1.requires_grad:
        def backward_left(grad: np.ndarray) -> np.ndarray:
            # d(t1 @ t2)/d(t1): route the incoming gradient through t2.T
            return grad @ t2.data.T

        deps.append(Dependency(t1, backward_left))

    if t2.requires_grad:
        def backward_right(grad: np.ndarray) -> np.ndarray:
            # d(t1 @ t2)/d(t2): route the incoming gradient through t1.T
            return t1.data.T @ grad

        deps.append(Dependency(t2, backward_right))

    return Tensor(out, t1.requires_grad or t2.requires_grad, deps)
def tensor_neg(t: Tensor) -> Tensor:
    """Elementwise negation: the gradient of -t is simply -grad."""
    deps = [Dependency(t, lambda g: -g)] if t.requires_grad else []
    return Tensor(-t.data, t.requires_grad, deps)
def tensor_mul(t1: Tensor, t2: Tensor) -> Tensor:
    """Elementwise product with broadcasting-aware gradients.

    y = t1 * t2, so dy/dt1 = grad * t2.data and dy/dt2 = grad * t1.data.
    Any axes that NumPy broadcasting added or stretched during the
    forward pass must be summed back out of the gradient so it matches
    the operand's original shape.

    Improvement: the un-broadcast reduction was previously duplicated
    verbatim inside both closures; it is now a single shared helper.
    """
    data = t1.data * t2.data
    requires_grad = t1.requires_grad or t2.requires_grad
    depends_on: List[Dependency] = []

    def _unbroadcast(grad: np.ndarray, shape) -> np.ndarray:
        """Reduce `grad` to `shape` by summing over broadcast axes."""
        # Sum out leading dims that broadcasting prepended.
        for _ in range(grad.ndim - len(shape)):
            grad = grad.sum(axis=0)
        # Sum across dims that were stretched from size 1.
        for i, dim in enumerate(shape):
            if dim == 1:
                grad = grad.sum(axis=i, keepdims=True)
        return grad

    if t1.requires_grad:
        def grad_fn1(grad: np.ndarray) -> np.ndarray:
            return _unbroadcast(grad * t2.data, t1.shape)

        depends_on.append(Dependency(t1, grad_fn1))

    if t2.requires_grad:
        def grad_fn2(grad: np.ndarray) -> np.ndarray:
            return _unbroadcast(grad * t1.data, t2.shape)

        depends_on.append(Dependency(t2, grad_fn2))

    return Tensor(data, requires_grad, depends_on)
def tensor_transpose(t: Tensor) -> Tensor:
    """Transpose of a tensor; transposing the incoming gradient undoes it."""
    if t.requires_grad:
        # (t.T)' routes the gradient back through another transpose.
        deps = [Dependency(t, lambda g: g.T)]
    else:
        deps = []
    return Tensor(t.data.T, t.requires_grad, deps)
def relu(tensor: Tensor) -> Tensor:
    """Rectified linear unit.

    relu(s)  = max(0, s)
    relu'(s) = 1 if s > 0 else 0

    Bug fixes versus the previous version:
    * the mask was computed from `data >= 0.0`, but data = max(s, 0)
      is always >= 0, so the gradient passed through everywhere; the
      mask must be derived from the sign of the *input* instead.
    * `np.int` was removed in NumPy 1.24; the boolean mask multiplies
      directly (it upcasts to the gradient's dtype).
    """
    data = np.maximum(tensor.data, 0)
    requires_grad = tensor.requires_grad
    if requires_grad:
        def grad_fn(grad: np.ndarray) -> np.ndarray:
            # Gradient flows only where the input was strictly positive.
            return grad * (tensor.data > 0)

        depends_on = [Dependency(tensor, grad_fn)]
    else:
        depends_on = []
    return Tensor(data, requires_grad, depends_on)
def sigmoid(tensor: Tensor) -> Tensor:
    """Logistic sigmoid.

    sigmoid(s)  = 1 / (1 + exp(-s))
    sigmoid'(s) = sigmoid(s) * (1 - sigmoid(s))
    """
    out = 1. / (1. + np.exp(-tensor.data))
    deps: list = []
    if tensor.requires_grad:
        def backward(grad: np.ndarray) -> np.ndarray:
            # Reuse the forward output: s'(x) = s(x) * (1 - s(x)).
            return grad * out * (1. - out)

        deps.append(Dependency(tensor, backward))
    return Tensor(out, tensor.requires_grad, deps)
def tanh(tensor: Tensor) -> Tensor:
    """Hyperbolic tangent.

    tanh(s)  = (exp(s) - exp(-s)) / (exp(s) + exp(-s))
    tanh'(s) = 1 - tanh(s) * tanh(s)
    """
    out = np.tanh(tensor.data)
    if tensor.requires_grad:
        def backward(grad: np.ndarray) -> np.ndarray:
            # Reuse the forward output for the derivative.
            return grad * (1 - out * out)

        deps = [Dependency(tensor, backward)]
    else:
        deps = []
    return Tensor(out, tensor.requires_grad, deps)
def tensor_sum(t: Tensor) -> Tensor:
    """Sum every element of `t` into a 0-d tensor.

    The gradient of a scalar sum wrt each input element is 1, so the
    backward pass broadcasts the incoming gradient over the input shape.
    """
    total = t.data.sum()
    if t.requires_grad:
        def backward(grad: np.ndarray) -> np.ndarray:
            return grad * np.ones_like(t.data)

        deps = [Dependency(t, grad_fn=backward)]
    else:
        deps = []
    return Tensor(total, t.requires_grad, deps)
def tensor_slice(t: Tensor, idxs) -> Tensor:
    """Index/slice `t` with `idxs` (any NumPy-style index).

    Backward: the incoming gradient is scattered into a zero array of
    the *input's* shape at the sliced positions.

    Bug fix: the gradient buffer was allocated with the shape of the
    sliced output (`np.zeros_like(data)`) instead of the original input
    (`np.zeros_like(t.data)`), which raises or silently corrupts the
    backward pass whenever the slice changes the shape.
    """
    data = t.data[idxs]
    requires_grad = t.requires_grad
    if requires_grad:
        def grad_fn(grad: np.ndarray) -> np.ndarray:
            # Zeros everywhere except the positions that were selected.
            bigger_grad = np.zeros_like(t.data)
            bigger_grad[idxs] = grad
            return bigger_grad

        depends_on = [Dependency(t, grad_fn)]
    else:
        depends_on = []
    return Tensor(data, requires_grad, depends_on)