import typing as t

import numpy as np
from scipy import special

import tor4

# NOTE: `Tensor`, `Array` and `_pair` are assumed to be provided by the
# surrounding tor4 package; their exact import paths are not shown in this
# excerpt.


def binary_cross_entropy_with_logits(input: Tensor, target: Tensor) -> Tensor:
    # Numerically stable form: mean(x - x*y + log(1 + exp(-x))).
    log1pexp = np.logaddexp(np.zeros_like(input.data), -input.data)
    data = (input.data - input.data * target.data + log1pexp).mean()
    depends_on = []
    if not Tensor.no_grad and input.requires_grad:

        def grad_fn_left(grad: Array) -> Array:
            # d loss / d input = (sigmoid(input) - target) / N
            adjoint = (special.expit(input.data) - target.data) / np.prod(
                input.shape)
            return grad * adjoint

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn_left))
    if not Tensor.no_grad and target.requires_grad:

        def grad_fn_right(grad: Array) -> Array:
            # d loss / d target = -input / N
            adjoint = -input.data / np.prod(input.shape)
            return grad * adjoint

        depends_on.append(tor4.Dependancy(tensor=target, grad_fn=grad_fn_right))
    requires_grad = not Tensor.no_grad and (input.requires_grad
                                            or target.requires_grad)
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
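# A minimal sanity sketch in plain NumPy/SciPy, independent of the Tensor
# class (the helper name below is hypothetical, not part of the library):
# the analytic adjoint (sigmoid(x) - y) / N used above should agree with a
# central finite difference of mean(x - x*y + log(1 + exp(-x))).
def _bce_logits_fd_check() -> None:
    rng = np.random.default_rng(0)
    x = rng.normal(size=(4, 3))
    y = rng.integers(0, 2, size=(4, 3)).astype(float)

    def f(z):
        return (z - z * y + np.logaddexp(0.0, -z)).mean()

    eps, fd = 1e-6, np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        e = np.zeros_like(x)
        e[idx] = eps
        fd[idx] = (f(x + e) - f(x - e)) / (2 * eps)
    assert np.allclose(fd, (special.expit(x) - y) / x.size, atol=1e-6)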
def mse_loss(input: Tensor, target: Tensor) -> Tensor:
    diff = input.data - target.data
    data = (diff**2).mean()
    depends_on = []
    if not Tensor.no_grad and input.requires_grad:

        def grad_fn_left(grad: Array) -> Array:
            adjoint = 2 * diff / np.prod(diff.shape)
            return grad * adjoint

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn_left))
    if not Tensor.no_grad and target.requires_grad:

        def grad_fn_right(grad: Array) -> Array:
            adjoint = -2 * diff / np.prod(diff.shape)
            return grad * adjoint

        depends_on.append(tor4.Dependancy(tensor=target, grad_fn=grad_fn_right))
    requires_grad = not Tensor.no_grad and (input.requires_grad
                                            or target.requires_grad)
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
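# A tiny worked example (plain NumPy; the helper name is hypothetical):
# for x = [1, 3] and y = [0, 1], mse = (1^2 + 2^2) / 2 = 2.5 and
# d mse / dx = 2 * (x - y) / N = [1, 2], which is exactly the adjoint
# built in grad_fn_left above.
def _mse_grad_example() -> None:
    x, y = np.array([1.0, 3.0]), np.array([0.0, 1.0])
    assert np.isclose(((x - y)**2).mean(), 2.5)
    assert np.allclose(2 * (x - y) / x.size, [1.0, 2.0])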
def log_softmax(input: Tensor, dim: int) -> Tensor:
    data = input.data - special.logsumexp(input.data, axis=dim, keepdims=True)
    requires_grad = not Tensor.no_grad and input.requires_grad
    depends_on = []
    if requires_grad:

        def grad_fn(grad: Array) -> Array:
            # Build the dense Jacobian J[i, j] = delta_ij - softmax_j along
            # `dim`, then apply the vector-Jacobian product J^T @ grad.
            data_softmax = special.softmax(input.data, axis=dim)
            data_softmax_t = np.swapaxes(data_softmax, dim, -1)
            adjoint = -np.repeat(np.expand_dims(data_softmax_t, -2),
                                 data_softmax_t.shape[-1],
                                 axis=-2)
            bs = adjoint.shape[:-2] if data.ndim > 1 else [1]
            step = adjoint.shape[-1] + 1
            # add 1 on the diagonal of each trailing square matrix
            adjoint.reshape(*bs, -1)[..., ::step] += 1
            grad_t = np.swapaxes(grad, dim, -1)
            if data.ndim > 1:
                grad_t = grad_t[..., None]
            out = np.swapaxes(adjoint, -1, -2) @ grad_t
            if data.ndim > 1:
                out = out.squeeze(-1)
            return np.swapaxes(out, dim, -1)

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn))
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
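# A hedged check (plain NumPy/SciPy; helper name hypothetical): the dense
# Jacobian J[i, j] = delta_ij - softmax_j assembled in grad_fn collapses to
# the closed form grad - softmax * grad.sum(), which is cheaper but gives
# the same vector-Jacobian product.
def _log_softmax_vjp_check() -> None:
    rng = np.random.default_rng(1)
    x, g = rng.normal(size=(2, 5)), rng.normal(size=(2, 5))
    s = special.softmax(x, axis=-1)
    jac = np.eye(5)[None] - s[:, None, :]    # [b, i, j] = delta_ij - s_j
    dense = np.einsum("bij,bi->bj", jac, g)  # J^T @ g, summing over i
    closed = g - s * g.sum(axis=-1, keepdims=True)
    assert np.allclose(dense, closed)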
def cross_entropy(input: Tensor, target: Tensor) -> Tensor:
    # `target` holds integer class indices; it is not differentiated.
    target_exp = np.expand_dims(target.data, 1)
    data = np.take_along_axis(input.data, target_exp,
                              axis=1) - special.logsumexp(
                                  input.data, axis=1, keepdims=True)
    data = -data.mean()
    depends_on = []
    if not Tensor.no_grad and input.requires_grad:

        def grad_fn_left(grad: Array) -> Array:
            # d loss / d input = (softmax(input) - one_hot(target)) / batch
            adjoint = np.zeros_like(input.data)
            np.put_along_axis(adjoint, target_exp, -1, axis=1)
            adjoint += special.softmax(input.data, axis=1)
            adjoint /= np.prod(input.shape) / input.shape[1]
            return grad * adjoint

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn_left))
    requires_grad = not Tensor.no_grad and (input.requires_grad
                                            or target.requires_grad)
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
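# A hedged finite-difference sketch (plain NumPy/SciPy; helper name
# hypothetical): the adjoint (softmax - one_hot) / batch used in
# grad_fn_left should match numerical differentiation of the loss itself.
def _cross_entropy_fd_check() -> None:
    rng = np.random.default_rng(2)
    x = rng.normal(size=(4, 3))
    y = rng.integers(0, 3, size=(4, 1))

    def f(z):
        return -(np.take_along_axis(z, y, axis=1)
                 - special.logsumexp(z, axis=1, keepdims=True)).mean()

    eps, fd = 1e-6, np.zeros_like(x)
    for idx in np.ndindex(*x.shape):
        e = np.zeros_like(x)
        e[idx] = eps
        fd[idx] = (f(x + e) - f(x - e)) / (2 * eps)
    analytic = (special.softmax(x, axis=1) - np.eye(3)[y[:, 0]]) / 4
    assert np.allclose(fd, analytic, atol=1e-6)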
def dropout2d(input: Tensor,
              p: float = 0.5,
              training: bool = True,
              inplace: bool = False) -> Tensor:
    # `inplace` is accepted for API compatibility but not implemented.
    if training:
        # one Bernoulli draw per (sample, channel), broadcast over the
        # spatial dims, with inverted scaling 1 / (1 - p) so activations
        # keep their expected value
        size = input.shape[:2] + (1, ) * len(input.shape[2:])
        mask = np.random.binomial(1, 1 - p, size=size) / (1 - p)
        data = mask * input.data
    else:
        data = input.data
    requires_grad = not Tensor.no_grad and input.requires_grad
    depends_on = []
    if requires_grad:

        def grad_fn(grad: Array) -> Array:
            if training:
                adjoint = mask
                return grad * adjoint
            return grad

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn))
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
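# A small sketch (plain NumPy; helper name hypothetical) of what the
# channelwise mask does: every (sample, channel) plane is either zeroed
# entirely or scaled by 1 / (1 - p), never partially dropped.
def _dropout2d_mask_sketch() -> None:
    rng = np.random.default_rng(3)
    p = 0.5
    mask = rng.binomial(1, 1 - p, size=(2, 3, 1, 1)) / (1 - p)
    planes = (mask * np.ones((2, 3, 4, 4))).reshape(2 * 3, -1)
    assert np.all((planes == 0).all(axis=1)
                  | (planes == 1 / (1 - p)).all(axis=1))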
def relu(input: Tensor, inplace: bool = False) -> Tensor:
    mask = input.data > 0
    data = mask * input.data
    requires_grad = not Tensor.no_grad and input.requires_grad
    depends_on = []
    if requires_grad:

        def grad_fn(grad: Array) -> Array:
            adjoint = mask
            return grad * adjoint

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn))
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
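# A one-line illustration (plain NumPy; helper name hypothetical): the 0/1
# mask reused in grad_fn is relu's derivative, with subgradient 0 chosen at
# exactly 0.
def _relu_grad_example() -> None:
    x = np.array([-1.0, 0.0, 2.0])
    assert np.array_equal((x > 0) * x, np.array([0.0, 0.0, 2.0]))
    assert np.array_equal((x > 0).astype(x.dtype), np.array([0.0, 0.0, 1.0]))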
def conv2d(
        input: Tensor,
        weight: Tensor,
        bias: t.Optional[Tensor] = None,
        stride: t.Union[int, t.Tuple[int, int]] = 1,
        padding: t.Union[int, t.Tuple[int, int]] = 0,
        dilation: t.Union[int, t.Tuple[int, int]] = 1,
        groups: int = 1,
) -> Tensor:
    stride = _pair(stride)
    padding = _pair(padding)
    dilation = _pair(dilation)
    # _pair() has already normalized padding to a tuple, so comparing against
    # the bare int 0 could never succeed; only zero padding is supported.
    assert padding == (0, 0), "only zero padding is supported"
    # the im2col indexing below gathers channels 0..c_in/groups-1 only,
    # which is correct just for groups == 1
    assert groups == 1, "grouped convolution is not supported"

    b, c_in, h_in, w_in = input.shape
    c_out, c_in_o_groups, *kernel_size = weight.shape
    assert c_in_o_groups * groups == c_in
    h_out = int((h_in + 2 * padding[0] - dilation[0] *
                 (kernel_size[0] - 1) - 1) / stride[0] + 1)
    w_out = int((w_in + 2 * padding[1] - dilation[1] *
                 (kernel_size[1] - 1) - 1) / stride[1] + 1)
    if not (h_out > 0 and w_out > 0):
        raise RuntimeError(f"Output dimension is {h_out}x{w_out}")

    # im2col operation: build row (i), column (j) and channel (k) indices of
    # every kernel element for every output location
    i = dilation[0] * np.arange(kernel_size[0])
    i = np.repeat(i, kernel_size[1])
    i = stride[0] * np.arange(h_out)[:, None] + i
    i = np.repeat(i, w_out, 0)
    i = np.tile(i, (1, c_in_o_groups))
    j = dilation[1] * np.arange(kernel_size[1])
    j = np.tile(j, kernel_size[0])
    j = stride[1] * np.arange(w_out)[:, None] + j
    j = np.tile(j, (h_out, 1))
    j = np.tile(j, (1, c_in_o_groups))
    k = np.repeat(np.arange(c_in_o_groups), np.prod(kernel_size)).reshape(1, -1)
    n_patches = h_out * w_out
    input_matrix = input.data[:, k, i, j].reshape(
        b, n_patches, c_in_o_groups * np.prod(kernel_size))
    weight_matrix = weight.data.reshape(c_out, -1).transpose(1, 0)
    # https://github.com/numpy/numpy/issues/8957
    # data = (input_matrix @ weight_matrix).transpose(0, 2, 1).reshape(b, c_out, h_out, w_out)
    data = ((input_matrix.reshape(b * n_patches, -1) @ weight_matrix).reshape(
        b, n_patches, -1).transpose(0, 2, 1).reshape(b, c_out, h_out, w_out))
    if bias is not None:
        # per-output-channel bias, broadcast over batch and spatial dims
        data = data + bias.data.reshape(1, c_out, 1, 1)

    depends_on = []
    if not Tensor.no_grad and input.requires_grad:

        def grad_fn_left(grad: Array) -> Array:
            """grad: [b, c_out, h_out, w_out]"""
            adjoint = weight_matrix.T
            grad = grad.reshape(b, c_out, n_patches).swapaxes(-1, -2)
            N = (grad.ndim + adjoint.ndim) - input_matrix.ndim
            assert N % 2 == 0
            N //= 2
            axes = (tuple(range(-1, -N - 1, -1)), tuple(range(N)))
            out = np.tensordot(grad, adjoint, axes=axes)
            assert out.shape == (b, n_patches,
                                 c_in_o_groups * np.prod(kernel_size))
            # col2im operation: scatter-add the patch gradients back onto the
            # input positions they were gathered from
            # https://github.com/numpy/numpy/issues/5922
            # input_data = np.zeros_like(input.data)
            # np.add.at(input_data, (slice(None), k, i, j), out)
            inds = np.ravel_multi_index((k, i, j), input.shape[1:]).ravel()
            inds = np.tile(inds, (b, 1))
            inds = c_in * h_in * w_in * np.arange(b)[:, None] + inds
            input_data = np.bincount(inds.ravel(),
                                     weights=out.ravel(),
                                     minlength=np.prod(input.shape))
            return input_data.reshape(b, c_in, h_in, w_in)

        depends_on.append(tor4.Dependancy(tensor=input, grad_fn=grad_fn_left))
    if not Tensor.no_grad and weight.requires_grad:

        def grad_fn_right(grad: Array) -> Array:
            adjoint = input_matrix.T
            grad = grad.reshape(b, c_out, n_patches).swapaxes(-1, -2)
            N = (grad.ndim + adjoint.ndim) - weight_matrix.ndim
            assert N % 2 == 0
            N //= 2
            axes = (tuple(range(-1, -N - 1, -1)), tuple(range(N)))
            out = np.tensordot(adjoint, grad, axes=axes)
            return out.transpose(1, 0).reshape(c_out, c_in_o_groups,
                                               *kernel_size)

        depends_on.append(tor4.Dependancy(tensor=weight, grad_fn=grad_fn_right))
    if bias is not None and not Tensor.no_grad and bias.requires_grad:

        def grad_fn_bias(grad: Array) -> Array:
            # bias is broadcast over batch and spatial dims, so its gradient
            # is the sum over those dims
            return grad.sum(axis=(0, 2, 3))

        depends_on.append(tor4.Dependancy(tensor=bias, grad_fn=grad_fn_bias))
    requires_grad = not Tensor.no_grad and (
        input.requires_grad or weight.requires_grad
        or (bias is not None and bias.requires_grad))
    return Tensor(data=data, depends_on=depends_on, requires_grad=requires_grad)
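# A hedged end-to-end sketch: the im2col forward above should agree with a
# naive quadruple-loop cross-correlation (stride 1, no padding). The helper
# name is hypothetical, and it assumes Tensor can be constructed with the
# same keyword arguments used throughout this file.
def _conv2d_naive_check() -> None:
    rng = np.random.default_rng(4)
    x = rng.normal(size=(1, 2, 4, 4))
    w = rng.normal(size=(3, 2, 2, 2))
    out = conv2d(Tensor(data=x, depends_on=[], requires_grad=False),
                 Tensor(data=w, depends_on=[], requires_grad=False))
    naive = np.zeros((1, 3, 3, 3))
    for co in range(3):
        for ii in range(3):
            for jj in range(3):
                naive[0, co, ii, jj] = (x[0, :, ii:ii + 2, jj:jj + 2]
                                        * w[co]).sum()
    assert np.allclose(out.data, naive)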