def _init_params(self):
    init_weights = WeightInitializer(str(self.act_fn), mode=self.init)

    self.X = []
    b = np.zeros((1, self.n_classes))
    W = init_weights((self.n_classes, self.n_in))

    self.parameters = {"W": W, "b": b}
    self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}

    self.derived_variables = {
        "y_pred": [],
        "target": [],
        "true_w": [],
        "true_b": [],
        "sampled_b": [],
        "sampled_w": [],
        "out_labels": [],
        "target_logits": [],
        "noise_samples": [],
        "noise_logits": [],
    }
    self.is_initialized = True
def update(self, param, param_grad, param_name, cur_loss=None):
    """
    Compute the Adam update for a given parameter.

    Parameters
    ----------
    param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of the parameter to be updated.
    param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The gradient of the loss function with respect to `param_name`.
    param_name : str
        The name of the parameter.
    cur_loss : float or None
        The training or validation loss for the current minibatch. Used for
        learning rate scheduling e.g., by
        :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is
        None.

    Returns
    -------
    updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of `param` after applying the Adam update.
    """
    C = self.cache
    H = self.hyperparameters
    d1, d2 = H["decay1"], H["decay2"]
    eps, clip_norm = H["eps"], H["clip_norm"]
    lr = self.lr_scheduler(self.cur_step, cur_loss)

    if param_name not in C:
        C[param_name] = {
            "t": 0,
            "mean": np.zeros_like(param_grad),
            "var": np.zeros_like(param_grad),
        }

    # scale gradient to avoid explosion
    t = np.inf if clip_norm is None else clip_norm
    if norm(param_grad) > t:
        param_grad = param_grad * t / norm(param_grad)

    t = C[param_name]["t"] + 1
    var = C[param_name]["var"]
    mean = C[param_name]["mean"]

    # update cache
    C[param_name]["t"] = t
    C[param_name]["var"] = d2 * var + (1 - d2) * param_grad**2
    C[param_name]["mean"] = d1 * mean + (1 - d1) * param_grad
    self.cache = C

    # calc unbiased moment estimates and Adam update
    v_hat = C[param_name]["var"] / (1 - d2**t)
    m_hat = C[param_name]["mean"] / (1 - d1**t)
    update = lr * m_hat / (np.sqrt(v_hat) + eps)
    return param - update
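# For reference, the bias-corrected update above can be exercised outside the
# optimizer class. Below is a minimal standalone sketch of the same Adam step
# on a toy quadratic; the hyperparameter values (lr, d1, d2, eps) are
# illustrative defaults, not values read from this class.
import numpy as np

lr, d1, d2, eps = 0.01, 0.9, 0.999, 1e-7
param = np.array([5.0])
mean, var = np.zeros_like(param), np.zeros_like(param)

for t in range(1, 101):
    grad = 2 * param                     # gradient of f(x) = x^2
    mean = d1 * mean + (1 - d1) * grad   # first-moment EMA
    var = d2 * var + (1 - d2) * grad**2  # second-moment EMA
    m_hat = mean / (1 - d1**t)           # bias-corrected moment estimates
    v_hat = var / (1 - d2**t)
    param = param - lr * m_hat / (np.sqrt(v_hat) + eps)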
def _M_step(self):
    C, N, X = self.C, self.N, self.X
    denoms = np.sum(self.Q, axis=0)

    # update cluster priors
    self.pi = denoms / N

    # update cluster means
    nums_mu = [np.dot(self.Q[:, c], X) for c in range(C)]
    for ix, (num, den) in enumerate(zip(nums_mu, denoms)):
        self.mu[ix, :] = num / den if den > 0 else np.zeros_like(num)

    # update cluster covariances
    for c in range(C):
        mu_c = self.mu[c, :]
        n_c = denoms[c]

        outer = np.zeros((self.d, self.d))
        for i in range(N):
            wic = self.Q[i, c]
            xi = self.X[i, :]
            outer += wic * np.outer(xi - mu_c, xi - mu_c)

        outer = outer / n_c if n_c > 0 else outer
        self.sigma[c, :, :] = outer

    assert_allclose(np.sum(self.pi), 1, err_msg="{}".format(np.sum(self.pi)))
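# The per-example outer-product loop above is easy to read but does O(N * d^2)
# Python-level work per cluster. A vectorized sketch of the same covariance
# update, assuming X is (N, d), Q is (N, C), and mu is (C, d):
import numpy as np

def m_step_covariances(X, Q, mu):
    N, d = X.shape
    C = Q.shape[1]
    denoms = Q.sum(axis=0)                      # effective cluster sizes
    sigma = np.zeros((C, d, d))
    for c in range(C):
        Xc = X - mu[c]                          # center on the cluster mean
        sigma[c] = (Q[:, c, None] * Xc).T @ Xc  # weighted scatter matrix
        if denoms[c] > 0:
            sigma[c] /= denoms[c]
    return sigma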
def _loss(self, X, target, neg_samples):
    """Actual computation of NCE loss"""
    fstr = "X must have shape (n_ex, n_c, n_in), but got {} dims instead"
    assert X.ndim == 3, fstr.format(X.ndim)

    W = self.parameters["W"]
    b = self.parameters["b"]

    # sample negative samples from the noise distribution
    if neg_samples is None:
        neg_samples = self.noise_sampler(self.num_negative_samples)
    assert len(neg_samples) == self.num_negative_samples

    # get the probability of the negative sample class and the target
    # class under the noise distribution
    p_neg_samples = self.noise_sampler.probs[neg_samples]
    p_target = np.atleast_2d(self.noise_sampler.probs[target])

    # save the noise samples for debugging
    noise_samples = (neg_samples, p_target, p_neg_samples)

    # compute the logit for the negative samples and target
    Z_target = X @ W[target].T + b[0, target]
    Z_neg = X @ W[neg_samples].T + b[0, neg_samples]

    # subtract the log probability of each label under the noise dist
    if self.subtract_log_label_prob:
        n, m = Z_target.shape[0], Z_neg.shape[0]
        Z_target[range(n), ...] -= np.log(p_target)
        Z_neg[range(m), ...] -= np.log(p_neg_samples)

    # only retain the probability of the target under its associated
    # minibatch example
    aa, _, cc = Z_target.shape
    Z_target = Z_target[range(aa), :, range(cc)][..., None]

    # p_target = (n_ex, n_c, 1)
    # p_neg = (n_ex, n_c, n_samples)
    pred_p_target = self.act_fn(Z_target)
    pred_p_neg = self.act_fn(Z_neg)

    # if we're in evaluation mode, ignore the negative samples - just
    # return the binary cross entropy on the targets
    y_pred = pred_p_target
    if self.trainable:
        # (n_ex, n_c, 1 + n_samples) (target is first column)
        y_pred = np.concatenate((y_pred, pred_p_neg), axis=-1)

    n_targets = 1
    y_true = np.zeros_like(y_pred)
    y_true[..., :n_targets] = 1

    # binary cross entropy
    eps = np.finfo(float).eps
    np.clip(y_pred, eps, 1 - eps, y_pred)
    loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss, Z_target, Z_neg, y_pred, y_true, noise_samples
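# Concretely, the target logit is treated as the positive class and each noise
# logit as a negative. A tiny worked example of the binary cross entropy over
# a (n_ex, n_c, 1 + K) prediction array, using made-up sigmoid outputs:
import numpy as np

y_pred = np.array([[[0.8, 0.3, 0.2, 0.1]]])  # target column first, then K = 3 noise columns
y_true = np.zeros_like(y_pred)
y_true[..., :1] = 1                          # only the target is a "real" sample

loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
# -log(0.8) - log(0.7) - log(0.8) - log(0.9) ~= 0.91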
def flush_gradients(self):
    """Erase all the layer's derived variables and gradients."""
    assert self.trainable, "NCELoss is frozen"

    self.X = []
    for k in self.derived_variables:
        self.derived_variables[k] = []

    for k, v in self.gradients.items():
        self.gradients[k] = np.zeros_like(v)
def grad2(self, x):
    """
    Evaluate the second derivative of the leaky ReLU function on the
    elements of input `x`.

    .. math::

        \\frac{\partial^2 \\text{LeakyReLU}}{\partial x_i^2}  =  0
    """
    return np.zeros_like(x)
def __DCT2(frame):
    """A vectorized :math:`O(N^2)` implementation of the 1D DCT-II."""
    N = len(frame)  # window length
    n = np.arange(N, dtype=float)
    k = n.reshape(-1, 1)
    # basis matrix: FC[k, n] = cos(pi * k * (2n + 1) / (2N))
    FC = np.cos(k * (2 * n + 1) * np.pi / (2 * N))
    return 2 * (FC @ frame)
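# As a sanity check, the vectorized transform should agree with the naive
# `DCT` defined below when `orthonormal=False`; a quick check, assuming both
# functions are reachable in the same scope:
import numpy as np

frame = np.random.randn(32)
assert np.allclose(__DCT2(frame), DCT(frame, orthonormal=False))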
def grad2(self, x):
    """
    Evaluate the second derivative of the hard sigmoid activation on the
    elements of input `x`.

    .. math::

        \\frac{\partial^2 \\text{HardSigmoid}}{\partial x_i^2}  =  0
    """
    return np.zeros_like(x)
def grad2(self, x):
    """
    Evaluate the second derivative of the SELU activation on the elements
    of input `x`.

    .. math::

        \\frac{\partial^2 \\text{SELU}}{\partial x_i^2}
            &=  0 \\ \\ \\ \\ &&\\text{if } x_i > 0 \\\\
            &=  \\text{scale} \\times \\alpha e^{x_i} \\ \\ \\ \\ &&\\text{otherwise}
    """
    return np.where(x > 0, np.zeros_like(x), np.exp(x) * self.alpha * self.scale)
def grad2(self, x):
    """
    Evaluate the second derivative of the ELU activation on the elements
    of input `x`.

    .. math::

        \\frac{\partial^2 \\text{ELU}}{\partial x_i^2}
            &=  0 \\ \\ \\ \\ &&\\text{if } x_i > 0 \\\\
            &=  \\alpha e^{x_i} \\ \\ \\ \\ &&\\text{otherwise}
    """
    # 0 if x >= 0 else alpha * e^(x)
    return np.where(x >= 0, np.zeros_like(x), self.alpha * np.exp(x))
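# As a sanity check, the closed-form second derivative can be compared against
# a central-difference estimate. A minimal sketch assuming a plain-function
# ELU with alpha = 1.0 (the class above stores alpha on self):
import numpy as np

alpha, h = 1.0, 1e-4
elu = lambda x: np.where(x >= 0, x, alpha * (np.exp(x) - 1))
elu_grad2 = lambda x: np.where(x >= 0, 0.0, alpha * np.exp(x))

# f''(x) ~= (f(x + h) - 2 f(x) + f(x - h)) / h^2
x = np.array([-2.0, -0.5, 0.5, 2.0])  # avoid the kink at x = 0
approx = (elu(x + h) - 2 * elu(x) + elu(x - h)) / h**2
assert np.allclose(approx, elu_grad2(x), atol=1e-4)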
def DCT(frame, orthonormal=True):
    """
    A naive :math:`O(N^2)` implementation of the 1D discrete cosine
    transform-II (DCT-II).

    Notes
    -----
    For a signal :math:`\mathbf{x} = [x_1, \ldots, x_N]` consisting of `N`
    samples, the `k` th DCT coefficient, :math:`c_k`, is

    .. math::

        c_k = 2 \sum_{n=0}^{N-1} x_n \cos(\pi k (2 n + 1) / (2 N))

    where `k` ranges from :math:`0, \ldots, N-1`.

    The DCT is highly similar to the DFT -- whereas in a DFT the basis
    functions are sinusoids, in a DCT they are restricted solely to cosines.
    A signal's DCT representation tends to have more of its energy
    concentrated in a smaller number of coefficients when compared to the
    DFT, and is thus commonly used for signal compression. [1]

    .. [1] Smoother signals can be accurately approximated using fewer DFT /
       DCT coefficients, resulting in a higher compression ratio. The DCT
       naturally yields a continuous extension at the signal boundaries due
       to its use of even basis functions (cosine). This in turn produces a
       smoother extension in comparison to the DFT, resulting in higher
       compression.

    Parameters
    ----------
    frame : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A signal frame consisting of N samples
    orthonormal : bool
        Scale to ensure the coefficient vector is orthonormal. Default is
        True.

    Returns
    -------
    dct : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The discrete cosine transform of the samples in `frame`.
    """
    N = len(frame)
    out = np.zeros_like(frame)
    for k in range(N):
        for (n, xn) in enumerate(frame):
            out[k] += xn * np.cos(np.pi * k * (2 * n + 1) / (2 * N))
        scale = np.sqrt(1 / (4 * N)) if k == 0 else np.sqrt(1 / (2 * N))
        out[k] *= 2 * scale if orthonormal else 2
    return out
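# A quick correctness check against SciPy's FFT-based implementation can be
# useful here; scipy.fft.dct with norm="ortho" applies the same scaling as the
# orthonormal=True branch above (this sketch assumes SciPy is installed):
import numpy as np
from scipy.fft import dct as scipy_dct

frame = np.random.randn(64)
assert np.allclose(DCT(frame, orthonormal=True), scipy_dct(frame, norm="ortho"))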
def update(self, param, param_grad, param_name, cur_loss=None):
    """
    Compute the AdaGrad update for a given parameter.

    Notes
    -----
    Adjusts the learning rate of each weight based on the magnitudes of its
    gradients (big gradient -> small lr, small gradient -> big lr).

    Parameters
    ----------
    param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of the parameter to be updated
    param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The gradient of the loss function with respect to `param_name`
    param_name : str
        The name of the parameter
    cur_loss : float or None
        The training or validation loss for the current minibatch. Used for
        learning rate scheduling e.g., by
        :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is
        None.

    Returns
    -------
    updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of `param` after applying the AdaGrad update
    """
    C = self.cache
    H = self.hyperparameters
    eps, clip_norm = H["eps"], H["clip_norm"]
    lr = self.lr_scheduler(self.cur_step, cur_loss)

    if param_name not in C:
        C[param_name] = np.zeros_like(param_grad)

    # scale gradient to avoid explosion
    t = np.inf if clip_norm is None else clip_norm
    if norm(param_grad) > t:
        param_grad = param_grad * t / norm(param_grad)

    C[param_name] += param_grad**2
    update = lr * param_grad / (np.sqrt(C[param_name]) + eps)
    self.cache = C
    return param - update
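# Because the squared-gradient cache only grows, each coordinate's effective
# step size lr / sqrt(cache) shrinks monotonically. A standalone sketch of the
# same rule with illustrative lr and eps (not values read from this class):
import numpy as np

lr, eps = 0.1, 1e-7
param = np.array([1.0, 1.0])
cache = np.zeros_like(param)

for _ in range(3):
    grad = np.array([1.0, 0.01])  # one large-gradient and one small-gradient coordinate
    cache += grad**2              # accumulate squared gradients
    param -= lr * grad / (np.sqrt(cache) + eps)

# after three identical steps, the effective lr is ~0.058 for the large
# coordinate vs ~5.8 for the small one: big gradient -> small lr, and vice versa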
def update(self, param, param_grad, param_name, cur_loss=None):
    """
    Compute the SGD update for a given parameter.

    Parameters
    ----------
    param : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of the parameter to be updated.
    param_grad : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The gradient of the loss function with respect to `param_name`.
    param_name : str
        The name of the parameter.
    cur_loss : float or None
        The training or validation loss for the current minibatch. Used for
        learning rate scheduling e.g., by
        :class:`~numpy_ml.neural_nets.schedulers.KingScheduler`. Default is
        None.

    Returns
    -------
    updated_params : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
        The value of `param` after applying the momentum update.
    """
    C = self.cache
    H = self.hyperparameters
    momentum, clip_norm = H["momentum"], H["clip_norm"]
    lr = self.lr_scheduler(self.cur_step, cur_loss)

    if param_name not in C:
        C[param_name] = np.zeros_like(param_grad)

    # scale gradient to avoid explosion
    t = np.inf if clip_norm is None else clip_norm
    if norm(param_grad) > t:
        param_grad = param_grad * t / norm(param_grad)

    update = momentum * C[param_name] + lr * param_grad
    self.cache[param_name] = update
    return param - update
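# With momentum, the cached update acts as an exponentially decaying velocity.
# A standalone sketch with illustrative lr and momentum values:
import numpy as np

lr, momentum = 0.1, 0.9
param = np.array([1.0])
velocity = np.zeros_like(param)

for _ in range(50):
    grad = np.array([1.0])  # constant gradient
    velocity = momentum * velocity + lr * grad
    param -= velocity

# for a constant gradient, velocity approaches lr * grad / (1 - momentum) = 1.0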
def _grad(self, X, input_idx):
    """Actual computation of gradient wrt. loss weights + input"""
    W, b = self.parameters["W"], self.parameters["b"]

    y_pred = self.derived_variables["y_pred"][input_idx]
    target = self.derived_variables["target"][input_idx]
    y_true = self.derived_variables["out_labels"][input_idx]
    Z_neg = self.derived_variables["noise_logits"][input_idx]
    Z_target = self.derived_variables["target_logits"][input_idx]
    neg_samples = self.derived_variables["noise_samples"][input_idx][0]

    # the number of target classes per minibatch example
    n_targets = 1

    # calculate the grad of the binary cross entropy wrt. the network
    # predictions
    preds, classes = y_pred.flatten(), y_true.flatten()
    dLdp_real = ((1 - classes) / (1 - preds)) - (classes / preds)
    dLdp_real = dLdp_real.reshape(*y_pred.shape)

    # partition the gradients into target and negative sample portions
    dLdy_pred_target = dLdp_real[..., :n_targets]
    dLdy_pred_neg = dLdp_real[..., n_targets:]

    # compute gradients of the loss wrt the data and noise logits
    dLdZ_target = dLdy_pred_target * self.act_fn.grad(Z_target)
    dLdZ_neg = dLdy_pred_neg * self.act_fn.grad(Z_neg)

    # compute param gradients on target + negative samples
    dB_neg = dLdZ_neg.sum(axis=(0, 1))
    dB_target = dLdZ_target.sum(axis=(1, 2))
    dW_neg = (dLdZ_neg.transpose(0, 2, 1) @ X).sum(axis=0)
    dW_target = (dLdZ_target.transpose(0, 2, 1) @ X).sum(axis=1)

    # TODO: can this be done with np.einsum instead?
    dX_target = np.vstack(
        [dLdZ_target[[ix]] @ W[[t]] for ix, t in enumerate(target)]
    )
    dX_neg = dLdZ_neg @ W[neg_samples]

    hits = list(set(target).intersection(set(neg_samples)))
    hit_ixs = [np.where(target == h)[0] for h in hits]

    # adjust param gradients if there's an accidental hit
    if len(hits) != 0:
        hit_ixs = np.concatenate(hit_ixs)
        target = np.delete(target, hit_ixs)
        dB_target = np.delete(dB_target, hit_ixs)
        dW_target = np.delete(dW_target, hit_ixs, 0)

    dX = dX_target + dX_neg

    # use np.add.at to ensure that repeated indices in the target (or
    # possibly in neg_samples if sampling is done with replacement) are
    # properly accounted for
    dB = np.zeros_like(b).flatten()
    np.add.at(dB, target, dB_target)
    np.add.at(dB, neg_samples, dB_neg)
    dB = dB.reshape(*b.shape)

    dW = np.zeros_like(W)
    np.add.at(dW, target, dW_target)
    np.add.at(dW, neg_samples, dW_neg)

    return dX, dW, dB
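# Regarding the TODO above: the per-example loop can likely be replaced with
# an einsum that pairs each example's (n_c, 1) logit gradient with its own
# target row of W. A sketch with assumed shapes, checked against the loop on
# random data (not verified against the full layer):
import numpy as np

def dx_target_loop(dLdZ_target, W, target):
    return np.vstack([dLdZ_target[[ix]] @ W[[t]] for ix, t in enumerate(target)])

def dx_target_einsum(dLdZ_target, W, target):
    return np.einsum("nco,nd->ncd", dLdZ_target, W[target])

dLdZ_target = np.random.randn(4, 3, 1)  # (n_ex, n_c, 1)
W = np.random.randn(10, 5)              # (n_classes, n_in)
target = np.array([2, 7, 2, 0])
assert np.allclose(
    dx_target_loop(dLdZ_target, W, target),
    dx_target_einsum(dLdZ_target, W, target),
)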