def svm_loss(x, y):
    """
    Computes the loss for multiclass SVM classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]
    # TODO: Support broadcast case: (X,) (X, Y)
    # shape(x) is (d0, d1)
    # shape(correct_class_scores) is (d0,)
    # margins = np.maximum(0, x - correct_class_scores + 1.0)
    margins = np.transpose(
        np.maximum(0, np.transpose(x) - np.transpose(correct_class_scores) + 1.0))
    loss = (np.sum(margins) - np.sum(margins[np.arange(N), y])) / N
    return loss

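A quick sanity check of the hinge-loss computation above, written against plain NumPy (the transpose-based broadcasting workaround targets minpy); the small score matrix and the expected value here are invented for illustration, not taken from the original code.

import numpy as np

def svm_loss_check():
    # Two examples, three classes; correct classes are 0 and 2.
    x = np.array([[3.0, 1.0, 2.0],
                  [1.0, 4.0, 2.0]])
    y = np.array([0, 2])
    correct = x[np.arange(2), y]                      # [3.0, 2.0]
    margins = np.maximum(0, x - correct[:, None] + 1.0)
    margins[np.arange(2), y] = 0
    # Example 0: max(0, 1-3+1) + max(0, 2-3+1) = 0 + 0 = 0
    # Example 1: max(0, 1-2+1) + max(0, 4-2+1) = 0 + 3 = 3
    loss = np.sum(margins) / 2                        # expected 1.5
    return loss
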
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    # probs = np.exp(x - np.max(x, axis=1, keepdims=True))  # keepdims unsupported here
    # Subtract the per-row max (expanded back to (N, 1)) for numerical stability.
    probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
    probs /= np.expand_dims(np.sum(probs, axis=1), axis=1)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx

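A generic central-difference check for the analytic gradient returned above; a minimal plain-NumPy sketch, assuming loss_fn returns a (loss, dx) pair and x is a float array (the helper name is ours, not from the original code).

import numpy as np

def softmax_grad_check(loss_fn, x, y, eps=1e-5):
    # Compare the analytic dx against a numeric central-difference gradient.
    _, dx = loss_fn(x, y)
    num = np.zeros_like(x)
    for i in range(x.size):
        idx = np.unravel_index(i, x.shape)
        x[idx] += eps
        lp, _ = loss_fn(x, y)
        x[idx] -= 2 * eps
        lm, _ = loss_fn(x, y)
        x[idx] += eps
        num[idx] = (lp - lm) / (2 * eps)
    return np.max(np.abs(num - dx))
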
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Either of the following:
      - One hot encoding of labels, of shape (N, C)
      - Label index of shape (N,), where y[i] is the label of the ith example
        (0 <= y[i] < C)

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(y.shape) == 1:
        # Convert index labels to a one-hot encoding.
        onehot_y = np.zeros([N, C])
        np.onehot_encode(y, onehot_y)
    else:
        onehot_y = y
    # Shift scores for numerical stability, then compute the mean cross-entropy.
    probs = x - np.max(x, axis=1, keepdims=True)
    loss = -np.sum(probs * onehot_y) / N
    loss += np.sum(np.log(np.sum(np.exp(probs), axis=1, keepdims=True))) / N
    return loss

def quick_grad_check(fun, arg0, extra_args=(), kwargs={}, verbose=True,
                     eps=EPS, rtol=RTOL, atol=ATOL, rs=None):
    """Checks the gradient of a function (w.r.t. its first arg) in a random direction."""
    if verbose:
        print("Checking gradient of {0} at {1}".format(fun, arg0))

    if rs is None:
        rs = nnp.random.RandomState()

    random_dir = rs.standard_normal(nnp.shape(arg0))
    random_dir = random_dir / nnp.sqrt(nnp.sum(random_dir * random_dir))

    if not extra_args == ():
        unary_fun = lambda x: fun(arg0 + x * random_dir, extra_args)
        numeric_grad = (unary_fun(eps / 2) - unary_fun(-eps / 2)) / eps
        analytic_grad = np.sum(grad(fun)(arg0, extra_args) * random_dir)
    else:
        unary_fun = lambda x: fun(arg0 + x * random_dir)
        numeric_grad = (unary_fun(eps / 2) - unary_fun(-eps / 2)) / eps
        analytic_grad = np.sum(grad(fun)(arg0) * random_dir)

    if isinstance(numeric_grad, minpy.array.Number):
        assert abs((analytic_grad - numeric_grad).get_data(None)) < atol and \
            abs((analytic_grad - numeric_grad).get_data(None)) < abs((analytic_grad * rtol).get_data(None)), \
            "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    elif isinstance(numeric_grad, minpy.array.Array):
        assert nnp.prod(nnp.shape(analytic_grad.asnumpy())[:]) == 1, \
            "Currently only support check loss"
        assert abs((analytic_grad - numeric_grad).asnumpy()) < atol and \
            abs((analytic_grad - numeric_grad).asnumpy()) < abs((analytic_grad * rtol).asnumpy()), \
            "Check failed! nd={0}, ad={1}".format(numeric_grad, analytic_grad)
    else:
        assert False

    if verbose:
        print("Gradient projection OK (numeric grad: {0}, analytic grad: {1})".format(
            numeric_grad, analytic_grad))

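A hedged usage sketch, not from the original code: it assumes a minpy-style grad, the module-level EPS/RTOL/ATOL constants referenced above, and a toy quadratic loss invented purely for illustration.

def example_quick_grad_check():
    # Toy scalar loss: L(w) = sum(w * w), whose gradient is 2 * w.
    def quadratic_loss(w):
        return np.sum(w * w)

    w0 = np.random.randn(5, 3)
    # Compares the numeric directional derivative against the projection of
    # the analytic gradient onto the same random direction.
    quick_grad_check(quadratic_loss, w0)
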
def svm_loss(x, y, mode):
    """
    Computes the loss and gradient for multiclass SVM classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C
    - mode: 'cpu' selects the NumPy-only policy; anything else prefers MXNet.

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    if mode == 'cpu':
        np.set_policy(policy.OnlyNumpyPolicy())
    else:
        np.set_policy(policy.PreferMXNetPolicy())

    N = x.shape[0]
    correct_class_scores = x[np.arange(N), y]
    # margins = np.maximum(0, x - correct_class_scores[:, np.newaxis] + 1.0)
    margins = np.maximum(0, x - np.expand_dims(correct_class_scores, axis=1) + 1.0)
    margins[np.arange(N), y] = 0
    loss = np.sum(margins) / N

    num_pos = np.sum(margins > 0, axis=1)
    dx = np.zeros_like(x)
    dx[margins > 0] = 1
    dx[np.arange(N), y] -= num_pos
    dx /= N
    return loss, dx

def loss(self, ps, as_, vs, rs, advs):
    # Clip probabilities away from 0 and 1 to keep the logs finite.
    ps = np.maximum(1.0e-5, np.minimum(1.0 - 1e-5, ps))
    policy_grad_loss = -np.sum(np.log(ps) * as_ * advs)
    vf_loss = 0.5 * np.sum((vs - rs) ** 2)
    entropy = -np.sum(ps * np.log(ps))
    loss_ = policy_grad_loss + self.config.vf_wt * vf_loss - self.config.entropy_wt * entropy
    return loss_

def _softmax_loss(self, X, y, *args):
    N = X.shape[0]
    scores = self._forward(X, *args)
    scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
    prob = scores / np.sum(scores, axis=1, keepdims=True)
    loss = np.sum(-np.log(prob[np.arange(N), y])) / float(N)
    return loss

def softmax(x, y):
    import numpy as np
    y = y.astype(int)
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    return loss

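A small illustrative check (plain NumPy, values chosen here rather than taken from the original): with identical scores every class gets probability 1/C, so the loss should equal log(C).

import numpy as np

def softmax_uniform_check():
    x = np.zeros((4, 3))            # 4 examples, 3 classes, all scores equal
    y = np.array([0, 1, 2, 1])
    loss = softmax(x, y)
    # Each probability is 1/3, so the loss is -log(1/3) = log(3) ~ 1.0986.
    assert np.isclose(loss, np.log(3))
    return loss
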
def layer_normalization(X, gamma, beta, epsilon=0.001):
    # Normalize each row (example) to zero mean and unit standard deviation.
    N, D = X.shape
    mean = np.sum(X, axis=1).reshape((N, 1)) / float(D)
    X = X - mean
    variance = np.sum(X ** 2, axis=1).reshape((N, 1)) / float(D)
    std = variance ** 0.5
    X = X / (std + epsilon)
    return X * gamma + beta

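An illustrative check with plain NumPy and made-up data (assuming np above is NumPy-compatible): with gamma=1 and beta=0 each row of the output should have roughly zero mean and unit standard deviation, up to the epsilon in the denominator.

import numpy as np

def layer_norm_check():
    X = np.random.randn(5, 8) * 3.0 + 2.0
    out = layer_normalization(X, gamma=1.0, beta=0.0)
    row_means = out.mean(axis=1)
    row_stds = out.std(axis=1)
    assert np.allclose(row_means, 0.0, atol=1e-6)
    assert np.allclose(row_stds, 1.0, atol=1e-2)   # loose tolerance due to epsilon
    return row_means, row_stds
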
def softmax_crossentropy(x, y):
    # x should be (batch, prob)
    # y should be (batch,)
    x_dev = x - np.max(x, axis=1, keepdims=True)  # minpy doesn't support x.max()
    sm = x_dev - np.log(np.sum(np.exp(x_dev), axis=1, keepdims=True))
    # Index the flattened log-softmax: row i, column y[i] lives at i * C + y[i].
    ids = np.arange(0, y.shape[0]) * x.shape[1] + y
    ce = -np.sum(sm.reshape((sm.shape[0] * sm.shape[1],))[ids]) / (1.0 * y.shape[0])
    # minpy doesn't support -1 in shape inference, hence the explicit flat shape.
    return ce

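A plain-NumPy sketch with invented data showing that the flattened-index trick above selects the same entries as two-dimensional fancy indexing.

import numpy as np

def flat_index_equivalence():
    x = np.random.randn(4, 5)
    y = np.array([1, 0, 3, 2])
    ids = np.arange(y.shape[0]) * x.shape[1] + y
    flat_pick = x.reshape(x.size)[ids]
    fancy_pick = x[np.arange(y.shape[0]), y]
    assert np.allclose(flat_pick, fancy_pick)
    return flat_pick
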
def softmax_probability(p, channel):
    N, C = p.shape
    p -= np.max(p, axis=1).reshape((N, 1))
    code = np.zeros((N, C))
    np.onehot_encode(channel, code)
    p = np.exp(p)
    selected_p = p * code
    total_p = np.sum(p, axis=1).reshape((N, 1))
    return np.sum(selected_p / total_p, axis=1)

def _loss_function(*args):
    normal_loss = model.loss(model.forward(X, 'train'), Y)
    noisy_output = model.forward(noisy_X, 'train')
    noisy_output -= np.max(noisy_output, axis=1).reshape((K, 1))
    noisy_output = np.exp(noisy_output)
    model_p_noisy_X = noisy_output / np.sum(noisy_output, axis=1).reshape((K, 1))
    kl = KL(1.0 / N_CLASSES, model_p_noisy_X)
    noisy_loss = gamma * np.sum(kl) / float(K)
    return gamma * normal_loss + (1 - gamma) * noisy_loss

def train_loss(X, y, W1, W2, b1, b2):
    l1 = affine_relu_forward(X, W1, b1)
    l2 = affine_forward(l1, W2, b2)
    scores = l2
    if y is not None:
        #[TODO]: softmax is not supported yet
        # loss, d_scores = softmax_loss(scores, y)
        loss = svm_loss(scores, y)
        loss_with_reg = loss + np.sum(W1 ** 2) * 0.5 * self.reg + np.sum(
            W2 ** 2) * 0.5 * self.reg
        return loss_with_reg
    return scores

def train_loss(X, y, W1, W2, b1, b2):
    l1, l1_cache = affine_relu_forward(X, W1, b1)
    l2, l2_cache = affine_forward(l1, W2, b2)
    scores = l2
    if y is None:
        return scores
    loss, d_scores = softmax_loss(scores, y)
    loss += np.sum(W1 ** 2) * 0.5 * self.reg
    loss += np.sum(W2 ** 2) * 0.5 * self.reg
    return loss

def train_loss(X, y, W1, W2, b1, b2):
    l1 = affine_relu_forward(X, W1, b1)
    l2 = affine_forward(l1, W2, b2)
    scores = l2
    if y is None:
        return scores
    #[TODO]: softmax is not supported yet
    # loss, d_scores = softmax_loss(scores, y)
    loss = svm_loss(scores, y)
    loss_with_reg = loss + np.sum(W1 ** 2) * 0.5 * self.reg + np.sum(
        W2 ** 2) * 0.5 * self.reg
    return loss_with_reg

def train_loss(*args):
    inputs = args[0]
    softmax_label = args[1]
    probs = self.symbol_func(**self.make_mxnet_weight_dict(
        inputs, softmax_label, args[self.data_target_cnt:len(args)]))
    if softmax_label is None:
        return probs
    samples_num = X.shape[0]
    targets = np.zeros((samples_num, self.num_classes))
    targets[np.arange(samples_num), softmax_label] = 1
    loss = -np.sum(targets * np.log(probs)) / samples_num
    for i in self.get_index_reg_weight():
        loss = loss + np.sum(0.5 * args[i] ** 2 * self.reg)
    return loss

def check_accuracy(self, dataiter, num_samples=None):
    """
    Check accuracy of the model on the provided data.

    Inputs:
    - dataiter: data iterator that can produce batches.
    - num_samples: If not None and dataiter has more than num_samples
      datapoints, subsample the data and only test the model on num_samples
      datapoints.

    Returns:
    - acc: Scalar giving the fraction of instances that were correctly
      classified by the model.
    """
    # Maybe subsample the data
    N = dataiter.num_data
    check_dataiter = dataiter
    if num_samples is not None and N > num_samples:
        # Sample a sub iter
        check_dataiter = dataiter.getsubiter(num_samples)
    else:
        # Use the entire dataiter otherwise.
        check_dataiter.reset()

    acc_count = 0
    num_samples = 0
    for each_batch in check_dataiter:
        predict = self.model.forward_batch(each_batch, mode='test').asnumpy()
        # TODO(minjie): multiple labels.
        acc_count += np.sum(np.argmax(predict, axis=1) == each_batch.label[0])
        num_samples += check_dataiter.batch_size
    return float(acc_count.asnumpy()) / num_samples

def softmax_cross_entropy(prob, label):
    """
    Computes the cross entropy for softmax activation.

    Inputs:
    - prob: Probability, of shape (N, C) where prob[i, j] is the probability
      for the jth class for the ith input.
    - label: Either of the following:
      - One hot encoding of labels, of shape (N, C)
      - Label index of shape (N,), where label[i] is the label of the ith
        example (0 <= label[i] < C)

    Returns a Value:
    - cross_entropy
    """
    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        # Convert index labels to a one-hot encoding.
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N

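An illustrative equivalence check in plain NumPy (np.onehot_encode is minpy-specific, so the one-hot matrix is built with np.eye here; the probabilities are made up): index labels and their one-hot encoding should give the same cross entropy.

import numpy as np

def cross_entropy_label_forms():
    prob = np.array([[0.7, 0.2, 0.1],
                     [0.1, 0.8, 0.1]])
    label_idx = np.array([0, 1])
    onehot = np.eye(3)[label_idx]
    ce_onehot = -np.sum(np.log(prob) * onehot) / prob.shape[0]
    ce_index = -np.mean(np.log(prob[np.arange(2), label_idx]))
    assert np.isclose(ce_onehot, ce_index)
    return ce_onehot   # -(log 0.7 + log 0.8) / 2 ~ 0.2899
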
def gan_loss(*args):
    p_X = dmodel.forward(X, 'train')
    random_X = gmodel.forward(noise, 'train')
    p_random_X = dmodel.forward(random_X, 'train')
    value = np.log(clip(p_X, lower, upper)) + np.log(clip(1 - p_random_X, lower, upper))
    loss = np.sum(value) / float(N)
    return loss

def loss(self, predict, y):
    # Add L2 regularization for all the weights.
    reg_loss = 0.0
    for name, weight in self.params.items():
        reg_loss += np.sum(weight ** 2) * 0.5
    return layers.softmax_cross_entropy(predict, y) + weight_decay * reg_loss

def log_likelihood(weights, inputs, targets):
    logprobs = outputs(weights, inputs)
    loglik = 0.0
    num_time_steps, num_examples, _ = inputs.shape
    for t in range(num_time_steps):
        loglik += np.sum(logprobs[t] * targets[t])
    return loglik / (num_time_steps * num_examples)

def accuracy(p, l):
    import minpy.numpy as np
    if len(l.shape) == 1:
        # Classification: fraction of matching predictions.
        return 1 - np.count_nonzero(p - l).val / float(p.shape[0])
    else:
        # Regression: mean squared error per example.
        inputs, labels = p, l
        return np.mean(np.sum((inputs - labels) ** 2, axis=1))

def getLaplacian(self, W):
    # D is the diagonal degree matrix; L = D - W is the graph Laplacian.
    D = np.zeros((W.shape[0], W.shape[1]))
    for i in range(W.shape[1]):
        D[i][i] = np.sum(W[:, i])
    L = D - W
    return [D, L]

def affine_backward(dout, cache):
    """
    Computes the backward pass for an affine layer.

    Inputs:
    - dout: Upstream derivative, of shape (N, M)
    - cache: Tuple of:
      - x: Input data, of shape (N, d_1, ... d_k)
      - w: Weights, of shape (D, M)
      - b: Biases, of shape (M,)

    Returns a tuple of:
    - dx: Gradient with respect to x, of shape (N, d1, ..., d_k)
    - dw: Gradient with respect to w, of shape (D, M)
    - db: Gradient with respect to b, of shape (M,)
    """
    x, w, b = cache
    x_plain = np.reshape(x, (x.shape[0], -1))
    db = np.sum(dout, axis=0)
    dx_plain = np.dot(dout, np.transpose(w))
    dx = np.reshape(dx_plain, x.shape)
    dw = np.dot(np.transpose(x_plain), dout)
    return dx, dw, db

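A quick shape check in plain NumPy with made-up dimensions: the gradients coming out of affine_backward should match the shapes of the cached x, w, and b.

import numpy as np

def affine_backward_shape_check():
    N, d1, d2, M = 2, 3, 4, 5
    x = np.random.randn(N, d1, d2)           # flattens to D = d1 * d2 = 12
    w = np.random.randn(d1 * d2, M)
    b = np.random.randn(M)
    dout = np.random.randn(N, M)
    dx, dw, db = affine_backward(dout, (x, w, b))
    assert dx.shape == x.shape
    assert dw.shape == w.shape
    assert db.shape == b.shape
    return dx, dw, db
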
def forward(self, X):
    a = np.dot(self.params['fc1'], X.T)
    h = np.maximum(0, a)                      # ReLU
    logits = np.dot(h.T, self.params['policy_fc_last'].T)
    ps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    ps /= np.sum(ps, axis=1, keepdims=True)   # softmax over actions
    vs = np.dot(h.T, self.params['vf_fc_last'].T) + self.params['vf_fc_last_bias']
    return ps, vs

def probs(scores):
    """
    Calculates the probabilities out of a neural network's class scores.

    :param scores: The score matrix of form (N x K), where N is the number of
        observations and K is the number of classes.
    :return: The probabilities, of the same form as the input scores.
    """
    exp_scores = np.exp(scores)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

def blob_normalization(X, settings, gamma, beta, mode='train', epsilon=1e-5,
                       momentum=0.9, running_mean=None, running_variance=None):
    N, D = map(int, X.shape)
    size = N * D
    if running_mean is None:
        running_mean = np.zeros(1)
    if running_variance is None:
        running_variance = np.zeros(1)

    if mode == 'train':
        if 'shared_mean' in settings:
            mean = np.sum(X) / size
        else:
            mean = np.sum(X, axis=0) / N
            mean = np.reshape(mean, (1, D))
        centered_X = X - mean

        if 'shared_deviation' in settings:
            variance = np.sum(centered_X ** 2) / size
        else:
            variance = np.sum(centered_X ** 2, axis=0) / N
            variance = np.reshape(variance, (1, D))
        deviation = variance ** 0.5

        rescaled_X = centered_X / deviation
        out = gamma * rescaled_X + beta
        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_variance = momentum * running_variance + (1.0 - momentum) * variance
    elif mode == 'test':
        X_hat = (X - running_mean) / np.sqrt(running_variance + epsilon)
        out = gamma * X_hat + beta

    return out, running_mean, running_variance

def softmax_crossentropy(x, y):
    EPSI = 1e-6
    batch_size, seq_len, prob_dim = x.shape
    # Flatten (batch, seq_len, prob_dim) -> (batch * seq_len, prob_dim)
    # and (batch, seq_len) -> (batch * seq_len,).
    x = x.reshape((x.shape[0] * x.shape[1], x.shape[2]))
    y = y.reshape((y.shape[0] * y.shape[1],))
    # x should be (batch, prob)
    # y should be (batch,)
    x_dev = x - np.max(x, axis=1, keepdims=True)  # minpy doesn't support x.max()
    sm = x_dev - np.log(EPSI + np.sum(np.exp(x_dev), axis=1, keepdims=True))
    # Flat index of element (i, y[i]) in a row-major (rows, prob_dim) matrix.
    ids = np.arange(0, y.shape[0]) * prob_dim + y
    ce = -np.sum(sm.reshape((sm.shape[0] * sm.shape[1],))[ids]) / (1.0 * y.shape[0])
    # minpy doesn't support -1 in shape inference, hence the explicit flat shapes.
    return ce

def loss(caffe_layer_specs, X, T):
    # original code:
    # log_prior = -L2_reg * np.dot(W_vect, W_vect)
    log_prior = 0
    for caffe_layer in caffe_layer_specs:
        log_prior += -L2_reg * np.dot(caffe_layer.get_learnable_params()[0],
                                      caffe_layer.get_learnable_params()[0])
    log_lik = np.sum(predictions(caffe_layer_specs, X) * T)
    return -log_prior - log_lik

def regularization_loss(w, lambda_):
    """
    Calculates the regularization loss of the output weights.

    Regularization loss is used to favor smaller magnitudes of weights.

    :param w: The weight matrix of the output layer of form (H x K), where H
        is the size of the previous layer and K is the number of classes.
    :param lambda_: A hyperparameter used to control the magnitude of the
        weights.
    :return: The regularization loss.
    """
    return .5 * lambda_ * np.sum(np.square(w))

def softmax_cross_entropy(prob, label):
    N = prob.shape[0]
    C = prob.shape[1]
    if len(label.shape) == 1:
        # Convert index labels to a one-hot encoding.
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return -np.sum(np.log(prob) * onehot_label) / N

def grad(g):
    import numpy as np
    y = label.astype(int)
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return dx

def trainOneEpoch(self, input, target):
    """Train network for given input and target with gradient descent."""
    w = self.weights
    loss = self.loss(input, target)
    dw = self.gradient(self.weights, input, target)
    # gradient descent
    w -= self.lr * dw
    deltaW = np.sum((self.weights - w) ** 2)  # change in weights
    self.weights = w  # update weights
    return loss, deltaW

def backpropagation(x, s, y, hidden_layers, wh, bh, w_out, b_out, alpha):
    """
    Performs the backpropagation of a neural network.

    :param x: The input data of form (N x D), where N is the number of
        observations and D is the dimensionality.
    :param s: The score matrix of form (N x K), where N is the number of
        observations and K is the number of classes.
    :param y: The ground truth labels for each observation.
    :param hidden_layers: An array containing the values of each hidden layer
        as a vector.
    :param wh: The weights of each hidden layer connection as array. Each
        weight is a matrix of form (H_i-1 x H_i), where H_i-1 is the size of
        the previous hidden layer (or the input layer) and H_i is the size of
        the corresponding hidden layer.
    :param bh: The biases of each hidden layer as array. Each bias is a vector
        of the same length as the corresponding hidden layer.
    :param w_out: The weight of the output layer as matrix of form (H x K),
        where H is the size of the last hidden layer and K is the number of
        classes.
    :param b_out: The bias of the output layer as vector of length K, where K
        is the number of classes.
    :param alpha: The factor by which negative inputs are scaled in ReLU
        activations. Set to 0 to avoid leaky ReLU.
    :return: A tuple containing the following gradients:
        * An array with the gradients for the connection weights of each
          hidden layer, of the same form as `wh`.
        * An array with the gradients for the biases of each hidden layer, of
          the same form as `bh`.
        * The gradients for the connection weights of the output layer, of the
          same form as `w_out`.
        * The gradients for the biases of the output layer, of the same form
          as `b_out`.
    """
    dscores = cross_entropy_loss_gradient(s, y)
    dw_out2 = hidden_layers[-1].T.dot(dscores)
    db_out2 = np.sum(dscores, axis=0, keepdims=True)

    dhiddens = {}
    dwh2 = [np.full(w_i.shape, .0) for w_i in wh]
    dbh2 = [np.empty(b_i.shape) for b_i in bh]
    for h in range(len(hidden_layers) - 1, -1, -1):
        if h == len(hidden_layers) - 1:
            dhidden = dscores.dot(w_out.T)
        else:
            dhidden = dhiddens[h + 1].dot(wh[h + 1].T)
        # Leaky-ReLU backward: scale the gradient by alpha where the activation
        # was negative (alpha = 0 reproduces the plain ReLU case).
        dhidden[hidden_layers[h] < 0] *= alpha
        dhiddens[h] = dhidden
        if h == 0:
            dwh2[h] = x.T.dot(dhidden)
        else:
            dwh2[h] = hidden_layers[h - 1].T.dot(dhidden)
        dbh2[h] = np.sum(dhidden, axis=0, keepdims=True)
    dw_out2 += lambda_ * w_out
    return dwh2, dbh2, dw_out2, db_out2

def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns:
    - loss: Scalar giving the loss
    """
    # TODO: Missing Max Operator
    probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
    probs = probs / np.expand_dims(np.sum(probs, axis=1), axis=1)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    return loss

def l2_loss(x, label):
    """ The Mean Square Error loss for regression. """
    N = x.shape[0]
    C = x.shape[1]
    if len(label.shape) == 1:
        # Convert index labels to a one-hot encoding.
        onehot_label = np.zeros([N, C])
        np.onehot_encode(label, onehot_label)
    else:
        onehot_label = label
    return np.sum((x - onehot_label) ** 2) / N

def temporal_softmax_loss(x, y, mask, verbose=False):
    """
    A temporal version of softmax loss for use in RNNs. We assume that we are
    making predictions over a vocabulary of size V for each timestep of a
    timeseries of length T, over a minibatch of size N. The input x gives
    scores for all vocabulary elements at all timesteps, and y gives the
    indices of the ground-truth element at each timestep. We use a
    cross-entropy loss at each timestep, summing the loss over all timesteps
    and averaging across the minibatch.

    As an additional complication, we may want to ignore the model output at
    some timesteps, since sequences of different length may have been combined
    into a minibatch and padded with NULL tokens. The optional mask argument
    tells us which elements should contribute to the loss.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the
      range 0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or
      not the scores at x[i, t] should contribute to the loss.

    Returns:
    - loss: Scalar giving the loss
    """
    N, T, V = x.shape

    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
    probs = probs / np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    return loss

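An illustrative check in plain NumPy (shapes and data invented here): timesteps whose mask entry is False should not change the loss, whatever their scores are.

import numpy as np

def temporal_mask_check():
    N, T, V = 2, 3, 4
    rng = np.random.default_rng(0)
    x = rng.normal(size=(N, T, V))
    y = rng.integers(0, V, size=(N, T))
    mask = np.array([[True, True, False],
                     [True, False, False]])
    base = temporal_softmax_loss(x, y, mask)
    # Replace the scores only at the masked timesteps; the loss must not move.
    x_perturbed = x.copy()
    x_perturbed[~mask] = rng.normal(size=((~mask).sum(), V)) * 10.0
    assert np.isclose(base, temporal_softmax_loss(x_perturbed, y, mask))
    return base
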
def check_accuracy(self, dataiter, num_samples=None):
    """
    Check accuracy of the model on the provided data.

    Parameters
    ----------
    dataiter
        data iterator that can produce batches.
    num_samples
        If not None and dataiter has more than num_samples datapoints,
        subsample the data and only test the model on num_samples datapoints.

    Returns
    -------
    acc
        Scalar giving the fraction of instances that were correctly classified
        by the model.
    """
    # Maybe subsample the data
    N = dataiter.num_data
    check_dataiter = dataiter
    if num_samples is not None and N > num_samples:
        # Sample a sub iter
        check_dataiter = dataiter.getsubiter(num_samples)
    else:
        # Use the entire dataiter otherwise.
        check_dataiter.reset()

    if self.task_type == 'classification':
        acc_count = 0
        num_samples = 0
        for each_batch in check_dataiter:
            predict = self.model.forward_batch(each_batch, mode='test').asnumpy()
            # TODO(minjie): multiple labels.
            acc_count += np.sum(np.argmax(predict, axis=1) == each_batch.label[0])
            num_samples += check_dataiter.batch_size
        return float(acc_count.asnumpy()) / num_samples
    elif self.task_type == 'regression':
        loss = 0
        batch_count = 0
        for each_batch in check_dataiter:
            predict = self.model.forward_batch(each_batch, mode='test').asnumpy()
            loss += self.model.loss(predict, each_batch.label[0])
            batch_count += 1
        return float(loss.asnumpy()) / batch_count
    else:
        raise ValueError('Task type is either classification or regression.')

def test_sum_forward():
    np_x = py_np.zeros((2, 10))
    np_w = py_np.zeros((10, 3))
    np_b = py_np.zeros(3)

    x = NumpyVarToMinpy(np_x)
    w = NumpyVarToMinpy(np_w)
    b = NumpyVarToMinpy(np_b)

    x_plain = np.reshape(x, (x.shape[0], -1))
    out0 = np.dot(x_plain, w)
    out = out0 + b
    np_out = MinpyVarToNumpy(out)

    var = py_np.random.randn(2, 3)
    tmp = NumpyVarToMinpy(var)
    sum_tmp = np.sum(tmp, axis=0)
    sum_py = MinpyVarToNumpy(sum_tmp)

def loss_theta(layer):
    pred = predict(layer, inputs)
    return -np.sum(np.log(pred) * targets)  # negative log likelihood

def loss(w, x):
    prob = predict(w, x)
    return -np.sum(np.log(prob) * t) / 10000 + 0.5 * w * w

def loss(self, predict, y):
    # Add L2 regularization for all the weights.
    reg_loss = 0.0
    for name, weight in self.params.items():
        reg_loss += np.sum(weight ** 2)
    return layers.softmax_cross_entropy(predict, y) + 0.5 * weight_decay * reg_loss

def sigmoid(x):
    return np.multiply(0.5, np.add(np.tanh(x), 1))


x = mx.sym.Variable(name='x')
fc = mx.sym.FullyConnected(name='fc', data=x)
# fc = mx.sym.FullyConnected(name='fc', data=x, num_hidden=inputs.shape[1])
act = mx.sym.Activation(data=fc, act_type='sigmoid')
f = core.function(act)


def predict(weights, inputs):
    return f(x=inputs, fc_weight=weights, ctx=mx.cpu())


def training_loss(weights, inputs):
    preds = predict(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))


xshape = (256, 500)
wshape = (500, 250)
tshape = (256, 250)
inputs = np.random.rand(*xshape) - 0.5
targets = np.random.randint(0, 2, size=tshape)
weights = np.random.rand(*wshape) - 0.5

training_gradient_fun = core.grad(training_loss)

print('Initial loss: {}'.format(training_loss(weights, inputs)))
for i in range(100):
    gr = training_gradient_fun(weights, inputs)
    # print('Training gradient: {}'.format(gr))
    weights -= gr * 0.1

def check_fn(x):
    y = x + 1
    print(mp.exp(y))
    return mp.sum(2 * y)

def check_fn(x):
    return mp.sum(x + x)

def training_loss(inputs, targets, fc_weight, fc_bias, conv_weight, conv_bias):
    preds = predict(inputs, fc_weight, fc_bias, conv_weight, conv_bias)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    return -np.sum(np.log(label_probabilities))

def training_loss(weights, inputs):
    preds = predict(weights, inputs)
    label_probabilities = preds * targets + (1 - preds) * (1 - targets)
    l = -np.sum(np.log(label_probabilities))
    return l

def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming
    data. During training we also keep an exponentially decaying running mean
    of the mean and variance of each feature, and these averages are used to
    normalize data at test-time.

    At each timestep we update the running averages for mean and variance
    using an exponential decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var = momentum * running_var + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step; the torch7
    implementation of batch normalization also uses running averages.

    Input:
    - x: Data of shape (N, D)
    - gamma: Scale parameter of shape (D,)
    - beta: Shift parameter of shape (D,)
    - bn_param: Dictionary with the following keys:
      - mode: 'train' or 'test'; required
      - eps: Constant for numeric stability
      - momentum: Constant for running mean / variance.
      - running_mean: Array of shape (D,) giving running mean of features
      - running_var: Array of shape (D,) giving running variance of features

    Returns:
    - out: of shape (N, D)
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D))
    running_var = bn_param.get('running_var', np.zeros(D))

    out = None
    if mode == 'train':
        mean = np.sum(x, axis=0) / float(N)
        x_mean = x - mean
        sqr_x_mean = x_mean ** 2
        var = np.sum(sqr_x_mean, axis=0) / float(N)
        sqrt_var = np.sqrt(var + eps)
        inv_sqrt_var = 1.0 / sqrt_var
        x_hat = x_mean * inv_sqrt_var
        out = gamma * x_hat + beta
        running_mean = momentum * running_mean + (1.0 - momentum) * mean
        running_var = momentum * running_var + (1.0 - momentum) * var
    elif mode == 'test':
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out

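An illustrative usage check with plain NumPy and random data (not from the original code): in 'train' mode with gamma=1 and beta=0, each feature column of the output should have roughly zero mean and unit variance, and the running statistics should be written back into bn_param.

import numpy as np

def batchnorm_forward_check():
    N, D = 64, 5
    x = np.random.randn(N, D) * 4.0 + 7.0
    bn_param = {'mode': 'train'}
    out = batchnorm_forward(x, gamma=np.ones(D), beta=np.zeros(D), bn_param=bn_param)
    assert np.allclose(out.mean(axis=0), 0.0, atol=1e-7)
    assert np.allclose(out.var(axis=0), 1.0, atol=1e-2)
    assert 'running_mean' in bn_param and 'running_var' in bn_param
    return out
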