def _softmax_loss(self, X, y, *args):
    N = X.shape[0]
    scores = self._forward(X, *args)
    # Shift scores by the per-row max for numerical stability, then exponentiate.
    scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
    prob = scores / np.sum(scores, axis=1, keepdims=True)
    loss = np.sum(-np.log(prob[np.arange(N), y])) / float(N)
    return loss
def softmax_loss(x, y):
    """
    Computes the loss for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Either of the following:
      - One-hot encoding of labels, of shape (N, C)
      - Label indices of shape (N,), where each y[i] is the label of the ith
        example (0 <= y[i] < C)

    Returns:
    - loss: Scalar giving the loss
    """
    N = x.shape[0]
    C = x.shape[1]
    if len(y.shape) == 1:
        # Convert label indices to a one-hot encoding (onehot_encode is minpy's).
        onehot_y = np.zeros([N, C])
        np.onehot_encode(y, onehot_y)
    else:
        onehot_y = y
    # Shift scores by the per-row max; the cross-entropy is then
    # -sum(shifted * onehot) / N plus the mean per-row log-sum-exp.
    shifted = x - np.max(x, axis=1, keepdims=True)
    loss = -np.sum(shifted * onehot_y) / N
    loss += np.sum(np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))) / N
    return loss
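# Hedged sanity check (not part of the original code): it assumes plain numpy,
# where np.onehot_encode is unavailable, so the one-hot matrix is built by
# indexing. It shows that the one-hot / log-sum-exp form used above equals the
# usual index-based cross-entropy.
import numpy as np

def _check_onehot_softmax_loss():
    rng = np.random.RandomState(0)
    N, C = 4, 5
    x = rng.randn(N, C)
    y = rng.randint(0, C, size=N)
    onehot_y = np.zeros((N, C))
    onehot_y[np.arange(N), y] = 1.0
    shifted = x - np.max(x, axis=1, keepdims=True)
    # Index-based cross-entropy.
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    direct = -np.mean(log_probs[np.arange(N), y])
    # One-hot / log-sum-exp formulation, as in softmax_loss above.
    onehot = (-np.sum(shifted * onehot_y) / N
              + np.sum(np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))) / N)
    assert abs(direct - onehot) < 1e-8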
def softmax_loss(x, y):
    """
    Computes the loss and gradient for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns a tuple of:
    - loss: Scalar giving the loss
    - dx: Gradient of the loss with respect to x
    """
    # max with keepdims was reported buggy in this environment, so expand_dims
    # keeps the per-row max broadcasting over the class dimension.
    probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
    probs /= np.expand_dims(np.sum(probs, axis=1), axis=1)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return loss, dx
from numbers import Number

def rel_error(x, y):
    """Returns relative error between scalars or arrays."""
    if isinstance(x, (int, float, Number)):
        x = float(x)
        y = float(y)
        return abs(x - y) / max(1e-8, abs(x) + abs(y))
    else:
        return np.max(np.abs(x - y) / np.maximum(1e-8, np.abs(x) + np.abs(y)))
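# A small numeric gradient check (a sketch, not from the original code). It
# assumes plain numpy, the softmax_loss variant above that returns (loss, dx),
# and rel_error; the helper name _check_softmax_grad is made up for illustration.
import numpy as np

def _check_softmax_grad(h=1e-5):
    rng = np.random.RandomState(1)
    x = rng.randn(6, 4)
    y = rng.randint(0, 4, size=6)
    _, dx = softmax_loss(x, y)
    # Centered finite differences, one score at a time.
    num_dx = np.zeros_like(x)
    it = np.nditer(x, flags=['multi_index'])
    while not it.finished:
        idx = it.multi_index
        orig = x[idx]
        x[idx] = orig + h
        lp, _ = softmax_loss(x, y)
        x[idx] = orig - h
        lm, _ = softmax_loss(x, y)
        x[idx] = orig
        num_dx[idx] = (lp - lm) / (2 * h)
        it.iternext()
    print(rel_error(dx, num_dx))  # should be on the order of 1e-8 or smaller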
def forward(self, X):
    # Shared hidden layer with ReLU activation.
    a = np.dot(self.params['fc1'], X.T)
    h = np.maximum(0, a)
    # Policy head: softmax over the action scores.
    logits = np.dot(h.T, self.params['policy_fc_last'].T)
    ps = np.exp(logits - np.max(logits, axis=1, keepdims=True))
    ps /= np.sum(ps, axis=1, keepdims=True)
    # Value head: one linear output per example.
    vs = np.dot(h.T, self.params['vf_fc_last'].T) + self.params['vf_fc_last_bias']
    return ps, vs
def softmax(x, y):
    import numpy as np
    y = y.astype(int)
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    return loss
def softmax_probability(p, channel):
    # For each row of scores p, return the softmax probability assigned to the
    # class index given in `channel` (onehot_encode is minpy's).
    N, C = p.shape
    p -= np.max(p, axis=1).reshape((N, 1))
    code = np.zeros((N, C))
    np.onehot_encode(channel, code)
    p = np.exp(p)
    selected_p = p * code
    total_p = np.sum(p, axis=1).reshape((N, 1))
    return np.sum(selected_p / total_p, axis=1)
def softmax_crossentropy(x, y):
    # x should be (batch, prob)
    # y should be (batch, )
    x_dev = x - np.max(x, axis=1, keepdims=True)  # minpy doesn't support x.max()
    sm = x_dev - np.log(np.sum(np.exp(x_dev), axis=1, keepdims=True))
    # Pick the log-probability of the correct class via flat indexing.
    ids = np.arange(0, y.shape[0]) * x.shape[1] + y
    ce = -np.sum(sm.reshape((sm.shape[0] * sm.shape[1],))[ids]) / (1.0 * y.shape[0])
    # minpy doesn't support -1 in shape inference
    return ce
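# Why the flat-index trick works (an illustrative sketch, assuming plain numpy):
# after flattening an (N, C) array row-major, element (i, y[i]) sits at index
# i * C + y[i], so sm.reshape(N * C)[ids] equals sm[np.arange(N), y].
import numpy as np

sm = np.arange(12.0).reshape(3, 4)      # (N, C) = (3, 4)
y = np.array([2, 0, 3])
ids = np.arange(3) * 4 + y
assert np.allclose(sm.reshape(12)[ids], sm[np.arange(3), y])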
def _loss_function(*args):
    normal_loss = model.loss(model.forward(X, 'train'), Y)
    # Softmax over the model's outputs on the K noisy inputs.
    noisy_output = model.forward(noisy_X, 'train')
    noisy_output -= np.max(noisy_output, axis=1).reshape((K, 1))
    noisy_output = np.exp(noisy_output)
    model_p_noisy_X = noisy_output / np.sum(noisy_output, axis=1).reshape((K, 1))
    # KL divergence between the uniform distribution and the model's
    # predictions on the noisy inputs, averaged over the K samples.
    kl = KL(1.0 / N_CLASSES, model_p_noisy_X)
    noisy_loss = gamma * np.sum(kl) / float(K)
    return gamma * normal_loss + (1 - gamma) * noisy_loss
def grad(g):
    # Gradient of the softmax cross-entropy loss with respect to the scores x:
    # softmax(x) minus one at the true label, averaged over the batch.
    import numpy as np
    y = label.astype(int)
    probs = np.exp(x - np.max(x, axis=1, keepdims=True))
    probs /= np.sum(probs, axis=1, keepdims=True)
    N = x.shape[0]
    dx = probs.copy()
    dx[np.arange(N), y] -= 1
    dx /= N
    return dx
def softmax_crossentropy(x, y):
    EPSI = 1e-6
    batch_size, seq_len, prob_dim = x.shape
    # Flatten the time dimension so every timestep is treated as one example.
    x = x.reshape((x.shape[0] * x.shape[1], x.shape[2]))
    y = y.reshape((y.shape[0] * y.shape[1],))
    # x is now (batch * seq_len, prob_dim), y is (batch * seq_len,)
    x_dev = x - np.max(x, axis=1, keepdims=True)  # minpy doesn't support x.max()
    sm = x_dev - np.log(EPSI + np.sum(np.exp(x_dev), axis=1, keepdims=True))
    # Flat index of the correct class in each row; the stride is the class
    # dimension prob_dim, not seq_len.
    ids = np.arange(0, y.shape[0]) * prob_dim + y
    ce = -np.sum(sm.reshape((sm.shape[0] * sm.shape[1],))[ids]) / (1.0 * y.shape[0])
    # minpy doesn't support -1 in shape inference
    return ce
def softmax_loss(x, y):
    """
    Computes the loss for softmax classification.

    Inputs:
    - x: Input data, of shape (N, C) where x[i, j] is the score for the jth
      class for the ith input.
    - y: Vector of labels, of shape (N,) where y[i] is the label for x[i] and
      0 <= y[i] < C

    Returns:
    - loss: Scalar giving the loss
    """
    # Subtract the per-row max before exponentiating for numerical stability
    # (expand_dims is used instead of keepdims).
    probs = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1))
    probs = probs / np.expand_dims(np.sum(probs, axis=1), axis=1)
    N = x.shape[0]
    loss = -np.sum(np.log(probs[np.arange(N), y])) / N
    return loss
def temporal_softmax_loss(x, y, mask, verbose=False):
    """
    A temporal version of softmax loss for use in RNNs. We assume that we are
    making predictions over a vocabulary of size V for each timestep of a
    timeseries of length T, over a minibatch of size N. The input x gives
    scores for all vocabulary elements at all timesteps, and y gives the
    indices of the ground-truth element at each timestep. We use a
    cross-entropy loss at each timestep, summing the loss over all timesteps
    and averaging across the minibatch.

    As an additional complication, we may want to ignore the model output at
    some timesteps, since sequences of different length may have been combined
    into a minibatch and padded with NULL tokens. The optional mask argument
    tells us which elements should contribute to the loss.

    Inputs:
    - x: Input scores, of shape (N, T, V)
    - y: Ground-truth indices, of shape (N, T) where each element is in the
      range 0 <= y[i, t] < V
    - mask: Boolean array of shape (N, T) where mask[i, t] tells whether or not
      the scores at x[i, t] should contribute to the loss.

    Returns:
    - loss: Scalar giving the loss
    """
    N, T, V = x.shape

    x_flat = x.reshape(N * T, V)
    y_flat = y.reshape(N * T)
    mask_flat = mask.reshape(N * T)

    probs = np.exp(x_flat - np.max(x_flat, axis=1, keepdims=True))
    probs = probs / np.sum(probs, axis=1, keepdims=True)
    loss = -np.sum(mask_flat * np.log(probs[np.arange(N * T), y_flat])) / N
    return loss
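# Hedged usage sketch (not from the original code; assumes plain numpy and the
# temporal_softmax_loss above): scores for a padded minibatch, where mask zeroes
# out the timesteps that were padded with NULL tokens.
import numpy as np

N, T, V = 2, 3, 5
rng = np.random.RandomState(2)
x = rng.randn(N, T, V)
y = rng.randint(0, V, size=(N, T))
mask = np.array([[True, True, True],
                 [True, False, False]])  # the second sequence has length 1
loss = temporal_softmax_loss(x, y, mask)
print(loss)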
def red3(x):
    return mp.max(x, axis=1, keepdims=True)
def logsumexp(X, axis=1):
    # Subtract the (scalar) global max before exponentiating; it is added back,
    # so it cancels exactly and only improves the numerical range.
    max_X = np.max(X)
    return max_X + np.log(np.sum(np.exp(X - max_X), axis=axis, keepdims=True))
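# A small check (a sketch, assuming plain numpy and the logsumexp above): the
# scalar max cancels exactly, so logsumexp matches the naive expression, and
# X - logsumexp(X) gives rows of log-softmax that exponentiate to sum to 1.
import numpy as np

X = np.random.RandomState(3).randn(4, 6)
ref = np.log(np.sum(np.exp(X), axis=1, keepdims=True))
assert np.allclose(logsumexp(X), ref)
log_probs = X - logsumexp(X)
assert np.allclose(np.sum(np.exp(log_probs), axis=1), 1.0)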
def logsoftmax(x, valid_idx):
    # Boost the scores at the valid indices so that all other entries receive
    # negligible probability mass. Note that x is modified in place.
    x[np.array(valid_idx)] += 1e6
    x_max = np.max(x)
    return x - x_max - np.log(np.sum(np.exp(x - x_max)))
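# Hedged example (assumes plain numpy and the logsoftmax above; a copy is passed
# because logsoftmax modifies its input): indices outside valid_idx end up with
# log-probability around -1e6, i.e. effectively zero mass.
import numpy as np

scores = np.array([1.0, 2.0, 3.0, 4.0])
lp = logsoftmax(scores.copy(), valid_idx=[0, 2])
probs = np.exp(lp)
assert probs[[0, 2]].sum() > 0.999  # nearly all mass lies on indices 0 and 2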
minimum = {key: [] for key in model.params}
maximum = {key: [] for key in model.params}
for i in range(iterations):
    X_batch = data[0][batch_index * batch_size:(batch_index + 1) * batch_size]
    Y_batch = data[1][batch_index * batch_size:(batch_index + 1) * batch_size]
    batch_index = (batch_index + 1) % batches
    gradients, loss = gradient_loss(model, X_batch, Y_batch)
    loss = loss.asnumpy()[0]
    loss_history.append(loss)
    for key, value in zip(model.params.keys(), gradients):
        mean[key].append(np.mean(value).asnumpy())
        std[key].append(np.std(value).asnumpy())
        L_2[key].append(np.mean(value**2).asnumpy())
        minimum[key].append(np.min(value).asnumpy())
        maximum[key].append(np.max(value).asnumpy())
    updater.update(gradients)
    if (i + 1) % rescaling_interval == 0:
        rescale(mlp, data[2], model.params)  # validation data
        print 'rescaled'
    if (i + 1) % interval == 0:
        print 'iteration %d loss %f' % (i + 1, loss)
pickle.dump((loss_history, mean, std, L_2, minimum, maximum),
            open('dr-g-norm-%d' % rescaling_interval, 'wb'))
def red1(x):
    return mp.max(x)
def red2(x):
    return mp.max(x, axis=1)
def logsumexp(X, axis, keepdims=False):
    # Same scalar-max trick as above; the max is added back, so it cancels exactly.
    max_X = np.max(X)
    return max_X + np.log(np.sum(np.exp(X - max_X), axis=axis, keepdims=keepdims))
def red5(x):
    return mp.max(x, axis=0, keepdims=True)
def red4(x):
    return mp.max(x, axis=0)