def costfunc_gpu_ReLU(x, *args): num_input,num_hidden,num_output,inputs,lambda_val,sparsityParam,beta = args num_weights1 = (num_input+1)*num_hidden x = gpu.garray(x) inputs = gpu.garray(inputs) #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1))) weights1 = x[0:num_weights1].reshape((num_hidden,num_input+1)) #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1))) weights2 = x[num_weights1:shape(x)[0]].reshape((num_output,num_hidden+1)) nData = shape(inputs)[1] data = gpu.concatenate((gpu.ones((1,nData)), inputs), axis = 0) hidden_sum = gpu.dot(weights1, data) hidden_activation = gpu.log(1+hidden_sum.exp()) p_avg = gpu.sum(hidden_activation,axis=1)/nData hidden_activation = gpu.concatenate((gpu.ones((1,nData)), hidden_activation), axis = 0) output = gpu.dot(weights2, hidden_activation) regularized_penalty1 = weights1[:,1:shape(weights1)[1]] regularized_penalty2 = weights2[:,1:shape(weights2)[1]] regularized_penalty1 = regularized_penalty1 * regularized_penalty1 regularized_penalty2 = regularized_penalty2 * regularized_penalty2 output_target_diff = (output - inputs)*(output - inputs) KL = gpu.sum(sparsityParam*gpu.log(sparsityParam/p_avg) + (1-sparsityParam)*gpu.log((1-sparsityParam)/(1-p_avg))) cost = gpu.sum(output_target_diff)/(2*nData) + 0.5 * lambda_val * (gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)) + beta*KL print 'ReLU Linear Decoder Cost: ', cost return cost
def bKL(x, y):
    """Elementwise Kullback-Leibler divergence between Bernoulli vectors.

    Computes ``x*log(x/y) + (1-x)*log((1-x)/(1-y))``. The result is not
    symmetric in ``x`` and ``y``.
    """
    on_term = x * gpu.log(x / y)
    off_term = (1 - x) * gpu.log((1 - x) / (1 - y))
    return on_term + off_term
def KL(rho, rho_target, KL_flat):
    """Elementwise Bernoulli KL divergence of ``rho`` from ``rho_target``.

    Works on a copy of ``rho``, so the argument is left untouched. When
    ``KL_flat`` is true, entries below ``rho_target`` are first raised to
    ``rho_target``, which makes their divergence zero.
    """
    clipped = rho.copy()
    if KL_flat:
        below = gp.where(clipped < rho_target)
        clipped[below] = rho_target * gp.ones(clipped[below].shape)
    on_term = rho_target * gp.log(rho_target / clipped)
    off_term = (1 - rho_target) * gp.log((1 - rho_target) / (1 - clipped))
    return on_term + off_term
def reconstruction_cross_entropy(self, vis):
    """Score ``vis`` against its one-step Gibbs reconstruction.

    Runs ``self.gibbs_sample(vis, 1)`` and uses the second returned value
    as the reconstruction probabilities ``p``. For each row the result is
    the mean over axis 1 of ``vis*log(p) + (1-vis)*log(1-p)``.

    The previous code subtracted the second term, which matches neither
    the cross-entropy nor the log-likelihood. The two terms are now added.
    The sign of the first term is kept, so the returned values are
    non-positive (the negated cross-entropy) and larger means a better
    reconstruction.
    """
    _, sampled_p_vis = self.gibbs_sample(vis, 1)
    log_likelihood = (vis * gp.log(sampled_p_vis)
                      + (1 - vis) * gp.log(1 - sampled_p_vis))
    return gp.mean(log_likelihood, axis=1)
def forward(self):
    """Compute the binary cross-entropy objective and store it on ``self``.

    Reads the targets from ``self.target_port.getOutput()`` and the
    predictions from ``self.output_port.getOutput()``. Sets
    ``self.objective`` to the summed cross-entropy divided by the size of
    the predictions' first axis.

    Both logarithms are now offset by the same 1e-6. Previously only the
    ``log(1 - output)`` term was guarded, so a prediction of exactly 0
    gave ``0 * log(0)`` = NaN.
    """
    targets = gpu.garray(self.target_port.getOutput())
    outputs = gpu.garray(self.output_port.getOutput())
    numExamples = outputs.shape[0]

    positive_term = gpu.sum(targets * gpu.log(outputs + 0.000001))
    negative_term = gpu.sum((1.0 - targets) * gpu.log(1.000001 - outputs))

    self.objective = -positive_term
    self.objective += -negative_term
    self.objective /= numExamples
def forward(self):
    """Compute the binary cross-entropy objective and store it on ``self``.

    Reads the targets from ``self.target_port.getOutput()`` and the
    predictions from ``self.output_port.getOutput()``. Sets
    ``self.objective`` to the summed cross-entropy divided by the size of
    the predictions' first axis.

    Both logarithms are now offset by the same 1e-6. Previously only the
    ``log(1 - output)`` term was guarded, so a prediction of exactly 0
    gave ``0 * log(0)`` = NaN.
    """
    targets = gpu.garray(self.target_port.getOutput())
    outputs = gpu.garray(self.output_port.getOutput())
    numExamples = outputs.shape[0]

    positive_term = gpu.sum(targets * gpu.log(outputs + 0.000001))
    negative_term = gpu.sum((1.0 - targets) * gpu.log(1.000001 - outputs))

    self.objective = -positive_term
    self.objective += -negative_term
    self.objective /= numExamples
def costfunc_gpu(x, *args): num_input, num_hidden, num_output, inputs, noNoiseData, lambda_val, sparsityParam, beta = args num_weights1 = (num_input + 1) * num_hidden x = gpu.garray(x) # randomNoise = random.random_sample(shape(inputs)) # criteriaTable = randomNoise > 0.32 # inputs = inputs * criteriaTable inputs = gpu.garray(inputs) noNoiseData = gpu.garray(noNoiseData) #weights1 = gpu.garray(reshape(x[0:num_weights1],(num_hidden,num_input+1))) weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1)) #weights2 = gpu.garray(reshape(x[num_weights1:shape(x)[0]], (num_output,num_hidden+1))) weights2 = x[num_weights1:shape(x)[0]].reshape( (num_output, num_hidden + 1)) nData = shape(inputs)[1] data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0) hidden_sum = gpu.dot(weights1, data) hidden_activation = hidden_sum.logistic() p_avg = gpu.sum(hidden_activation, axis=1) / nData hidden_activation = gpu.concatenate((gpu.ones( (1, nData)), hidden_activation), axis=0) output = gpu.dot(weights2, hidden_activation) regularized_penalty1 = weights1[:, 1:shape(weights1)[1]] regularized_penalty2 = weights2[:, 1:shape(weights2)[1]] regularized_penalty1 = regularized_penalty1 * regularized_penalty1 regularized_penalty2 = regularized_penalty2 * regularized_penalty2 output_target_diff = (output - noNoiseData) * (output - noNoiseData) KL = gpu.sum(sparsityParam * gpu.log(sparsityParam / p_avg) + (1 - sparsityParam) * gpu.log((1 - sparsityParam) / (1 - p_avg))) cost = gpu.sum(output_target_diff) / (2 * nData) + 0.5 * lambda_val * ( gpu.sum(regularized_penalty1) + gpu.sum(regularized_penalty2)) + beta * KL print 'GPU Linear Denoising Decoder Cost: ', cost del x del inputs del noNoiseData del data del hidden_sum del hidden_activation del p_avg del output del regularized_penalty1 del regularized_penalty2 del weights1 del weights2 del output_target_diff gpu.free_reuse_cache() return cost
def from_moments(moments, weights_std=0.):
    """Build an RBM whose biases reproduce the given first moments.

    The visible and hidden biases are the logits of ``moments.expect_vis``
    and ``moments.expect_hid``. Weights are drawn from a zero-mean normal
    with standard deviation ``weights_std`` when it is positive, and are
    all zero otherwise. Asserts that ``moments`` is a ``Moments`` whose
    ``expect_prod`` equals the outer product of the two expectation
    vectors, and that both bias vectors are finite.
    """
    assert isinstance(moments, Moments)
    assert np.allclose(
        moments.expect_prod.as_numpy_array(),
        gnp.outer(moments.expect_vis, moments.expect_hid).as_numpy_array())

    # logit(p) = log(p) - log(1 - p)
    vbias = gnp.log(moments.expect_vis) - gnp.log(1. - moments.expect_vis)
    hbias = gnp.log(moments.expect_hid) - gnp.log(1. - moments.expect_hid)
    assert np.all(np.isfinite(vbias.as_numpy_array()))
    assert np.all(np.isfinite(hbias.as_numpy_array()))

    dims = (vbias.size, hbias.size)
    if weights_std > 0.:
        weights = gnp.garray(np.random.normal(0., weights_std, size=dims))
    else:
        weights = gnp.zeros(dims)
    return RBM(vbias, hbias, weights)
def rect_log(x, computeGrad=False):
    """Rectified-log nonlinearity, or its gradient factor.

    With ``computeGrad`` false, returns ``log(x + 1)`` where ``x > 0`` and
    0 elsewhere. With ``computeGrad`` true, returns ``1 / exp(x)`` where
    ``x > 0`` and 0 elsewhere.
    """
    positive = (x > 0)
    if computeGrad:
        return positive / (gp.exp(x))
    return gp.log(x * positive + 1) * positive
def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):
    """Softmax cross-entropy loss against ``self.target`` and its gradient.

    ``pred`` is converted to a garray and turned into row-wise softmax
    probabilities, with each row's maximum subtracted before
    exponentiation. Returns ``(loss, grad)`` where ``loss`` is the summed
    negative log-probability weighted by ``self.target`` and ``grad`` is
    ``probabilities - self.target``. ``compute_grad`` is accepted but not
    read; the gradient is always returned.
    """
    pred = gnp.as_garray(pred)
    row_max = pred.max(axis=1)[:, gnp.newaxis]
    unnormalized = gnp.exp(pred - row_max)
    probs = unnormalized / unnormalized.sum(axis=1)[:, gnp.newaxis]
    loss = -(self.target * gnp.log(probs + _SMALL_CONSTANT)).sum()
    return loss, probs - self.target
def rect_log(x, computeGrad=False):
    """Rectified-log nonlinearity, or its gradient factor.

    With ``computeGrad`` false, returns ``log(x + 1)`` where ``x > 0`` and
    0 elsewhere. With ``computeGrad`` true, returns ``1 / exp(x)`` where
    ``x > 0`` and 0 elsewhere.
    """
    is_positive = (x > 0)
    if not computeGrad:
        rectified = x * is_positive
        return gp.log(rectified + 1) * is_positive
    return is_positive / (gp.exp(x))
def log_exp_sum(x, axis=1):
    """Compute ``log(sum(exp(x)))`` along ``axis`` in a stable way.

    The maximum along ``axis`` is subtracted before exponentiation and
    added back afterwards. For a gnumpy garray the result is converted
    with ``.asarray()``; for any other input numpy is used throughout.

    The previous code always broadcast the maximum as ``x_max[:, newaxis]``,
    which is only correct for ``axis=1`` on 2-D input. The reduced axis is
    now re-inserted at the position given by ``axis``.
    """
    x_max = x.max(axis=axis)

    # Index that puts a length-1 axis back where the reduction removed it,
    # so that x_max broadcasts against x whatever axis was reduced.
    expand = [slice(None)] * x.ndim
    expand[axis] = None
    expand = tuple(expand)

    if isinstance(x, gnp.garray):
        shifted_sum = gnp.exp(x - x_max[expand]).sum(axis=axis)
        return (x_max + gnp.log(shifted_sum)).asarray()
    return x_max + np.log(np.exp(x - x_max[expand]).sum(axis=axis))
def forward_prop(self, X, add_noise=False, compute_loss=False, is_test=True):
    """Map the input matrix ``X`` to this layer's output.

    Dropout is applied to the input when ``add_noise`` is set and
    ``self.params.dropout`` is positive. Without batch normalization the
    activation is ``inputs.W + b`` and, if ``self.sparsity_weight`` is
    positive, the running sparsity estimate and sparsity objective are
    refreshed. With batch normalization the bias is omitted and the
    activation goes through ``self.bn_layer`` (with ``is_test`` forwarded)
    before the nonlinearity.

    When ``compute_loss`` is set and a loss is attached, the loss value
    and gradient are computed on the nonlinearity output if
    ``self.loss_after_nonlin`` is set, otherwise on the nonlinearity
    input. Returns ``self.output``.
    """
    if self.params.dropout > 0 and add_noise:
        keep_mask = gnp.rand(X.shape[0], X.shape[1]) > self.params.dropout
        self.dropout_mask = keep_mask
        self.inputs = X * keep_mask
    else:
        self.inputs = X
    self.noise_added = add_noise

    if self.use_batch_normalization:
        self.activation = self.inputs.dot(self.params.W)
        self.bn_output = self.bn_layer.forward_prop(
            self.activation, is_test=is_test)
        nonlin_input = self.bn_output
        self.output = self.nonlin.forward_prop(nonlin_input)
    else:
        self.activation = self.inputs.dot(self.params.W) + self.params.b
        nonlin_input = self.activation
        self.output = self.nonlin.forward_prop(nonlin_input)

        if self.sparsity_weight > 0:
            # Exponential moving average of the mean unit activation.
            smoothing = self._sparsity_smoothing
            self._sparsity_current = (
                smoothing * self.output.mean(axis=0)
                + (1 - smoothing) * self._sparsity_current)
            on_term = self.sparsity * gnp.log(self._sparsity_current + 1e-20)
            off_term = (1 - self.sparsity) * gnp.log(
                1 - self._sparsity_current + 1e-20)
            self._sparsity_objective = (
                (-on_term - off_term).sum() * self.sparsity_weight)

    if compute_loss and self.loss is not None:
        loss_input = self.output if self.loss_after_nonlin else nonlin_input
        self.loss_value, self.loss_grad = self.loss.compute_loss_and_grad(
            loss_input, compute_grad=True)
        self.loss_computed = True

    return self.output
def getErrorLoss(self, a0, a2, factor=1.0):
    """Return the negative log-likelihood error, scaled by ``factor``.

    The per-element likelihood is ``exp(-a2) * a2**a0 / self.factor[a0]``.
    Its log is summed over axis 1, averaged, negated and multiplied by
    ``factor``.
    NOTE(review): the form matches a Poisson pmf if ``self.factor`` is a
    factorial lookup table - confirm against where it is built.
    """
    likelihood = gp.exp(-a2) * (a2 ** a0) / self.factor[a0]
    log_likelihood = gp.log(likelihood)
    return -log_likelihood.sum(axis=1).mean() * factor
def getErrorLoss(self, a0, a2, factor=1.0):
    """Return the negative log-likelihood error, scaled by ``factor``.

    The per-element likelihood is ``exp(-a2) * a2**a0 / self.factor[a0]``.
    Its log is summed over axis 1, averaged, negated and multiplied by
    ``factor``.
    NOTE(review): the form matches a Poisson pmf if ``self.factor`` is a
    factorial lookup table - confirm against where it is built.
    """
    rate_power = a2 ** a0
    prob = gp.exp(-a2) * rate_power / self.factor[a0]
    per_row = gp.log(prob).sum(axis=1)
    return -per_row.mean() * factor
def pseudo_likelihood_for_bit(self, vis, i):
    """Return the log-likelihood of bit ``i`` of ``vis`` given the other bits.

    Computed as ``log(logistic(F(vis_flipped) - F(vis)))`` where ``F`` is
    ``self.free_energy`` and ``vis_flipped`` is ``vis`` with column ``i``
    replaced by ``1 - vis[:, i]``.

    The flip is made on a copy. The previous code bound ``vis_flip`` to
    the same array as ``vis``, so the caller's data came back with column
    ``i`` inverted.
    """
    fe = self.free_energy(vis)
    vis_flip = vis.copy()
    vis_flip[:, i] = 1 - vis[:, i]
    fe_flip = self.free_energy(vis_flip)
    return gp.log(gp.logistic(fe_flip - fe))
def mlpSoftmax_costfunc(x, *args): numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) #x = gpu.garray(x) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) #print numClasses, l2Size theta_L2 = gpu.garray( reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1], (l2Size, l1Size + 1))) #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1)) theta_softmax = gpu.garray( reshape(x[num_weights_L2 + num_weights_L1:shape(x)[0]], (numClasses, l2Size))) #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size)) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) hidden_activation_L1 = hidden_sum_L1.logistic() hidden_activation_L1 = gpu.concatenate((gpu.ones( (1, numCases)), hidden_activation_L1), axis=0) hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1) hidden_activation_L2 = hidden_sum_L2.logistic() hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions, axis=0) temp = groundTruth * gpu.log(predictions) regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2 cost = -1 * gpu.sum(temp) / numCases + 0.5 * lambda_hidden * ( gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_L2) ) + 0.5 * lambda_softmax * gpu.sum(theta_softmax * theta_softmax) print 'Multilayer Softmax Cost:', cost del 
inputs del theta_L1 del theta_L2 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_L2 del hidden_activation_L2 del hidden_sum_softmax del predictions del temp del regularized_penalty_L1 del regularized_penalty_L2 gpu.free_reuse_cache() return cost
def log_exp_sum_1d(x):
    """Compute ``log(exp(x_1) + ... + exp(x_n))`` without overflow.

    The value is evaluated as ``x* + log(sum_i exp(x_i - x*))`` with
    ``x*`` the maximum of ``x``. gnumpy is used for garray input and
    numpy for anything else.
    """
    peak = x.max()
    backend = gnp if isinstance(x, gnp.garray) else np
    return peak + backend.log(backend.exp(x - peak).sum())
def from_moments(moments, weights_std=0.):
    """Build an RBM whose biases reproduce the given first moments.

    The visible and hidden biases are the logits of ``moments.expect_vis``
    and ``moments.expect_hid``. Weights are drawn from a zero-mean normal
    with standard deviation ``weights_std`` when it is positive, and are
    all zero otherwise. Asserts that ``moments`` is a ``Moments`` whose
    ``expect_prod`` equals the outer product of the two expectation
    vectors, and that both bias vectors are finite.
    """
    assert isinstance(moments, Moments)
    assert np.allclose(
        moments.expect_prod.as_numpy_array(),
        gnp.outer(moments.expect_vis, moments.expect_hid).as_numpy_array())

    expect_vis = moments.expect_vis
    expect_hid = moments.expect_hid
    # logit(p) = log(p) - log(1 - p)
    vbias = gnp.log(expect_vis) - gnp.log(1. - expect_vis)
    hbias = gnp.log(expect_hid) - gnp.log(1. - expect_hid)
    assert np.all(np.isfinite(vbias.as_numpy_array()))
    assert np.all(np.isfinite(hbias.as_numpy_array()))

    weight_shape = (vbias.size, hbias.size)
    if weights_std > 0.:
        weights = gnp.garray(
            np.random.normal(0., weights_std, size=weight_shape))
    else:
        weights = gnp.zeros(weight_shape)
    return RBM(vbias, hbias, weights)
def dbn_supervised_predict_exact(ws_vh, ws_v, ws_h, x):
    """Predict class probabilities for ``x`` with a supervised DBN.

    Uses the exact method of section 6.2 of Hinton, Osindero, Teh 2006;
    the free energy formula follows
    http://deeplearning.net/tutorial/rbm.html

    ``x`` is the input data (NxD matrix). It is passed deterministically
    through every layer but the last. Each class in turn is then assumed
    to be the label, and the negated free energy of the top-level RBM is
    evaluated for it. The returned NxK matrix is the row-wise softmax of
    those negated free energies.
    """
    num_layers = len(ws_vh)
    num_samples = x.shape[0]

    # Deterministic bottom-up pass: activations are propagated, not samples.
    h_prev = x.T
    for layer in range(num_layers - 1):
        h_prev = gnp.logistic(gnp.dot(ws_vh[layer].T, h_prev) + ws_h[layer])

    num_top_visible = ws_vh[-1].shape[0]   # visible units of the top RBM
    num_penultimate = h_prev.shape[0]      # units fed in from below
    num_classes = num_top_visible - num_penultimate  # label units

    y = gnp.zeros((num_classes, num_samples))
    # Holds the NEGATED free energy per sample and class.
    free_energy = gnp.zeros((num_samples, num_classes))
    for k in range(num_classes):
        y[k, :] = 1.0                            # assume class k is correct
        v = gnp.concatenate((y, h_prev))         # top-level visible vector
        e_v = gnp.dot(ws_v[-1].T, v)             # bias energy term
        ah = gnp.dot(ws_vh[-1].T, v) + ws_h[-1]
        e_h = gnp.sum(gnp.log(gnp.exp(ah) + 1.0), axis=0)
        free_energy[:, k] = e_v + e_h
        y[:, :] = 0.0                            # reset for the next class

    # Shift by the row maximum before exponentiating, then normalise.
    shifted = gnp.exp(
        free_energy - gnp.max(free_energy, axis=1)[:, gnp.newaxis])
    pred_y = shifted / gnp.sum(shifted, axis=1)[:, gnp.newaxis]
    return pred_y
def xent_loss_and_grad(self, Yh, Y_cat):
    """Cross-entropy loss and gradient for predictions ``Yh``.

    ``Y_cat`` gives the target class index for each row of ``Yh``. It is
    expanded to one-hot rows, moved to the GPU and compared with
    ``self.safe_softmax(Yh)``. Returns ``[L, dLdYh]`` where ``L`` is the
    summed negative log-probability of the targets and ``dLdYh`` is
    ``softmax - one_hot``.
    """
    # One-hot targets are built on the host, then pushed to the GPU.
    one_hot = zeros(Yh.shape)
    row_index = np.arange(one_hot.shape[0])
    one_hot[row_index, Y_cat] = 1.0
    one_hot = gp.garray(one_hot)

    softmax_out = self.safe_softmax(Yh)
    loss = -gp.sum((one_hot * gp.log(softmax_out)))
    return [loss, softmax_out - one_hot]
def forward_prop(self, X, add_noise=False, compute_loss=False):
    """Map the input matrix ``X`` to this layer's output.

    Dropout is applied to the input when ``add_noise`` is set and
    ``self.params.dropout`` is positive. Without batch normalization the
    activation is ``inputs.W + b`` and, if ``self.sparsity_weight`` is
    positive, the running sparsity estimate and sparsity objective are
    refreshed. With batch normalization the bias is omitted and the
    activation goes through ``self.bn_layer`` before the nonlinearity.

    When ``compute_loss`` is set and a loss is attached, the loss value
    and gradient are computed on the nonlinearity output if
    ``self.loss_after_nonlin`` is set, otherwise on the nonlinearity
    input. Returns ``self.output``.
    """
    if self.params.dropout > 0 and add_noise:
        keep_mask = gnp.rand(X.shape[0], X.shape[1]) > self.params.dropout
        self.dropout_mask = keep_mask
        self.inputs = X * keep_mask
    else:
        self.inputs = X
    self.noise_added = add_noise

    if self.use_batch_normalization:
        self.activation = self.inputs.dot(self.params.W)
        self.bn_output = self.bn_layer.forward_prop(self.activation)
        nonlin_input = self.bn_output
        self.output = self.nonlin.forward_prop(nonlin_input)
    else:
        self.activation = self.inputs.dot(self.params.W) + self.params.b
        nonlin_input = self.activation
        self.output = self.nonlin.forward_prop(nonlin_input)

        if self.sparsity_weight > 0:
            # Exponential moving average of the mean unit activation.
            smoothing = self._sparsity_smoothing
            self._sparsity_current = (
                smoothing * self.output.mean(axis=0)
                + (1 - smoothing) * self._sparsity_current)
            on_term = self.sparsity * gnp.log(self._sparsity_current + 1e-20)
            off_term = (1 - self.sparsity) * gnp.log(
                1 - self._sparsity_current + 1e-20)
            self._sparsity_objective = (
                (-on_term - off_term).sum() * self.sparsity_weight)

    if compute_loss and self.loss is not None:
        loss_input = self.output if self.loss_after_nonlin else nonlin_input
        self.loss_value, self.loss_grad = self.loss.compute_loss_and_grad(
            loss_input, compute_grad=True)
        self.loss_computed = True

    return self.output
def costFunction(X, y, theta1, theta2, lam=None, reg=False):
    """Cross-entropy cost of the network, optionally L2-regularized.

    ``y`` is mapped to binary vectors with ``binaryMapper`` and ``X`` is
    fed through ``forwardProp``. When ``reg`` equals ``True`` a penalty of
    ``lam / (2m)`` times the squared weights (first column of each theta
    excluded) is added, so ``lam`` must then be a number. The cost is
    printed. Returns ``(J, a1, a2, a3)``.
    """
    m = np.size(y)                           # number of training examples
    y = gpu.garray(binaryMapper(y)).T        # labels as binary vectors
    a1, a2, a3 = forwardProp(X, theta1, theta2)

    positive_term = gpu.log(a3) * y
    negative_term = gpu.log(1 - a3) * (1 - y)
    J = gpu.sum(-(positive_term) - (negative_term)) / m

    # The comparison is kept as-is so that only True (or 1) enables it.
    if reg == True:
        squared_weights = (gpu.sum(theta1[:, 1:] ** 2)
                           + gpu.sum(theta2[:, 1:] ** 2))
        J += (squared_weights * (lam / (2.0 * m)))
        label = "Regularized Cost: "
    else:
        label = "Unregularized Cost: "
    print(label + str(J))

    return J, a1, a2, a3
def loss_mclr(Yh, Y):
    """Compute multinomial logistic regression loss for Yh, w.r.t. Y.

    Values in Yh should probably be network outputs, and each row in Y must
    be a +1/-1 indicator vector for the target class of a row in Yh.

    Returns a dict with 'L', the mean negative log-probability of the
    target classes, and 'dL', ``(softmax(Yh) - mask) / n`` where ``mask``
    is ``Y > 0`` and ``n`` is the number of rows of ``Y``.

    Each row is shifted by its maximum before exponentiation, and the
    log-probabilities come from the shifted logits. This gives the same
    value as the direct formula, but large logits no longer overflow to
    inf/inf = NaN and tiny probabilities no longer hit ``log(0)``.
    """
    obs_count = float(Y.shape[0])
    # Boolean mask marking each observation's target class.
    cl_mask = (Y > 0.0)

    # Softmax is invariant to a per-row shift; the max keeps exp() <= 1.
    shifted = Yh - gp.max(Yh, axis=1)[:, gp.newaxis]
    exp_shifted = gp.exp(shifted)
    sm_sum = gp.sum(exp_shifted, axis=1)
    P = exp_shifted / sm_sum[:, gp.newaxis]

    dL = (P - cl_mask) / obs_count
    # log softmax = shifted logits - log of the partition sum
    logP = (shifted - gp.log(sm_sum)[:, gp.newaxis]) * cl_mask
    L = -gp.sum(logP) / obs_count
    return {'L': L, 'dL': dL}
def loss_mclr(Yh, Y):
    """Compute multinomial logistic regression loss for Yh, w.r.t. Y.

    Values in Yh should probably be network outputs, and each row in Y must
    be a +1/-1 indicator vector for the target class of a row in Yh.

    Returns a dict with 'L', the mean negative log-probability of the
    target classes, and 'dL', ``(softmax(Yh) - mask) / n`` where ``mask``
    is ``Y > 0`` and ``n`` is the number of rows of ``Y``.

    Each row is shifted by its maximum before exponentiation, and the
    log-probabilities come from the shifted logits. This gives the same
    value as the direct formula, but large logits no longer overflow to
    inf/inf = NaN and tiny probabilities no longer hit ``log(0)``.
    """
    obs_count = float(Y.shape[0])
    # Boolean mask marking each observation's target class.
    cl_mask = (Y > 0.0)

    # Softmax is invariant to a per-row shift; the max keeps exp() <= 1.
    shifted = Yh - gp.max(Yh, axis=1)[:, gp.newaxis]
    exp_shifted = gp.exp(shifted)
    sm_sum = gp.sum(exp_shifted, axis=1)
    P = exp_shifted / sm_sum[:, gp.newaxis]

    dL = (P - cl_mask) / obs_count
    # log softmax = shifted logits - log of the partition sum
    logP = (shifted - gp.log(sm_sum)[:, gp.newaxis]) * cl_mask
    L = -gp.sum(logP) / obs_count
    return {'L': L, 'dL': dL}
def mlpSoftmax_costfunc(x, *args): numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) num_weights_L2 = l2Size * (l1Size + 1) #x = gpu.garray(x) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1)) #print numClasses, l2Size theta_L2 = gpu.garray(reshape(x[num_weights_L1:num_weights_L2+num_weights_L1], (l2Size, l1Size + 1))) #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1)) theta_softmax = gpu.garray(reshape(x[num_weights_L2+num_weights_L1:shape(x)[0]], (numClasses, l2Size))) #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size)) inputs = gpu.concatenate((gpu.ones((1,numCases)), inputs), axis = 0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) hidden_activation_L1 = hidden_sum_L1.logistic() hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0) hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1) hidden_activation_L2 = hidden_sum_L2.logistic() hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L2) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis = 0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions,axis = 0) temp = groundTruth*gpu.log(predictions) regularized_penalty_L1 = theta_L1[:,1:shape(theta_L1)[1]] regularized_penalty_L2 = theta_L2[:,1:shape(theta_L2)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2 cost = -1*gpu.sum(temp)/numCases + 0.5 * lambda_hidden*(gpu.sum(regularized_penalty_L1) + gpu.sum(regularized_penalty_L2)) + 0.5 * lambda_softmax * gpu.sum(theta_softmax*theta_softmax) print 'Multilayer Softmax Cost:', cost del inputs del theta_L1 
del theta_L2 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_L2 del hidden_activation_L2 del hidden_sum_softmax del predictions del temp del regularized_penalty_L1 del regularized_penalty_L2 gpu.free_reuse_cache() return cost
def costAndGrad(self, data, labels):
    """Run forward and backward propagation for one minibatch.

    ``data`` becomes the first entry of ``self.hActs``. Each ``(w, b)`` in
    ``self.stack`` is applied in turn, with ``self.activation`` on the
    first ``len(self.layerSizes)`` layers. The final layer goes through a
    softmax over axis 0 whose probabilities are floored at 1e-8.

    Returns ``(cost, None)`` when ``self.train`` is false, otherwise
    ``(cost, self.grad)`` with ``self.grad[i]`` holding the weight and
    bias gradients of layer ``i``, averaged over ``self.mbSize``.
    """
    # Forward propagation
    self.hActs[0] = data
    num_hidden = len(self.layerSizes)
    for i, (w, b) in enumerate(self.stack, 1):
        layer_out = w.dot(self.hActs[i - 1]) + b
        if i <= num_hidden:
            layer_out = self.activation(layer_out)
        self.hActs[i] = layer_out

    # Softmax, shifted by the column maximum before exponentiation.
    probs = gp.exp(self.hActs[-1] - gp.max(self.hActs[-1], axis=0))
    probs = probs / gp.sum(probs, axis=0)
    # Raise probabilities below 1e-8 to exactly 1e-8.
    probs += (probs < 1e-8) * (1e-8 - probs)

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * gp.sum(labelMat * gp.log(probs))

    if not self.train:
        return cost, None

    # Back propagation
    self.deltas[-1] = probs - labelMat
    top = num_hidden - 1
    for offset, (w, b) in enumerate(reversed(self.stack[1:])):
        i = top - offset
        grad = self.activation(self.hActs[i + 1], True)
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * grad

    # Gradients, averaged over the minibatch.
    scale = 1. / self.mbSize
    for i in range(len(self.grad)):
        self.grad[i][0] = scale * self.deltas[i].dot(self.hActs[i].T)
        self.grad[i][1] = scale * gp.sum(
            self.deltas[i], axis=1).reshape(-1, 1)

    return cost, self.grad
def costAndGrad(self, data, labels):
    """Run forward and backward propagation for one minibatch.

    ``data`` becomes the first entry of ``self.hActs``. Each ``(w, b)`` in
    ``self.stack`` is applied in turn, with ``self.activation`` on the
    first ``len(self.layerSizes)`` layers. The final layer goes through a
    softmax over axis 0 whose probabilities are floored at 1e-8.

    Returns ``(cost, None)`` when ``self.train`` is false, otherwise
    ``(cost, self.grad)`` with ``self.grad[i]`` holding the weight and
    bias gradients of layer ``i``, averaged over ``self.mbSize``.
    """
    # Forward propagation
    self.hActs[0] = data
    num_hidden = len(self.layerSizes)
    for i, (w, b) in enumerate(self.stack, 1):
        layer_out = w.dot(self.hActs[i - 1]) + b
        if i <= num_hidden:
            layer_out = self.activation(layer_out)
        self.hActs[i] = layer_out

    # Softmax, shifted by the column maximum before exponentiation.
    top_out = self.hActs[-1]
    probs = gp.exp(top_out - gp.max(top_out, axis=0))
    probs = probs / gp.sum(probs, axis=0)
    # Raise probabilities below 1e-8 to exactly 1e-8.
    probs += (probs < 1e-8) * (1e-8 - probs)

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * gp.sum(labelMat * gp.log(probs))

    if not self.train:
        return cost, None

    # Back propagation
    self.deltas[-1] = probs - labelMat
    i = num_hidden - 1
    for w, b in reversed(self.stack[1:]):
        slope = self.activation(self.hActs[i + 1], True)
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * slope
        i -= 1

    # Gradients, averaged over the minibatch.
    inv_batch = 1. / self.mbSize
    for i in range(len(self.grad)):
        delta = self.deltas[i]
        self.grad[i][0] = inv_batch * delta.dot(self.hActs[i].T)
        self.grad[i][1] = inv_batch * gp.sum(delta, axis=1).reshape(-1, 1)

    return cost, self.grad
def costAndGrad(self, data, labels):
    """Run forward and backward propagation for one minibatch (ReLU net).

    ``data`` becomes the first entry of ``self.hActs``. Each ``(w, b)`` in
    ``self.stack`` is applied in turn; the first ``len(self.layerSizes)``
    layers are rectified with ``0.5 * (z + sign(z) * z)``. The final layer
    goes through a softmax over axis 0.

    Returns ``(cost, None)`` when ``self.train`` is false, otherwise
    ``(cost, self.grad)`` with ``self.grad[i]`` holding the weight and
    bias gradients of layer ``i``, averaged over ``self.mbSize``.

    The softmax input is now shifted by subtracting the column maximum.
    The previous code added the column minimum, which leaves the softmax
    value unchanged but does not bound the exponent, so large logits could
    overflow ``exp``.
    """
    # Forward propagation
    self.hActs[0] = data
    num_hidden = len(self.layerSizes)
    for i, (w, b) in enumerate(self.stack, 1):
        layer_out = w.dot(self.hActs[i - 1]) + b
        if i <= num_hidden:
            # ReLU written as (z + |z|) / 2
            layer_out = (1 / 2.) * (layer_out + gp.sign(layer_out) * layer_out)
        self.hActs[i] = layer_out

    # Softmax; subtracting the max keeps every exponent <= 0.
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.exp(probs)
    probs = probs / gp.sum(probs, axis=0)

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * gp.sum(labelMat * gp.log(probs))

    if not self.train:
        return cost, None

    # Back propagation; sign() of the rectified activation is the 0/1 slope.
    self.deltas[-1] = probs - labelMat
    i = num_hidden - 1
    for w, b in reversed(self.stack[1:]):
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * gp.sign(
            self.hActs[i + 1])
        i -= 1

    # Gradients, averaged over the minibatch.
    scale = 1. / self.mbSize
    for i in range(len(self.grad)):
        self.grad[i][0] = scale * self.deltas[i].dot(self.hActs[i].T)
        self.grad[i][1] = scale * gp.sum(
            self.deltas[i], axis=1).reshape(-1, 1)

    return cost, self.grad
def mlpSoftmax1Layer_costfunc(x, *args): numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth = args numCases = shape(inputs)[1] num_weights_L1 = l1Size * (inputSize + 1) inputs = gpu.garray(inputs) theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1))) theta_softmax = gpu.garray( reshape(x[num_weights_L1:shape(x)[0]], (numClasses, l1Size))) inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0) hidden_sum_L1 = gpu.dot(theta_L1, inputs) #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp()) relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0) hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1 #hidden_activation_L1 = hidden_sum_L1.logistic() hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L1) hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0) predictions = hidden_sum_softmax.exp() predictions = predictions / gpu.sum(predictions, axis=0) temp = groundTruth * gpu.log(predictions) temp = temp.as_numpy_array() temp[temp == -inf] = -200.0 temp = nan_to_num(temp) regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]] regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1 cost = -1 * sum(temp) / numCases + 0.5 * lambda_hidden * ( gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum( theta_softmax * theta_softmax) print 'Multilayer Softmax Cost:', cost del inputs del theta_L1 del theta_softmax del hidden_sum_L1 del hidden_activation_L1 del hidden_sum_softmax del predictions del temp del regularized_penalty_L1 gpu.free_reuse_cache() return cost
def classify_ensemble(data, probs, batch_size, num_steps):
    """Return the perplexity of precomputed predictions over ``data``.

    Iterates ``reader.ptb_iterator(data, batch_size, num_steps)``. For
    batch number ``step`` the cost is ``-log(probs[step][0, y[0, 0]])``,
    and ``num_steps`` is added to the iteration count. Progress lines are
    printed along the way. The result is ``exp(total_cost / iterations)``.
    NOTE(review): only the first target of each batch is scored while the
    count advances by ``num_steps`` - confirm this is intended.

    Raises:
        ZeroDivisionError: if the iterator yields no batches.

    The progress interval ``epoch_size // 10`` used to be a bare modulus
    and raised ZeroDivisionError whenever ``epoch_size < 10``. Reporting
    is now skipped in that case.
    """
    epoch_size = ((len(data) // batch_size) - 1) // num_steps
    report_every = epoch_size // 10
    start_time = time.time()
    costs = 0.0
    iters = 0

    batches = reader.ptb_iterator(data, batch_size, num_steps)
    for step, (_, y) in enumerate(batches):
        costs += -1 * gpu.log(probs[step][0, y[0, 0]])
        iters += num_steps

        # report_every == 0 means the epoch is too short to report on.
        if report_every and step % report_every == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, gpu.exp(costs / iters),
                   iters * batch_size / (time.time() - start_time)))

    return gpu.exp(costs / iters)
def grad_costfunc_gpu_ReLU(x, *args):
    """Gradient of the softplus sparse-autoencoder cost w.r.t. ``x``.

    ``x`` is the flat parameter vector. ``args`` unpacks to
    ``(num_input, num_hidden, num_output, inputs, lambda_val,
    sparsityParam, beta)``. Returns a flat numpy vector with the gradient
    of the first weight matrix followed by that of the second. Both are
    averaged over the samples, with ``lambda_val`` weight decay on every
    column except the first.

    Fixes over the previous version:
    * The back-propagated signal has ``num_hidden + 1`` rows (one for the
      prepended row of ones) while the softplus derivative
      ``hidden_sum.logistic()`` has ``num_hidden``. Their elementwise
      product had mismatched shapes. The extra row, which was discarded
      afterwards anyway, is now dropped before the product.
    * Transposes use the garray ``.T`` attribute instead of copying to the
      host and back.
    """
    (num_input, num_hidden, num_output, inputs,
     lambda_val, sparsityParam, beta) = args
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output

    x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    weights1 = x[0:num_weights1].reshape((num_hidden, num_input + 1))
    weights2 = x[num_weights1:shape(x)[0]].reshape(
        (num_output, num_hidden + 1))
    nData = shape(inputs)[1]

    # Forward pass
    data = gpu.concatenate((gpu.ones((1, nData)), inputs), axis=0)
    hidden_sum = gpu.dot(weights1, data)
    hidden_activation = gpu.log(1 + hidden_sum.exp())
    p_avg = gpu.sum(hidden_activation, axis=1) / nData

    # Derivative of the KL sparsity term w.r.t. the mean activation, with a
    # leading 0 for the row of ones, repeated for every sample.
    p_avg_host = p_avg.as_numpy_array()
    grad_sparse = (-1 * sparsityParam / p_avg_host
                   + (1 - sparsityParam) / (1 - p_avg_host))
    grad_sparse = append(0, grad_sparse)
    grad_sparse = tile(grad_sparse, (nData, 1))
    grad_sparse = gpu.garray(transpose(grad_sparse))

    hidden_activation = gpu.concatenate(
        (gpu.ones((1, nData)), hidden_activation), axis=0)
    outputs = gpu.dot(weights2, hidden_activation)

    # Backward pass
    p = outputs - inputs
    weights2_grad = gpu.dot(p, hidden_activation.T)
    q_temp = gpu.dot(weights2.T, p) + beta * grad_sparse
    # Row 0 belongs to the row of ones and has no incoming weights.
    # d/dz log(1 + exp(z)) = logistic(z)
    q = q_temp[1:shape(q_temp)[0], :] * hidden_sum.logistic()
    weights1_grad = gpu.dot(q, data.T)

    weights1_grad = weights1_grad / nData
    weights2_grad = weights2_grad / nData

    # Weight decay on every column except the first.
    weights1_grad[:, 1:shape(weights1_grad)[1]] = (
        weights1_grad[:, 1:shape(weights1_grad)[1]]
        + weights1[:, 1:shape(weights1)[1]] * lambda_val)
    weights2_grad[:, 1:shape(weights2_grad)[1]] = (
        weights2_grad[:, 1:shape(weights2_grad)[1]]
        + weights2[:, 1:shape(weights2)[1]] * lambda_val)

    weights1_grad = weights1_grad.reshape(num_weights1)
    weights2_grad = weights2_grad.reshape(num_weights2)
    return hstack((weights1_grad.as_numpy_array(),
                   weights2_grad.as_numpy_array()))
def costAndGrad(self, data, labels):
    """Run forward and backward propagation for one minibatch (ReLU net).

    ``data`` becomes the first entry of ``self.hActs``. Each ``(w, b)`` in
    ``self.stack`` is applied in turn; the first ``len(self.layerSizes)``
    layers are rectified with ``0.5 * (z + sign(z) * z)``. The final layer
    goes through a softmax over axis 0.

    Returns ``(cost, None)`` when ``self.train`` is false, otherwise
    ``(cost, self.grad)`` with ``self.grad[i]`` holding the weight and
    bias gradients of layer ``i``, averaged over ``self.mbSize``.

    The softmax input is now shifted by subtracting the column maximum.
    The previous code added the column minimum, which leaves the softmax
    value unchanged but does not bound the exponent, so large logits could
    overflow ``exp``.
    """
    # Forward propagation
    self.hActs[0] = data
    num_hidden = len(self.layerSizes)
    for i, (w, b) in enumerate(self.stack, 1):
        layer_out = w.dot(self.hActs[i - 1]) + b
        if i <= num_hidden:
            # ReLU written as (z + |z|) / 2
            layer_out = (1 / 2.) * (layer_out + gp.sign(layer_out) * layer_out)
        self.hActs[i] = layer_out

    # Softmax; subtracting the max keeps every exponent <= 0.
    probs = self.hActs[-1] - gp.max(self.hActs[-1], axis=0)
    probs = gp.exp(probs)
    probs = probs / gp.sum(probs, axis=0)

    labelMat = np.zeros(probs.shape)
    labelMat[labels, range(self.mbSize)] = 1
    labelMat = gp.garray(labelMat)
    cost = -(1. / self.mbSize) * gp.sum(labelMat * gp.log(probs))

    if not self.train:
        return cost, None

    # Back propagation; sign() of the rectified activation is the 0/1 slope.
    self.deltas[-1] = probs - labelMat
    i = num_hidden - 1
    for w, b in reversed(self.stack[1:]):
        self.deltas[i] = w.T.dot(self.deltas[i + 1]) * gp.sign(
            self.hActs[i + 1])
        i -= 1

    # Gradients, averaged over the minibatch.
    scale = 1. / self.mbSize
    for i in range(len(self.grad)):
        self.grad[i][0] = scale * self.deltas[i].dot(self.hActs[i].T)
        self.grad[i][1] = scale * gp.sum(
            self.deltas[i], axis=1).reshape(-1, 1)

    return cost, self.grad
def init_using_dataset(self, vis_samples):
    """Set the base-rate RBM's visible biases from the given samples.

    ``self.base_bias_vis`` becomes the log-odds of the mean of
    ``vis_samples`` over axis 0. 1e-2 is added to both the numerator and
    the denominator of the odds.
    """
    epsilon = 1e-2
    vis_mean = gp.mean(vis_samples, axis=0)
    smoothed_on = vis_mean + epsilon
    smoothed_off = 1 - vis_mean + epsilon
    self.base_bias_vis = gp.log(smoothed_on / smoothed_off)
def invert_output(self, z):
    """Return the logit ``log(z / (1 - z))`` of ``z``."""
    odds = z / (1 - z)
    return gnp.log(odds)
def mlpSoftmax_costfunc(x, *args):
    # Cost and gradient of a three-hidden-layer rectifier network with a
    # softmax output layer, evaluated on the GPU.
    #
    # x is the flat parameter vector: theta_L1, theta_L2, theta_L3 (each with
    # a leading bias column) followed by theta_softmax (no bias column).
    # Returns (cost, flat_gradient) with the gradient laid out like x.
    # Side effects: prints accuracy and cost terms on every call.
    numClasses, inputSize, l1Size, l2Size, l3Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth, dropout_probability = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_L3 = l3Size * (l2Size + 1)
    num_weights_softmax = numClasses * l3Size
    #x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    # Unpack the flat vector on the host, then move each matrix to the GPU.
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1], (l1Size, inputSize + 1)))
    #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    #print numClasses, l2Size
    theta_L2 = gpu.garray(reshape(x[num_weights_L1:num_weights_L2+num_weights_L1], (l2Size, l1Size + 1)))
    #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1))
    theta_L3 = gpu.garray(reshape(x[num_weights_L2+num_weights_L1:num_weights_L2+num_weights_L1+num_weights_L3], (l3Size, l2Size + 1)))
    theta_softmax = gpu.garray(reshape(x[num_weights_L2+num_weights_L1+num_weights_L3:shape(x)[0]], (numClasses, l3Size)))
    #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size))
    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_L2_grad = gpu.zeros(shape(theta_L2))
    theta_L3_grad = gpu.zeros(shape(theta_L3))
    # Fresh random 0/1 masks on every call.  Presumably `bernoulli` is
    # scipy.stats.bernoulli, in which case dropout_probability is the
    # probability of KEEPING a unit -- TODO confirm.  The layer-1 and layer-2
    # masks also cover the bias row.
    dropout_l1 = gpu.garray(bernoulli.rvs(dropout_probability, size = (l1Size+1, numCases)))
    dropout_l2 = gpu.garray(bernoulli.rvs(dropout_probability, size = (l2Size+1, numCases)))
    dropout_l3 = gpu.garray(bernoulli.rvs(dropout_probability, size = (l3Size, numCases)))
    # Prepend a row of ones so column 0 of each theta acts as the bias.
    inputs = gpu.concatenate((gpu.ones((1,numCases)), inputs), axis = 0)
    # ---- layer 1: rectifier, then dropout ----
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1>0)
    hidden_activation_L1 = hidden_sum_L1*relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1
    #hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0)
    hidden_derivative_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_derivative_L1), axis=0)
    hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0) * dropout_l1
    # ---- layer 2: rectifier, then dropout ----
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    #hidden_activation_L2 = gpu.log(1+hidden_sum_L2.exp())
    relu_mask_hidden2 = gpu.ones(shape(hidden_sum_L2)) * (hidden_sum_L2>0)
    hidden_activation_L2 = hidden_sum_L2*relu_mask_hidden2
    hidden_derivative_L2 = relu_mask_hidden2
    #hidden_activation_L2 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L2), axis=0)
    hidden_derivative_L2 = gpu.concatenate((gpu.ones((1,numCases)), hidden_derivative_L2), axis=0)
    hidden_activation_L2 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L2), axis=0) * dropout_l2
    # ---- layer 3: rectifier, then dropout (no bias row: softmax has no bias) ----
    hidden_sum_L3 = gpu.dot(theta_L3, hidden_activation_L2)
    #hidden_activation_L3 = gpu.log(1+hidden_sum_L3.exp())
    relu_mask_hidden3 = gpu.ones(shape(hidden_sum_L3)) * (hidden_sum_L3>0)
    #hidden_activation_L3 = hidden_sum_L3*relu_mask_hidden3
    hidden_derivative_L3 = relu_mask_hidden3
    hidden_activation_L3 = hidden_sum_L3*relu_mask_hidden3 * dropout_l3
    #hidden_activation_L3 = hidden_sum_L3.logistic() * dropout_l3
    # ---- softmax output; the column max is subtracted before exp() ----
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L3)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis = 0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions,axis = 0)
    # The +1 suggests `labels` are 1-based class ids -- TODO confirm.
    pred = predictions.argmax(axis=0) + 1
    accuracy = mean(pred == labels) * 100
    temp = groundTruth*gpu.log(predictions)
    temp = temp.as_numpy_array()
    # log(0) gives -inf (and 0 * -inf gives nan); clamp so the cost stays finite.
    temp[temp==-inf] = -200.0
    temp = nan_to_num(temp)
    # Weight decay skips the bias column of each hidden layer.
    regularized_penalty_L1 = theta_L1[:,1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:,1:shape(theta_L2)[1]]
    regularized_penalty_L3 = theta_L3[:,1:shape(theta_L3)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2
    regularized_penalty_L3 = regularized_penalty_L3 * regularized_penalty_L3
    pred_cost = -1*sum(temp)/numCases
    l2norm_cost = 0.5 * lambda_hidden*(gpu.sum(regularized_penalty_L3) + gpu.sum(regularized_penalty_L2) + gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum(theta_softmax*theta_softmax)
    #l2norm_cost = 0
    cost = pred_cost + l2norm_cost
    print 'Prediction Accuracy: ', accuracy, '%'
    print 'Multilayer Softmax Prediction Cost: ', pred_cost
    print 'Multilayer Softmax L2 Normalisation Cost: ', l2norm_cost
    print 'Multilayer Softmax Cost: ', cost
    print '--------------------------------------------------------------------'
    # ---- backward pass ----
    # Transposes go through numpy (device -> host -> device).
    softmax_imd = groundTruth - predictions
    #theta_softmax_grad = -1*gpu.dot(softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array())))/numCases
    theta_softmax_grad = -1*gpu.dot(softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array())))/numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd
    delta_L3_imd = gpu.dot(gpu.garray(transpose(theta_softmax.as_numpy_array())), deltaOut)
    # NOTE(review): the derivatives used below are the rectifier masks only;
    # the dropout masks applied in the forward pass are not applied here, so
    # dropped units still receive gradient -- confirm whether intended.
    delta_L3_imd2 = delta_L3_imd*hidden_derivative_L3
    #delta_L3_imd2 = (delta_L3_imd * hidden_activation_L3) * (1-hidden_activation_L3)
    delta_L3 = gpu.dot(delta_L3_imd2, gpu.garray(transpose(hidden_activation_L2.as_numpy_array())))
    theta_L3_grad += delta_L3
    delta_L2_imd = gpu.dot(gpu.garray(transpose(theta_L3.as_numpy_array())), delta_L3_imd2)
    delta_L2_imd2 = delta_L2_imd*hidden_derivative_L2
    # Drop row 0 (the bias unit); the slice end is one past the last row.
    delta_L2_imd2 = delta_L2_imd2[1:shape(delta_L2_imd2)[0]+1, :]
    delta_L2 = gpu.dot(delta_L2_imd2, gpu.garray(transpose(hidden_activation_L1.as_numpy_array())))
    theta_L2_grad += delta_L2
    delta_L1_imd = gpu.dot(gpu.garray(transpose(theta_L2.as_numpy_array())), delta_L2_imd2)
    delta_L1_imd2 = delta_L1_imd*hidden_derivative_L1
    delta_L1_imd2 = delta_L1_imd2[1:shape(delta_L1_imd2)[0]+1, :]
    delta_L1 = gpu.dot(delta_L1_imd2, gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta_L1
    theta_L1_grad = theta_L1_grad/numCases
    theta_L2_grad = theta_L2_grad/numCases
    theta_L3_grad = theta_L3_grad/numCases
    # Weight-decay gradient, again skipping the bias column.
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] + theta_L1[:, 1: shape(theta_L1)[1]] * lambda_hidden
    theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] = theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] + theta_L2[:, 1: shape(theta_L2)[1]] * lambda_hidden
    theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] = theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] + theta_L3[:, 1: shape(theta_L3)[1]] * lambda_hidden
    # Flatten on the host so the result can be concatenated with hstack.
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_L2_grad = reshape(theta_L2_grad.as_numpy_array(), num_weights_L2)
    theta_L3_grad = reshape(theta_L3_grad.as_numpy_array(), num_weights_L3)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(), num_weights_softmax)
    # Drop references to the GPU intermediates before freeing the reuse cache.
    del inputs
    del theta_L1
    del theta_L2
    del theta_L3
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_activation_L3
    del hidden_sum_L3
    del hidden_sum_softmax
    del predictions
    del temp
    del softmax_imd
    del deltaOut
    del delta_L3_imd
    del delta_L3_imd2
    del delta_L3
    del delta_L2_imd
    del delta_L2_imd2
    del delta_L2
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    #del regularized_penalty_L1
    #del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost, hstack((theta_L1_grad,theta_L2_grad,theta_L3_grad,theta_softmax_grad))
def score_softmax(y_target, y_predicted):
    """Return ``sum(y_target * log(y_predicted + eps))``.

    Both arguments must be of the same type.  gnumpy arrays are handled
    with gnumpy and ``eps = 1e-30``; anything else goes through numpy with
    ``eps = 1e-300``.
    """
    assert(type(y_target) == type(y_predicted))
    if type(y_target) is not g.garray:
        host_log = np.log(y_predicted + 1e-300)
        return np.sum(y_target * host_log)
    device_log = g.log(y_predicted + 1e-30)
    return g.sum(y_target * device_log)
def invert_output(self, z):
    """Return ``0.5 * log((1 + z) / (1 - z))``, the inverse hyperbolic tangent."""
    ratio = (1+z) / (1-z)
    return 0.5 * gnp.log(ratio)
def mlpSoftmax_costfunc(x, *args):
    # Cost and gradient of a three-hidden-layer rectifier network with a
    # softmax output layer, evaluated on the GPU.
    #
    # x is the flat parameter vector: theta_L1, theta_L2, theta_L3 (each with
    # a leading bias column) followed by theta_softmax (no bias column).
    # Returns (cost, flat_gradient) with the gradient laid out like x.
    # Side effects: prints accuracy and cost terms on every call.
    numClasses, inputSize, l1Size, l2Size, l3Size, lambda_softmax, lambda_hidden, inputs, labels, groundTruth, dropout_probability = args
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_L3 = l3Size * (l2Size + 1)
    num_weights_softmax = numClasses * l3Size
    #x = gpu.garray(x)
    inputs = gpu.garray(inputs)
    # Unpack the flat vector on the host, then move each matrix to the GPU.
    theta_L1 = gpu.garray(reshape(x[0:num_weights_L1],
                                  (l1Size, inputSize + 1)))
    #theta_L1 = x[0:num_weights_L1].reshape((l1Size, inputSize + 1))
    #print numClasses, l2Size
    theta_L2 = gpu.garray(
        reshape(x[num_weights_L1:num_weights_L2 + num_weights_L1],
                (l2Size, l1Size + 1)))
    #theta_L2 = x[num_weights_L1:num_weights_L2+num_weights_L1].reshape((l2Size, l1Size + 1))
    theta_L3 = gpu.garray(
        reshape(
            x[num_weights_L2 + num_weights_L1:num_weights_L2 +
              num_weights_L1 + num_weights_L3], (l3Size, l2Size + 1)))
    theta_softmax = gpu.garray(
        reshape(
            x[num_weights_L2 + num_weights_L1 + num_weights_L3:shape(x)[0]],
            (numClasses, l3Size)))
    #theta_softmax = x[num_weights_L2+num_weights_L1:shape(x)[0]].reshape((numClasses, l2Size))
    theta_L1_grad = gpu.zeros(shape(theta_L1))
    theta_L2_grad = gpu.zeros(shape(theta_L2))
    theta_L3_grad = gpu.zeros(shape(theta_L3))
    # Fresh random 0/1 masks on every call.  Presumably `bernoulli` is
    # scipy.stats.bernoulli, in which case dropout_probability is the
    # probability of KEEPING a unit -- TODO confirm.  The layer-1 and layer-2
    # masks also cover the bias row.
    dropout_l1 = gpu.garray(
        bernoulli.rvs(dropout_probability, size=(l1Size + 1, numCases)))
    dropout_l2 = gpu.garray(
        bernoulli.rvs(dropout_probability, size=(l2Size + 1, numCases)))
    dropout_l3 = gpu.garray(
        bernoulli.rvs(dropout_probability, size=(l3Size, numCases)))
    # Prepend a row of ones so column 0 of each theta acts as the bias.
    inputs = gpu.concatenate((gpu.ones((1, numCases)), inputs), axis=0)
    # ---- layer 1: rectifier, then dropout ----
    hidden_sum_L1 = gpu.dot(theta_L1, inputs)
    #hidden_activation_L1 = gpu.log(1+hidden_sum_L1.exp())
    relu_mask_hidden1 = gpu.ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_derivative_L1 = relu_mask_hidden1
    #hidden_activation_L1 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L1), axis=0)
    hidden_derivative_L1 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_derivative_L1), axis=0)
    hidden_activation_L1 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L1), axis=0) * dropout_l1
    # ---- layer 2: rectifier, then dropout ----
    hidden_sum_L2 = gpu.dot(theta_L2, hidden_activation_L1)
    #hidden_activation_L2 = gpu.log(1+hidden_sum_L2.exp())
    relu_mask_hidden2 = gpu.ones(shape(hidden_sum_L2)) * (hidden_sum_L2 > 0)
    hidden_activation_L2 = hidden_sum_L2 * relu_mask_hidden2
    hidden_derivative_L2 = relu_mask_hidden2
    #hidden_activation_L2 = gpu.concatenate((gpu.ones((1,numCases)), hidden_activation_L2), axis=0)
    hidden_derivative_L2 = gpu.concatenate((gpu.ones(
        (1, numCases)), hidden_derivative_L2), axis=0)
    hidden_activation_L2 = gpu.concatenate(
        (gpu.ones((1, numCases)), hidden_activation_L2), axis=0) * dropout_l2
    # ---- layer 3: rectifier, then dropout (no bias row: softmax has no bias) ----
    hidden_sum_L3 = gpu.dot(theta_L3, hidden_activation_L2)
    #hidden_activation_L3 = gpu.log(1+hidden_sum_L3.exp())
    relu_mask_hidden3 = gpu.ones(shape(hidden_sum_L3)) * (hidden_sum_L3 > 0)
    #hidden_activation_L3 = hidden_sum_L3*relu_mask_hidden3
    hidden_derivative_L3 = relu_mask_hidden3
    hidden_activation_L3 = hidden_sum_L3 * relu_mask_hidden3 * dropout_l3
    #hidden_activation_L3 = hidden_sum_L3.logistic() * dropout_l3
    # ---- softmax output; the column max is subtracted before exp() ----
    hidden_sum_softmax = gpu.dot(theta_softmax, hidden_activation_L3)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = hidden_sum_softmax.exp()
    predictions = predictions / gpu.sum(predictions, axis=0)
    # The +1 suggests `labels` are 1-based class ids -- TODO confirm.
    pred = predictions.argmax(axis=0) + 1
    accuracy = mean(pred == labels) * 100
    temp = groundTruth * gpu.log(predictions)
    temp = temp.as_numpy_array()
    # log(0) gives -inf (and 0 * -inf gives nan); clamp so the cost stays finite.
    temp[temp == -inf] = -200.0
    temp = nan_to_num(temp)
    # Weight decay skips the bias column of each hidden layer.
    regularized_penalty_L1 = theta_L1[:, 1:shape(theta_L1)[1]]
    regularized_penalty_L2 = theta_L2[:, 1:shape(theta_L2)[1]]
    regularized_penalty_L3 = theta_L3[:, 1:shape(theta_L3)[1]]
    regularized_penalty_L1 = regularized_penalty_L1 * regularized_penalty_L1
    regularized_penalty_L2 = regularized_penalty_L2 * regularized_penalty_L2
    regularized_penalty_L3 = regularized_penalty_L3 * regularized_penalty_L3
    pred_cost = -1 * sum(temp) / numCases
    l2norm_cost = 0.5 * lambda_hidden * (
        gpu.sum(regularized_penalty_L3) + gpu.sum(regularized_penalty_L2) +
        gpu.sum(regularized_penalty_L1)) + 0.5 * lambda_softmax * gpu.sum(
            theta_softmax * theta_softmax)
    #l2norm_cost = 0
    cost = pred_cost + l2norm_cost
    print 'Prediction Accuracy: ', accuracy, '%'
    print 'Multilayer Softmax Prediction Cost: ', pred_cost
    print 'Multilayer Softmax L2 Normalisation Cost: ', l2norm_cost
    print 'Multilayer Softmax Cost: ', cost
    print '--------------------------------------------------------------------'
    # ---- backward pass ----
    # Transposes go through numpy (device -> host -> device).
    softmax_imd = groundTruth - predictions
    #theta_softmax_grad = -1*gpu.dot(softmax_imd, gpu.garray(transpose(hidden_activation_L3.as_numpy_array())))/numCases
    theta_softmax_grad = -1 * gpu.dot(
        softmax_imd,
        gpu.garray(transpose(hidden_activation_L3.as_numpy_array()))
    ) / numCases + lambda_softmax * theta_softmax
    deltaOut = -softmax_imd
    delta_L3_imd = gpu.dot(
        gpu.garray(transpose(theta_softmax.as_numpy_array())), deltaOut)
    # NOTE(review): the derivatives used below are the rectifier masks only;
    # the dropout masks applied in the forward pass are not applied here, so
    # dropped units still receive gradient -- confirm whether intended.
    delta_L3_imd2 = delta_L3_imd * hidden_derivative_L3
    #delta_L3_imd2 = (delta_L3_imd * hidden_activation_L3) * (1-hidden_activation_L3)
    delta_L3 = gpu.dot(
        delta_L3_imd2,
        gpu.garray(transpose(hidden_activation_L2.as_numpy_array())))
    theta_L3_grad += delta_L3
    delta_L2_imd = gpu.dot(gpu.garray(transpose(theta_L3.as_numpy_array())),
                           delta_L3_imd2)
    delta_L2_imd2 = delta_L2_imd * hidden_derivative_L2
    # Drop row 0 (the bias unit); the slice end is one past the last row.
    delta_L2_imd2 = delta_L2_imd2[1:shape(delta_L2_imd2)[0] + 1, :]
    delta_L2 = gpu.dot(
        delta_L2_imd2,
        gpu.garray(transpose(hidden_activation_L1.as_numpy_array())))
    theta_L2_grad += delta_L2
    delta_L1_imd = gpu.dot(gpu.garray(transpose(theta_L2.as_numpy_array())),
                           delta_L2_imd2)
    delta_L1_imd2 = delta_L1_imd * hidden_derivative_L1
    delta_L1_imd2 = delta_L1_imd2[1:shape(delta_L1_imd2)[0] + 1, :]
    delta_L1 = gpu.dot(delta_L1_imd2,
                       gpu.garray(transpose(inputs.as_numpy_array())))
    theta_L1_grad += delta_L1
    theta_L1_grad = theta_L1_grad / numCases
    theta_L2_grad = theta_L2_grad / numCases
    theta_L3_grad = theta_L3_grad / numCases
    # Weight-decay gradient, again skipping the bias column.
    theta_L1_grad[:, 1:shape(theta_L1_grad)[1]] = theta_L1_grad[:, 1:shape(
        theta_L1_grad)[1]] + theta_L1[:, 1:shape(theta_L1)[1]] * lambda_hidden
    theta_L2_grad[:, 1:shape(theta_L2_grad)[1]] = theta_L2_grad[:, 1:shape(
        theta_L2_grad)[1]] + theta_L2[:, 1:shape(theta_L2)[1]] * lambda_hidden
    theta_L3_grad[:, 1:shape(theta_L3_grad)[1]] = theta_L3_grad[:, 1:shape(
        theta_L3_grad)[1]] + theta_L3[:, 1:shape(theta_L3)[1]] * lambda_hidden
    # Flatten on the host so the result can be concatenated with hstack.
    theta_L1_grad = reshape(theta_L1_grad.as_numpy_array(), num_weights_L1)
    theta_L2_grad = reshape(theta_L2_grad.as_numpy_array(), num_weights_L2)
    theta_L3_grad = reshape(theta_L3_grad.as_numpy_array(), num_weights_L3)
    theta_softmax_grad = reshape(theta_softmax_grad.as_numpy_array(),
                                 num_weights_softmax)
    # Drop references to the GPU intermediates before freeing the reuse cache.
    del inputs
    del theta_L1
    del theta_L2
    del theta_L3
    del theta_softmax
    del hidden_sum_L1
    del hidden_activation_L1
    del hidden_sum_L2
    del hidden_activation_L2
    del hidden_activation_L3
    del hidden_sum_L3
    del hidden_sum_softmax
    del predictions
    del temp
    del softmax_imd
    del deltaOut
    del delta_L3_imd
    del delta_L3_imd2
    del delta_L3
    del delta_L2_imd
    del delta_L2_imd2
    del delta_L2
    del delta_L1_imd
    del delta_L1_imd2
    del delta_L1
    #del regularized_penalty_L1
    #del regularized_penalty_L2
    gpu.free_reuse_cache()
    return cost, hstack(
        (theta_L1_grad, theta_L2_grad, theta_L3_grad, theta_softmax_grad))
def log_exp_sum(x, axis=1):
    """Return ``log(sum(exp(x), axis=axis))`` computed stably.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large entries do not overflow.

    Args:
        x: numpy array or gnumpy garray.
        axis: axis to reduce over (default 1).

    Returns:
        A numpy array with ``axis`` removed (gnumpy results are converted
        with ``asarray()``).
    """
    x_max = x.max(axis=axis)

    # Re-insert the reduced axis with length 1 so the maximum broadcasts
    # against x for any `axis`.  The previous `x_max[:, newaxis]` was only
    # correct when reducing over axis 1.
    keep_shape = list(x.shape)
    keep_shape[axis] = 1
    shifted = x - x_max.reshape(tuple(keep_shape))

    # Test for a plain ndarray first so host arrays never touch gnumpy.
    on_gpu = not isinstance(x, np.ndarray) and isinstance(x, gnp.garray)
    if on_gpu:
        return (x_max + gnp.log(gnp.exp(shifted).sum(axis=axis))).asarray()
    return x_max + np.log(np.exp(shifted).sum(axis=axis))
def unigram_partition(data_path, num_ensembles, model_name, method = 'none', train = True, random_training_order = False, reverse_order = False):
    """Train (or restore) an ensemble of PTB models partitioned by first word.

    Model 1 is trained on all sentences.  Each later model is trained on
    the sentences whose starting word currently carries the highest average
    boosting weight, until half of the sentences are included.  Per-model
    probabilities and the word-id -> model mapping are written below
    ``simple-examples/ckpt/<run folder>/`` and the ensemble is then scored
    with ``evaluate_unigram_partition``.

    Args:
        data_path: directory passed to ``reader.ptb_raw_data``.
        num_ensembles: number of models to train/restore.
        model_name: value assigned to ``FLAGS.model``; also part of the
            output folder name.
        method: ``'stddev'`` or ``'sqrt'`` to post-process the case
            weights; any other value uses them unchanged.
        train: train each model when true, otherwise restore its latest
            checkpoint.
        random_training_order: shuffle the training sentences first.
        reverse_order: reverse all three data sets and key sentences by
            their last token instead of the first.
    """
    algo_name = 'unigram' + '_' + 'partition'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _, word_to_id = raw_data
    if reverse_order:
        train_data.reverse()
        valid_data.reverse()
        test_data.reverse()
        algo_name = 'reverse ' + algo_name
    eos_id = word_to_id['<eos>']

    case_weight_length = len(train_data)-1
    train_case_weights = np.repeat(1.0/case_weight_length, case_weight_length).tolist()
    train_sentence_list = reader.get_sentence_list(train_data, eos_id, reverse_order)
    if random_training_order:
        # list() so the permutation can be shuffled in place.
        perm = list(range(len(train_sentence_list)))
        np.random.shuffle(perm)
        train_data = []
        for idx in perm:
            train_data += train_sentence_list[idx]
        train_sentence_list = reader.get_sentence_list(train_data, eos_id, reverse_order)
    num_sent = len(train_sentence_list)
    train_sentence_weights = np.repeat(1.0/num_sent, num_sent).tolist()
    new_train_data = train_data

    FLAGS.model = model_name
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    alpha_t_list = []

    # Group sentence indices by their key token (first token, or last token
    # when the data is reversed).
    id_to_sentence_num_dict = {}
    for i in range(num_sent):
        if reverse_order:
            desired_id = train_sentence_list[i][-1]
        else:
            desired_id = train_sentence_list[i][0]
        if desired_id in id_to_sentence_num_dict:
            id_to_sentence_num_dict[desired_id].append(i)
        else:
            id_to_sentence_num_dict[desired_id] = [i]
    # Every key token is covered by model 1.
    id_to_model = dict((start_id, [1]) for start_id in id_to_sentence_num_dict)
    id_to_weight = {}

    # All output of this run lives below run_dir.
    if random_training_order:
        run_folder = 'random training order ' + algo_name + '_' + model_name
    else:
        run_folder = algo_name + '_' + model_name
    run_dir = 'simple-examples/ckpt/' + run_folder + '/'
    # Previously this path always used the 'random training order ' folder,
    # which does not exist for runs without random ordering.
    alpha_path = run_dir + 'alpha_t.out'

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.5)
    for iii in range(num_ensembles):
        with tf.Graph().as_default(), tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as session:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
                m = PTBModel(is_training=True, config=config)
                sess.close()
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config)
                mtest = PTBModel(is_training=False, config=eval_config)
                m2 = PTBModel(is_training=False, config=eval_config)
            tf.initialize_all_variables().run()
            saver = tf.train.Saver()

            checkpoint_dir = run_dir + 'ensemble' + str(iii + 1) + '/'
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            if iii > 0:
                # Average sentence weight per key token, heaviest first.
                for k,v in id_to_sentence_num_dict.items():
                    total_wt = 0
                    num_sentences = len(v)
                    for sent_idx in v:
                        total_wt += train_sentence_weights[sent_idx]
                    id_to_weight[k] = total_wt/num_sentences
                sorted_id_to_weight = sorted(id_to_weight.items(), key = operator.itemgetter(1), reverse = True)
                np.savetxt(checkpoint_dir + 'sorted_id_to_weight', sorted_id_to_weight, delimiter = ',')

                # Add whole key-token groups until half the sentences are in.
                new_train_data = []
                i = 0
                sent_included = 0
                while sent_included < np.floor(num_sent/2):
                    start_key = sorted_id_to_weight[i][0]
                    sentence_additions = id_to_sentence_num_dict[start_key]
                    for idx in sentence_additions:
                        new_train_data += train_sentence_list[idx]
                        sent_included += 1
                    id_to_model[start_key].append(iii + 1)
                    i += 1

            if train:
                np.savetxt(checkpoint_dir + 'train_case_weights.out', train_case_weights, delimiter = ',')
                np.savetxt(checkpoint_dir + 'train_sentence_weights.out', train_sentence_weights, delimiter = ',')
                for i in range(config.max_max_epoch):
                    lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                    train_perplexity = run_epoch(session, m, new_train_data, m.train_op, verbose=False)
                    print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
                    valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
                    print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
                    if (i+1) % 5 == 0 or (i+1) == config.max_max_epoch:
                        saver.save(session, checkpoint_dir + 'model.ckpt', global_step = i+1)
            else:
                ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(session, ckpt.model_checkpoint_path)

            if train:
                # Boosting update of the per-token and per-sentence weights.
                case_scores = output_training_set_error_for_boosting(session, m2, train_data, tf.no_op())
                score = sum(case_scores)
                norm = len(case_scores) * -1 * gpu.log(float(1.0) / eval_config.vocab_size)
                epsilon_t = (1 - (norm - score) / norm) / 2.0
                alpha_t = 0.5 * gpu.log((1 - epsilon_t)/ epsilon_t)
                alpha_t_list.append(alpha_t)

                # Start a fresh alpha file for this run.  alpha_t.out is a
                # regular file, so shutil.rmtree (directories only) never
                # actually removed it.
                if iii == 0 and os.path.exists(alpha_path):
                    os.remove(alpha_path)
                with open(alpha_path, 'ab') as f:
                    f.write(str(alpha_t))
                    f.write(',')

                train_case_weights = gpu.sqrt((1 - epsilon_t)/ epsilon_t) * np.multiply(train_case_weights, np.asarray(case_scores))
                train_case_weights = np.ravel(normalize(np.asarray(train_case_weights).reshape(1,-1), norm = 'l1'))
                if method == 'stddev':
                    new_train_case_weights = reject_outliers(train_case_weights)
                elif method == 'sqrt':
                    new_train_case_weights = sqrt_norm(train_case_weights)
                else:
                    new_train_case_weights = train_case_weights

                # Sentence weight = mean of its finite token weights.
                start_idx = 0
                for i, sentence in enumerate(train_sentence_list):
                    end_idx = start_idx + len(sentence)
                    finite_weights = [w for w in new_train_case_weights[start_idx:end_idx] if np.isfinite(w)]
                    if len(finite_weights) == 0:
                        # Was `this_sentence_weights[i] = 0`, an undefined
                        # name that raised NameError.
                        train_sentence_weights[i] = 0
                    else:
                        train_sentence_weights[i] = np.mean(finite_weights)
                    start_idx = end_idx
                train_sentence_weights = np.ravel(normalize(np.asarray(train_sentence_weights).reshape(1,-1), norm = 'l1'))

                for k,v in id_to_sentence_num_dict.items():
                    total_wt = 0
                    num_sentences = len(v)
                    for sent_idx in v:
                        total_wt += train_sentence_weights[sent_idx]
                    id_to_weight[k] = total_wt/num_sentences

            test_set_probs = output_test_set_probs(session, mtest, test_data, tf.no_op(), partition = True)
            test_set_probs_no_alpha = test_set_probs
            np.savetxt(checkpoint_dir + 'test_set_probs_no_alpha.out', np.squeeze(test_set_probs_no_alpha), delimiter = ',')
            train_set_probs = output_test_set_probs(session, m2, train_data, tf.no_op(), partition = True)
            train_set_probs_no_alpha = train_set_probs
            np.savetxt(checkpoint_dir + 'train_set_probs_no_alpha.out', np.squeeze(train_set_probs_no_alpha), delimiter = ',')
            test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
            print("Test Perplexity: %.3f" % test_perplexity)

    # NOTE(review): ','.join(str(list)) puts a comma between every CHARACTER
    # of the list's repr, so a model index >= 10 is split into separate
    # digits.  Left unchanged because it is the on-disk format.
    with open(run_dir + 'id_to_model.out', 'w') as f:
        for k,v in id_to_model.items():
            f.write(str(k) + ',' + ','.join(str(id_to_model[k])) + '\n')
    with open(run_dir + 'id_to_sent_num.out', 'w') as ff:
        for k,v in id_to_sentence_num_dict.items():
            ff.write(str(k) + ',' + ','.join(str(id_to_sentence_num_dict[k])) + '\n')

    # Evaluate from the folder this run wrote to, not a hard-coded one.
    print('Test PPL: ' + str(evaluate_unigram_partition(data = test_data, batch_size = 1, num_steps = 1, num_ensembles = num_ensembles, eos_id = eos_id, fp = run_dir, probs_fn = 'test_set_probs_no_alpha.out')))
    print('Train PPL: ' + str(evaluate_unigram_partition(data = train_data, batch_size = 1, num_steps = 1, num_ensembles = num_ensembles, eos_id = eos_id, fp = run_dir, probs_fn = 'train_set_probs_no_alpha.out')))
def invert_output(self, z):
    """Return ``0.5 * log((1 + z) / (1 - z))``, the inverse hyperbolic tangent."""
    numerator = 1 + z
    denominator = 1 - z
    return 0.5 * gnp.log(numerator / denominator)
def safe_log(x):
    """Return ``log(x + _SMALL_CONSTANT)`` so that zeros do not produce -inf."""
    offset_input = x + _SMALL_CONSTANT
    return gnp.log(offset_input)
def ABISS(data_path, num_ensembles, model_name, method = 'stddev', train = True, random_training_order = False):
    """Train (or restore) a boosted ensemble of PTB models and print its perplexity.

    After each model, per-token scores are turned into boosting weights,
    averaged into sentence weights, and the next training set is drawn with
    ``reader.weighted_sentence_selection``.  Test-set outputs are combined
    as an ``alpha_t``-weighted average and scored with ``classify_ensemble``.

    Args:
        data_path: directory passed to ``reader.ptb_raw_data``.
        num_ensembles: number of models to train/restore.
        model_name: value assigned to ``FLAGS.model``; also part of the
            output folder name.
        method: ``'stddev'`` or ``'sqrt'`` to post-process the case
            weights; any other value uses them unchanged.
        train: train each model when true, otherwise restore its latest
            checkpoint.
        random_training_order: shuffle the training sentences first.
    """
    algo_name = method + ' ' + 'ABISS'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _, word_to_id = raw_data
    eos_id = word_to_id['<eos>']

    case_weight_length = len(train_data)-1
    train_case_weights = np.repeat(1.0/case_weight_length, case_weight_length).tolist()
    train_sentence_list = reader.get_sentence_list(train_data, eos_id)
    if random_training_order:
        # list() so the permutation can be shuffled in place.
        perm = list(range(len(train_sentence_list)))
        np.random.shuffle(perm)
        train_data = []
        for idx in perm:
            train_data += train_sentence_list[idx]
        train_sentence_list = reader.get_sentence_list(train_data, eos_id)
    num_sent = len(train_sentence_list)
    train_sentence_weights = np.repeat(1.0/num_sent, num_sent).tolist()
    new_train_data = reader.weighted_sentence_selection(train_sentence_list, train_sentence_weights, random_training_order)

    FLAGS.model = model_name
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    alpha_t_list = []
    # One accumulator per predicted test token.
    full_test_set_logits = []
    for i in range(len(test_data)-1):
        full_test_set_logits.append(np.zeros((1,eval_config.vocab_size)))

    alpha_path = 'simple-examples/ckpt/' + algo_name + '_' + model_name + '/' + 'alpha_t.out'

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction = 0.33)
    for iii in range(num_ensembles):
        with tf.Graph().as_default(), tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as session:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                sess = tf.Session(config = tf.ConfigProto(gpu_options = gpu_options))
                m = PTBModel(is_training=True, config=config)
                sess.close()
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config)
                mtest = PTBModel(is_training=False, config=eval_config)
                m2 = PTBModel(is_training=False, config=eval_config)
            tf.initialize_all_variables().run()
            saver = tf.train.Saver()

            if random_training_order:
                new_folder = 'random training order' + algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
            else:
                new_folder = algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
            checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)

            if train:
                np.savetxt(checkpoint_dir + 'train_case_weights.out', train_case_weights, delimiter = ',')
                np.savetxt(checkpoint_dir + 'train_sentence_weights.out', train_sentence_weights, delimiter = ',')
                for i in range(config.max_max_epoch):
                    lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                    train_perplexity = run_epoch(session, m, new_train_data, m.train_op, verbose=False)
                    print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
                    valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
                    print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
                    if (i+1) % 5 == 0 or (i+1) == config.max_max_epoch:
                        saver.save(session, checkpoint_dir + 'model.ckpt', global_step = i+1)
            else:
                ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(session, ckpt.model_checkpoint_path)

            # Boosting update; runs in both modes because alpha_t is needed
            # to weight this model's test-set output below.
            case_scores = output_training_set_error_for_boosting(session, m2, train_data, tf.no_op())
            score = sum(case_scores)
            norm = len(case_scores) * -1 * gpu.log(float(1.0) / eval_config.vocab_size)
            epsilon_t = (1 - (norm - score) / norm) / 2.0
            alpha_t = 0.5 * gpu.log((1 - epsilon_t)/ epsilon_t)
            alpha_t_list.append(alpha_t)

            # Start a fresh alpha file for this run.  alpha_t.out is a
            # regular file, so shutil.rmtree (directories only) never
            # actually removed it.
            if iii == 0 and os.path.exists(alpha_path):
                os.remove(alpha_path)
            with open(alpha_path, 'ab') as f:
                f.write(str(alpha_t))
                f.write(',')

            train_case_weights = gpu.sqrt((1 - epsilon_t)/ epsilon_t) * np.multiply(train_case_weights, np.asarray(case_scores))
            train_case_weights = np.ravel(normalize(np.asarray(train_case_weights).reshape(1,-1), norm = 'l1'))
            if method == 'stddev':
                new_train_case_weights = reject_outliers(train_case_weights)
            elif method == 'sqrt':
                new_train_case_weights = sqrt_norm(train_case_weights)
            else:
                # Without this fallback any other `method` left the name
                # undefined and the loop below raised NameError.
                new_train_case_weights = train_case_weights

            # Sentence weight = mean of its finite token weights.
            start_idx = 0
            for i, sentence in enumerate(train_sentence_list):
                end_idx = start_idx + len(sentence)
                finite_weights = [w for w in new_train_case_weights[start_idx:end_idx] if np.isfinite(w)]
                if len(finite_weights) == 0:
                    # Was `this_sentence_weights[i] = 0`, an undefined name
                    # that raised NameError.
                    train_sentence_weights[i] = 0
                else:
                    train_sentence_weights[i] = np.mean(finite_weights)
                start_idx = end_idx
            train_sentence_weights = np.ravel(normalize(np.asarray(train_sentence_weights).reshape(1,-1), norm = 'l1'))
            new_train_data = reader.weighted_sentence_selection(train_sentence_list, train_sentence_weights, random_training_order)

            # Accumulate this model's alpha-weighted test-set output.
            test_set_probs = output_test_set_probs(session, mtest, test_data, tf.no_op())
            for i in range(len(test_set_probs)):
                test_set_probs[i] = test_set_probs[i] * alpha_t
                full_test_set_logits[i] += test_set_probs[i]
            test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
            print("Test Perplexity: %.3f" % test_perplexity)

    # Normalise by the total alpha so the combination is a weighted average.
    alpha_t_sum = np.sum(alpha_t_list)
    for i in range(len(full_test_set_logits)):
        full_test_set_logits[i] = full_test_set_logits[i] / alpha_t_sum
    ensemble_perplexity = classify_ensemble(test_data, full_test_set_logits, 1, 1)
    print(ensemble_perplexity)
def evaluate_unigram_partition(data, batch_size, num_steps, num_ensembles, eos_id, fp = 'simple-examples/ckpt/random training order unigram_partition_small/', probs_fn = 'test_set_probs_no_alpha.out'):
    """Compute the perplexity of a first-word-partitioned ensemble.

    Reads ``<fp>ensemble<k>/<probs_fn>`` for ``k = 1..num_ensembles`` and
    ``<fp>id_to_model.out``.  At each step the probabilities of the models
    selected for the current sentence are averaged and the negative log of
    that average is added to the running cost.

    Args:
        data: token ids, iterated with ``reader.ptb_iterator``.
        batch_size, num_steps: iterator settings.
        num_ensembles: number of per-model probability files to read.
        eos_id: id of the end-of-sentence token.
        fp: run folder, with trailing slash.
        probs_fn: name of the probability file inside each ensemble folder.

    Returns:
        ``exp(total cost / number of steps)``.
    """
    epoch_size = ((len(data) // batch_size) - 1) // num_steps
    start_time = time.time()
    costs = 0.0
    iters = 0

    full_probs = []
    for i in range(num_ensembles):
        print(i)
        full_probs.append(np.asarray(pd.read_csv(fp + 'ensemble' + str(i+1) + '/' + probs_fn, delimiter = ',', header = None)))
        print(np.shape(full_probs[i]))
    print('reading in probs done')

    # Each row is "<word id>,<model>,<model>,..."; bracket, blank and space
    # fields produced by the writer are skipped.
    id_to_model = {}
    with open(fp + 'id_to_model.out', 'rb') as f:
        csv_reader = csv.reader(f, delimiter = ',', quotechar = '|')
        for row in csv_reader:
            fields = [int(tok) for tok in row if (tok != '[' and tok != ']' and tok != '' and tok != ' ')]
            id_to_model[fields[0]] = fields[1:len(fields)]
    print('reading in id_to_model done')

    # Fallback for sentence-starting words that were never seen: use every
    # model.  This used to be the literal list [1..9], which raised
    # IndexError whenever fewer than nine models were loaded.
    all_models = list(range(1, num_ensembles + 1))
    # Progress is reported every tenth of an epoch; at least 1 so short
    # data sets do not divide by zero.
    log_every = max(epoch_size // 10, 1)

    next_is_start_of_sentence = True
    for step, (x, y) in enumerate(reader.ptb_iterator(data, batch_size, num_steps)):
        if next_is_start_of_sentence:
            # The first word of a sentence selects the models that stay in
            # use until the next end-of-sentence token.
            x = x[0,0]
            if x in id_to_model:
                models_included = id_to_model[x]
            else:
                models_included = all_models
        if x == eos_id:
            # Only model 1 predicts the first word of the next sentence.
            models_included = [1]
            next_is_start_of_sentence = True
        else:
            next_is_start_of_sentence = False

        # Unweighted average over the selected models.
        probs = sum(full_probs[m-1][step] for m in models_included)
        probs = probs / float(len(models_included))
        cost = -1 * gpu.log(probs)

        costs += cost
        iters += num_steps
        if step % log_every == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" % (step * 1.0 / epoch_size, gpu.exp(costs / iters), iters * batch_size / (time.time() - start_time)))
    return gpu.exp(costs / iters)
def loss(self, Y, Z, A=None):
    """Cross-entropy loss: ``-sum(Z * log(Y + const.epsilon))``.

    ``A`` is accepted but not used.
    """
    log_y = gnp.log(Y + const.epsilon)
    weighted = Z * log_y
    return -weighted.sum()
def H(X):
    """Element-wise binary entropy ``-(X*log(X) + (1-X)*log(1-X))``.

    ``1e-10`` is added inside each logarithm so that 0 and 1 stay finite.
    """
    from gnumpy import log
    on_term = X*log(X+1e-10)
    off_term = (1-X)*log(1-X+1e-10)
    return -(on_term + off_term)
def softmax_old(x):
    """Row-wise softmax of ``x`` computed through a log-sum-exp.

    The row maximum is subtracted before exponentiating.
    """
    row_max = gp.max(x, axis=1)[:, gp.newaxis]
    shifted_exp = gp.exp(x - row_max)
    log_norm = row_max + gp.log(gp.sum(shifted_exp, axis=1))[:, gp.newaxis]
    return gp.exp(x - log_norm)
def compute_not_weighted_loss_and_grad(self, pred, compute_grad=False):
    """Softmax cross-entropy against ``self.target`` and its gradient.

    Returns ``(loss, softmax(pred) - self.target)``.  The gradient is
    returned regardless of ``compute_grad``, which is not used.
    """
    row_max = pred.max(axis=1)[:, gnp.newaxis]
    unnormalised = gnp.exp(pred - row_max)
    probs = unnormalised / unnormalised.sum(axis=1)[:, gnp.newaxis]
    loss_value = -(self.target * gnp.log(probs + _SMALL_CONSTANT)).sum()
    return loss_value, probs - self.target
def score_softmax(y_target, y_predicted):
    """Return ``sum(y_target * log(y_predicted + eps))``.

    Both arguments must be of the same type.  gnumpy arrays use gnumpy
    with ``eps = 1e-30``; everything else uses numpy with ``eps = 1e-300``.
    """
    assert (type(y_target) == type(y_predicted))
    # Pick the array library and its smoothing constant, then share the formula.
    if type(y_target) is g.garray:
        backend, eps = g, 1e-30
    else:
        backend, eps = np, 1e-300
    return backend.sum(y_target * backend.log(y_predicted + eps))