def initRandom(self):
    gp.seed_rand()
    r = gp.sqrt(6) / gp.sqrt(self.hDim + self.vDim + 1)
    self.W1 = gp.randn(self.vDim, self.hDim) * 2 * r - r
    self.W2 = gp.randn(self.hDim, self.vDim) * 2 * r - r
    self.initUpdate()
    self.initHyperParam(self.config, self.name)
def epoch_update(self, input_batch, output_batch):
    this_batch_size = input_batch.shape[0]
    if not isinstance(input_batch, gnp.garray):
        input_batch = gnp.garray(input_batch)
    if not isinstance(output_batch, gnp.garray):
        output_batch = gnp.garray(output_batch)
    error_residual, output_result, error = self.feed_back(input_batch, output_batch)
    for i, (w_grad, b_grad) in enumerate(self.gradients(self.state, error_residual)):
        self.weight_grads_l2_norm[i] += (w_grad / this_batch_size - self.l2 * self.weights[i]) ** 2
        self.bias_gradis_l2_norm[i] += (b_grad / this_batch_size) ** 2
        w_factor = 1 / gnp.sqrt(self.weight_grads_l2_norm[i])
        b_factor = 1 / gnp.sqrt(self.bias_gradis_l2_norm[i])
        self.weight_grads[i] = self.learning_rate * w_factor * (
            w_grad / this_batch_size - self.l2 * self.weights[i])
        self.bias_grads[i] = (self.learning_rate * b_factor / this_batch_size) * b_grad
    for i in range(len(self.weights)):
        self.weights[i] += self.weight_grads[i]
        self.biases[i] += self.bias_grads[i]
    return error, output_result
def __init__(self, config, name):
    super(AE, self).__init__(config, name)
    # dimension of hidden layer
    self.hDim = int(self.readField(config, name, "hidden_dimension"))
    # dimension of visible layer
    self.vDim = int(self.readField(config, name, "visible_dimension"))
    # bias for hidden layer
    if self.hDim > 0:
        self.b1 = gp.zeros(self.hDim)
    # bias for visible layer
    if self.vDim > 0:
        self.b2 = gp.zeros(self.vDim)
    # init weights: uniform between +-sqrt(6)/sqrt(v+h+1)
    if self.hDim * self.vDim > 0:
        gp.seed_rand()
        r = gp.sqrt(6) / gp.sqrt(self.hDim + self.vDim + 1)
        self.W1 = gp.randn(self.vDim, self.hDim) * 2 * r - r
        self.W2 = gp.randn(self.hDim, self.vDim) * 2 * r - r
    self.initUpdate()
    self.initHyperParam(config, name)
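# Several initializations in this file scale weights by
# r = sqrt(6) / sqrt(fan_in + fan_out (+ 1)), i.e. Glorot/Xavier-style scaling.
# A NumPy-only sketch of that recipe for reference; the layer sizes are
# illustrative and not taken from the original code. Note that gp.randn above
# samples from a normal distribution even though the comment says "uniform";
# the other snippets below use gp.rand/gpu.rand, which is uniform.
import numpy as np

def glorot_uniform(fan_in, fan_out, rng=np.random):
    r = np.sqrt(6.0) / np.sqrt(fan_in + fan_out + 1)
    return rng.rand(fan_in, fan_out) * 2 * r - r   # uniform in [-r, r]

W1 = glorot_uniform(784, 100)
print(W1.min() >= -np.sqrt(6.0 / 885), W1.max() <= np.sqrt(6.0 / 885))  # True True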
def initParams(self):
    # crude way of random initialization (random seed) for parameters
    import time
    self.seed = int(time.time()) % 100000
    # for tt in range(self.seed): gp.rand()
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]
    if self.train:
        self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
    for tt in range(self.seed):
        gp.rand()
    self.stack = [[ws[0] + .01 * gp.randn(ws[0].shape),
                   ws[1] + .01 * gp.randn(ws[1].shape)] for ws in self.stack]
def _updateWeights(self, dEdW):
    if self.gpu:
        if self.gradientClip > 0.0:
            self.dEdWnorm = gpu.sqrt(gpu.sum(dEdW ** 2))
            if self.dEdWnorm > self.gradientClip:
                dEdW *= self.gradientClip / self.dEdWnorm
        if self.learningRate > 0.0:
            self.lastdW = -self.learningRate * dEdW + \
                self.momentum * self.lastdW
            self.W += self.lastdW
        if self.weightRegConst > 0.0:
            a = self.learningRate * self.weightRegConst
            self.W -= a * self.W
        if self.weightClip > 0.0:
            self.Wnorm = gpu.sqrt(gpu.sum(self.W ** 2))
            if self.Wnorm > self.weightClip:
                self.W *= self.weightClip / self.Wnorm
    else:
        if self.gradientClip > 0.0:
            self.dEdWnorm = np.sqrt(np.sum(np.power(dEdW, 2)))
            if self.dEdWnorm > self.gradientClip:
                dEdW *= self.gradientClip / self.dEdWnorm
        if self.learningRate > 0.0:
            self.lastdW = -self.learningRate * dEdW + \
                self.momentum * self.lastdW
            self.W += self.lastdW
        if self.weightRegConst > 0.0:
            a = self.learningRate * self.weightRegConst
            self.W -= a * self.W
        if self.weightClip > 0.0:
            self.Wnorm = np.sqrt(np.sum(np.power(self.W, 2)))
            if self.Wnorm > self.weightClip:
                self.W *= self.weightClip / self.Wnorm
def linear_decoder_run_gpu(data, numInput, numHidden):
    print "Starting Feature Abstraction..."
    num_input = numInput
    num_hidden = numHidden
    num_output = numInput
    lambda_val = 3e-3
    sparsityParam = 0.035
    beta = 5
    inputs = data
    r = gpu.sqrt(6) / gpu.sqrt(num_hidden + num_input + 1)
    weights1 = (gpu.rand(num_hidden, num_input + 1)) * 2 * r - r
    weights2 = (gpu.rand(num_output, num_hidden + 1)) * 2 * r - r
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    #weights1 = reshape(weights1, num_weights1)
    weights1 = weights1.reshape(num_weights1)
    #weights2 = reshape(weights2, num_weights2)
    weights2 = weights2.reshape(num_weights2)
    weights = hstack((weights1.as_numpy_array(), weights2.as_numpy_array()))
    args = (num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta)
    opttheta = optimize.fmin_l_bfgs_b(costfunc_gpu, weights, fprime=grad_costfunc_gpu,
                                      args=args, maxiter=400)
    weights = opttheta[0]
    weights1 = reshape(weights[0:num_weights1], (num_hidden, num_input + 1))
    weights2 = reshape(weights[num_weights1:shape(weights)[0]], (num_output, num_hidden + 1))
    scipy.io.savemat('learntFeaturesGPU.mat', mdict={'learntFeatures': weights1})
    return weights1
def updateParams(self, scale, update, log=False):
    if log:
        for w, u in zip(self.stack, update):
            wrms = gp.sqrt(gp.mean(w[0] ** 2))
            urms = gp.sqrt(gp.mean((scale * u[0]) ** 2))
            print "weight rms=%f -- update rms=%f" % (wrms, urms)
    self.stack = [[ws[0] + scale * wsDelta[0], ws[1] + scale * wsDelta[1]]
                  for ws, wsDelta in zip(self.stack, update)]
def initParams(self):
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    self.hActs = [gp.empty((s, self.mbSize)) for s in sizes]
    if self.train:
        self.deltas = [gp.empty((s, self.mbSize)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
def _dist_cosine(X, Y):
    """
    d_ij = 0.5 * (1 - <x_i,y_j> / |x_i| |y_j|)

    This is a variant of the cosine similarity, modified so that it is a
    distance, i.e. the smaller the closer, and the distance is between 0 and 1.
    """
    X_norm = gnp.sqrt((X * X).sum(axis=1)) + 1e-10
    Y_norm = gnp.sqrt((Y * Y).sum(axis=1)) + 1e-10
    return (1 - X.dot(Y.T) / (X_norm.reshape(-1, 1).dot(Y_norm.reshape(1, -1)))) / 2
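# For reference: a NumPy-only sketch of the same pairwise cosine-distance
# formula, handy for sanity-checking the gnumpy version on small arrays.
# The array sizes below are illustrative, not taken from the original code.
import numpy as np

def dist_cosine_np(X, Y):
    # d_ij = 0.5 * (1 - <x_i, y_j> / (|x_i| |y_j|)), so values lie in [0, 1]
    X_norm = np.sqrt((X * X).sum(axis=1)) + 1e-10
    Y_norm = np.sqrt((Y * Y).sum(axis=1)) + 1e-10
    return (1 - X.dot(Y.T) / np.outer(X_norm, Y_norm)) / 2

X = np.random.randn(4, 8)
D = dist_cosine_np(X, X)
print(D.round(3))  # diagonal is ~0: each row has distance ~0 to itself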
def mlpSoftmax_test():
    numClasses = 10
    inputSize = 28 * 28
    l1Size = 100
    l2Size = 20
    lambda_softmax = 1e-4
    lambda_hidden = 8e-5
    print "Loading data..."
    inputs, labels, testData, testLabels = obtain_data()
    print shape(labels)
    print "Done."
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_L2 = l2Size * (l1Size + 1)
    num_weights_softmax = numClasses * l2Size
    r = gpu.sqrt(6) / gpu.sqrt(inputSize + l1Size + l2Size + 1)
    theta_L1 = (gpu.rand(l1Size, inputSize + 1)) * 2 * r - r
    theta_L2 = (gpu.rand(l2Size, l1Size + 1)) * 2 * r - r
    theta_softmax = (gpu.rand(numClasses, l2Size)) * 2 * r - r
    groundTruth = ground_Truth(labels, numCases)
    #theta_L1 = reshape(theta_L1, num_weights_L1)
    theta_L1 = theta_L1.reshape(num_weights_L1)
    #theta_L2 = reshape(theta_L2, num_weights_L2)
    theta_L2 = theta_L2.reshape(num_weights_L2)
    #theta_softmax = reshape(theta_softmax, num_weights_softmax)
    theta_softmax = theta_softmax.reshape(num_weights_softmax)
    theta = hstack((theta_L1.as_numpy_array(), theta_L2.as_numpy_array(),
                    theta_softmax.as_numpy_array()))
    args = (numClasses, inputSize, l1Size, l2Size, lambda_softmax, lambda_hidden,
            inputs, labels, groundTruth)
    print "Starting Network Training..."
    opttheta = optimize.fmin_l_bfgs_b(mlpSoftmax_costfunc, theta, fprime=mlpSoftmax_grad,
                                      args=args, maxiter=400)
    theta = opttheta[0]
    print "Training finished."
    scipy.io.savemat('mlpSoftmax.mat', mdict={'theta': theta})
    print "Now testing prediction accuracy..."
    theta_L1 = reshape(theta[0:num_weights_L1], (l1Size, inputSize + 1))
    theta_L2 = reshape(theta[num_weights_L1:num_weights_L2 + num_weights_L1], (l2Size, l1Size + 1))
    theta_softmax = reshape(theta[num_weights_L2 + num_weights_L1:shape(theta)[0]],
                            (numClasses, l2Size))
    numCasesPred = shape(testData)[1]
    testData = concatenate((ones((1, numCasesPred)), testData), axis=0)
    hidden_sum_L1 = dot(theta_L1, testData)
    hidden_activation_L1 = 1 / (1 + exp(-hidden_sum_L1))
    hidden_activation_L1 = concatenate((ones((1, numCasesPred)), hidden_activation_L1), axis=0)
    hidden_sum_L2 = dot(theta_L2, hidden_activation_L1)
    hidden_activation_L2 = 1 / (1 + exp(-hidden_sum_L2))
    hidden_sum_softmax = dot(theta_softmax, hidden_activation_L2)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = exp(hidden_sum_softmax)
    predictions = predictions / predictions.sum(axis=0)
    pred = predictions.argmax(axis=0) + 1
    testLabels = squeeze(testLabels)
    accuracy = mean(pred == testLabels) * 100
    print "Accuracy: ", accuracy, "%"
    return pred, testLabels
def apply_grad(self, learn_rate=1e-2,): """Apply the current accumulated gradients, with adagrad.""" # Update the adagrad "momentums" self.moms['W'] = (0.95 * self.moms['W']) + (0.05 * self.grads['W']**2.0) self.moms['b'] = (0.95 * self.moms['b']) + (0.05 * self.grads['b']**2.0) # Apply adagrad-style updates using current grads and moms self.params['W'] -= learn_rate * (self.grads['W'] / \ (gp.sqrt(self.moms['W']) + ADA_EPS)) self.params['b'] -= learn_rate * (self.grads['b'] / \ (gp.sqrt(self.moms['b']) + ADA_EPS)) # Reset gradient accumulators self.reset_grads() return
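# A self-contained NumPy sketch of the same update rule. The 0.95/0.05
# exponential moving average of squared gradients makes this closer to RMSProp
# than to classic AdaGrad (which keeps a running sum). Names (ADA_EPS,
# learn_rate) mirror the snippet above; the values and shapes are illustrative.
import numpy as np

ADA_EPS = 1e-6

def apply_grad_np(param, grad, mom, learn_rate=1e-2):
    mom = 0.95 * mom + 0.05 * grad**2              # moving average of squared grads
    param = param - learn_rate * grad / (np.sqrt(mom) + ADA_EPS)
    return param, mom

W = np.zeros(3)
mom = np.zeros(3)
for _ in range(5):
    grad = np.array([1.0, -2.0, 0.5])              # pretend gradient
    W, mom = apply_grad_np(W, grad, mom)
print(W)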
def initParams(self):
    """
    Initialize parameters using 6/sqrt(fanin+fanout)
    """
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    self.hActs = [gp.empty((s, 1)) for s in sizes]
    if self.train:
        self.deltas = [gp.empty((s, 1)) for s in sizes[1:]]
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
def mlpSoftmax_test():
    numClasses = 10
    inputSize = 28 * 28
    l1Size = 100
    lambda_softmax = 7e-4
    lambda_hidden = 8e-6
    print "Loading data..."
    inputs, labels, testData, testLabels = obtain_data()
    print shape(labels)
    print "Done."
    numCases = shape(inputs)[1]
    num_weights_L1 = l1Size * (inputSize + 1)
    num_weights_softmax = numClasses * l1Size
    r = gpu.sqrt(6) / gpu.sqrt(inputSize + l1Size + 1)
    theta_L1 = (gpu.rand(l1Size, inputSize + 1)) * 2 * r - r
    theta_softmax = (gpu.rand(numClasses, l1Size)) * 2 * r - r
    groundTruth = ground_Truth(labels, numCases)
    theta_L1 = theta_L1.reshape(num_weights_L1)
    theta_softmax = theta_softmax.reshape(num_weights_softmax)
    theta = hstack((theta_L1.as_numpy_array(), theta_softmax.as_numpy_array()))
    args = (numClasses, inputSize, l1Size, lambda_softmax, lambda_hidden, inputs, groundTruth)
    print "Starting Network Training..."
    opttheta = optimize.fmin_l_bfgs_b(mlpSoftmax1Layer_costfunc, theta,
                                      fprime=mlpSoftmax1Layer_grad, args=args, maxiter=300)
    theta = opttheta[0]
    print "Training finished."
    scipy.io.savemat('mlpSoftmax.mat', mdict={'theta': theta})
    print "Now testing prediction accuracy..."
    theta_L1 = reshape(theta[0:num_weights_L1], (l1Size, inputSize + 1))
    theta_softmax = reshape(theta[num_weights_L1:shape(theta)[0]], (numClasses, l1Size))
    numCasesPred = shape(testData)[1]
    testData = concatenate((ones((1, numCasesPred)), testData), axis=0)
    hidden_sum_L1 = dot(theta_L1, testData)
    #hidden_activation_L1 = log(1+exp(hidden_sum_L1))
    relu_mask_hidden1 = ones(shape(hidden_sum_L1)) * (hidden_sum_L1 > 0)
    hidden_activation_L1 = hidden_sum_L1 * relu_mask_hidden1
    hidden_sum_softmax = dot(theta_softmax, hidden_activation_L1)
    hidden_sum_softmax = hidden_sum_softmax - hidden_sum_softmax.max(axis=0)
    predictions = exp(hidden_sum_softmax)
    predictions = predictions / predictions.sum(axis=0)
    pred = predictions.argmax(axis=0) + 1
    testLabels = squeeze(testLabels)
    accuracy = mean(pred == testLabels) * 100
    print "Accuracy: ", accuracy, "%"
    return accuracy
def multilayer_feature_learning(data, inputSize, l1Size, l2Size, sparsityParam, lambda_val, beta):
    print "Now starting feature abstraction..."
    num_input = inputSize
    num_hidden_L1 = l1Size
    num_hidden_L2 = l2Size
    num_output_L1 = inputSize
    num_output_L2 = num_hidden_L1
    sparsityParam = sparsityParam
    lambda_val = lambda_val
    beta = beta
    inputs = gpu.garray(data)
    r = gpu.sqrt(6) / gpu.sqrt(num_hidden_L1 + num_input + 1)
    weights1_L1 = (gpu.rand(num_hidden_L1, num_input + 1)) * 2 * r - r
    weights2_L1 = (gpu.rand(num_output_L1, num_hidden_L1 + 1)) * 2 * r - r
    num_weights1_L1 = (num_input + 1) * num_hidden_L1
    num_weights2_L1 = (num_hidden_L1 + 1) * num_output_L1
    weights1_L1 = weights1_L1.reshape(num_weights1_L1)
    weights2_L1 = weights2_L1.reshape(num_weights2_L1)
    weights_L1 = hstack((weights1_L1.as_numpy_array(), weights2_L1.as_numpy_array()))
    print "Level 1 Abstraction Starting...."
    weights_L1 = linear_decoder_run_ReLU(data, weights_L1, num_input, num_hidden_L1)
    weights1_L1 = weights_L1[0:num_weights1_L1].reshape((num_hidden_L1, num_input + 1))
    weights2_L1 = weights_L1[num_weights1_L1:shape(weights_L1)[0]].reshape((num_output_L1, num_hidden_L1 + 1))
    scipy.io.savemat('HiggsBosonLevel1.mat', mdict={'learntFeaturesL1_1': weights1_L1,
                                                    'learntFeaturesL1_2': weights2_L1})
    L1_activation = feedforward(weights1_L1, inputs)
    del weights_L1
    del weights1_L1
    del weights2_L1
    gpu.free_reuse_cache()
    v = gpu.sqrt(6) / gpu.sqrt(num_hidden_L2 + num_hidden_L1 + 1)
    weights1_L2 = (gpu.rand(num_hidden_L2, num_hidden_L1 + 1)) * 2 * v - v
    weights2_L2 = (gpu.rand(num_output_L2, num_hidden_L2 + 1)) * 2 * v - v
    num_weights1_L2 = (num_hidden_L1 + 1) * num_hidden_L2
    num_weights2_L2 = (num_hidden_L2 + 1) * num_output_L2
    weights1_L2 = weights1_L2.reshape(num_weights1_L2)
    weights2_L2 = weights2_L2.reshape(num_weights2_L2)
    weights_L2 = hstack((weights1_L2.as_numpy_array(), weights2_L2.as_numpy_array()))
    print "Level 2 Abstraction Starting...."
    weights_L2 = linear_decoder_run_ReLU(L1_activation, weights_L2, num_hidden_L1, num_hidden_L2)
    weights1_L2 = weights_L2[0:num_weights1_L2].reshape((num_hidden_L2, num_hidden_L1 + 1))
    weights2_L2 = weights_L2[num_weights1_L2:shape(weights_L2)[0]].reshape((num_output_L2, num_hidden_L2 + 1))
    scipy.io.savemat('HiggsBosonLevel2.mat', mdict={'learntFeaturesL2_1': weights1_L2,
                                                    'learntFeaturesL2_2': weights2_L2})
    L2_activation = feedforward(weights1_L2, L1_activation)
    del weights_L2
    del weights1_L2
    del weights2_L2
    gpu.free_reuse_cache()
    gpu.free_reuse_cache()
    print "Abstraction completed."
    return L2_activation
def checkGradientGPU():
    num_input = 8 * 8 * 3
    num_hidden = 10
    num_output = num_input
    lambda_val = 0.003
    sparsityParam = 0.035
    beta = 5
    data = scipy.io.loadmat('stlSampledPatches.mat')
    patches = data['patches']
    inputs = patches[:, 0:10]
    r = gpu.sqrt(6) / gpu.sqrt(num_hidden + num_input + 1)
    weights1 = (gpu.rand(num_hidden, num_input + 1)) * 2 * r - r
    weights2 = (gpu.rand(num_output, num_hidden + 1)) * 2 * r - r
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    weights1 = weights1.reshape(num_weights1)
    weights2 = weights2.reshape(num_weights2)
    weights = hstack((weights1.as_numpy_array(), weights2.as_numpy_array()))
    args = (num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta)
    numgrad = zeros(size(weights))
    numgrad2 = zeros(size(weights))
    perturb = zeros(size(weights))
    e = 1e-4
    for p in range(size(weights)):
        perturb[p] = e
        minus_weights = weights - perturb
        plus_weights = weights + perturb
        loss1 = costfunc_gpuTRY(minus_weights, *args)
        lossc1 = costfunc(minus_weights, *args)
        loss2 = costfunc_gpu(plus_weights, *args)
        lossc2 = costfunc(plus_weights, *args)
        numgrad[p] = (loss2 - loss1) / (2 * e)
        numgrad2[p] = (lossc2 - lossc1) / (2 * e)
        perturb[p] = 0
    grad = grad_costfunc_gpu(weights, *args)
    grad2 = grad_costfunc(weights, *args)
    diff = linalg.norm(numgrad - grad) / linalg.norm(numgrad + grad)
    diff2 = linalg.norm(numgrad2 - grad2) / linalg.norm(numgrad2 + grad2)
    diff3 = linalg.norm(numgrad - grad2) / linalg.norm(numgrad + grad2)
    diff4 = linalg.norm(numgrad2 - grad) / linalg.norm(numgrad2 + grad)
    diffnum = linalg.norm(numgrad2 - numgrad) / linalg.norm(numgrad2 + numgrad)
    diffgrad = linalg.norm(grad2 - grad) / linalg.norm(grad2 + grad)
    print "pure GPU difference:", diff
    print "pure CPU difference:", diff2
    print "GPU cost, CPU grad:", diff3
    print "CPU cost, GPU grad:", diff4
    print "CPU cost and GPU cost difference:", diffnum
    print "CPU grad and GPU grad difference:", diffgrad
    return "OK"
def checkGradientGPU():
    num_input = 8 * 8 * 3
    num_hidden = 10
    num_output = num_input
    lambda_val = 0.003
    sparsityParam = 0.035
    beta = 5
    data = scipy.io.loadmat('stlSampledPatches.mat')
    patches = data['patches']
    inputs = patches[:, 0:10]
    r = gpu.sqrt(6) / gpu.sqrt(num_hidden + num_input + 1)
    weights1 = (gpu.rand(num_hidden, num_input + 1)) * 2 * r - r
    weights2 = (gpu.rand(num_output, num_hidden + 1)) * 2 * r - r
    num_weights1 = (num_input + 1) * num_hidden
    num_weights2 = (num_hidden + 1) * num_output
    weights1 = weights1.reshape(num_weights1)
    weights2 = weights2.reshape(num_weights2)
    weights = hstack((weights1.as_numpy_array(), weights2.as_numpy_array()))
    args = (num_input, num_hidden, num_output, inputs, lambda_val, sparsityParam, beta)
    numgrad = zeros(size(weights))
    numgrad2 = zeros(size(weights))
    perturb = zeros(size(weights))
    e = 1e-4
    for p in range(size(weights)):
        perturb[p] = e
        minus_weights = weights - perturb
        plus_weights = weights + perturb
        loss1 = costfunc_gpuTRY(minus_weights, *args)
        lossc1 = costfunc(minus_weights, *args)
        loss2 = costfunc_gpuTRY(plus_weights, *args)
        lossc2 = costfunc(plus_weights, *args)
        numgrad[p] = (loss2 - loss1) / (2 * e)
        numgrad2[p] = (lossc2 - lossc1) / (2 * e)
        perturb[p] = 0
    grad = grad_costfunc_gpu(weights, *args)
    grad2 = grad_costfunc(weights, *args)
    diff = linalg.norm(numgrad - grad) / linalg.norm(numgrad + grad)
    diff2 = linalg.norm(numgrad2 - grad2) / linalg.norm(numgrad2 + grad2)
    diff3 = linalg.norm(numgrad - grad2) / linalg.norm(numgrad + grad2)
    diff4 = linalg.norm(numgrad2 - grad) / linalg.norm(numgrad2 + grad)
    diffnum = linalg.norm(numgrad2 - numgrad) / linalg.norm(numgrad2 + numgrad)
    diffgrad = linalg.norm(grad2 - grad) / linalg.norm(grad2 + grad)
    print "pure GPU difference:", diff
    print "pure CPU difference:", diff2
    print "GPU cost, CPU grad:", diff3
    print "CPU cost, GPU grad:", diff4
    print "CPU cost and GPU cost difference:", diffnum
    print "CPU grad and GPU grad difference:", diffgrad
    return "OK"
def norm_trans(X, mode='ff'):
    """Compute feedforward and backprop for unit-normalization."""
    EPS = 0.00000001
    if (mode == 'ff'):
        N = gp.sqrt(gp.sum(X**2.0, axis=1) + EPS)
        N = N[:, gp.newaxis]
        F = X / N
    if (mode == 'bp'):
        N = gp.sqrt(gp.sum(X['X']**2.0, axis=1) + EPS)
        N = N[:, gp.newaxis]
        V = X['dLdA'] * X['X']
        V = gp.sum(V, axis=1)
        V = V[:, gp.newaxis]
        F = (X['dLdA'] / N) - (X['A'] * (V / (N**2.0)))
    return F
def rect_sqrt(x, computeGrad=False):
    if (not computeGrad):
        f = gp.sqrt(gp.abs(x) * (x > 0))
        return f
    g = 1 / (2 * x + (x <= 0)) * (x > 0)
    return g
def constrain_weights(self):
    for i, rms_limit in enumerate(self.rms_limits):
        if not rms_limit:
            continue
        W = self.weights[i]
        rms_scale = rms_limit / gnp.sqrt(gnp.mean(W * W, axis=0))
        limit_rms = W * (1 + (rms_scale < 1) * (rms_scale - 1))
        self.weights[i] = limit_rms
def clip_params(self, max_norm=10.0):
    """Bound L2 (row-wise) norm of W by max_norm."""
    M = self.params['W']
    m_scales = max_norm / gp.sqrt(gp.sum(M**2.0, axis=1) + 1e-5)
    mask = (m_scales < 1.0)  # with gnumpy, this already comes as float32
    m_scales = (m_scales * mask) + (1.0 - mask)
    self.params['W'] = M * m_scales[:, gp.newaxis]
    return
def cosSimilar(a, b):
    global gpu
    res = 0
    if gpu == 1:
        # print "gpu"
        a = gnumpy.garray(a)
        b = gnumpy.garray(b)
        len_a = gnumpy.sqrt(gnumpy.dot(a, a))
        len_b = gnumpy.sqrt(gnumpy.dot(b, b))
        res = gnumpy.dot(a, b) / (len_a * len_b)
    else:
        a = numpy.array(a)
        b = numpy.array(b)
        len_a = numpy.sqrt(numpy.dot(a, a))
        len_b = numpy.sqrt(numpy.dot(b, b))
        res = numpy.dot(a, b) / (len_a * len_b)
    return 0.5 + 0.5 * res
def limitColumnRMS(W, rmsLim):
    """
    All columns of W with rms entry above the limit are scaled to equal the limit.
    The limit can either be a row vector or a scalar.
    Apply to 2-d array W.
    """
    columnRMS = lambda W: gnp.sqrt(gnp.mean(W * W, axis=0))
    rmsScale = rmsLim / columnRMS(W)
    return W * (1 + (rmsScale < 1) * (rmsScale - 1))
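# NumPy sketch of the same column-RMS clipping. The expression
# 1 + (rmsScale < 1) * (rmsScale - 1) used above is an element-wise
# minimum(rmsScale, 1) written with a boolean mask, which is how the gnumpy
# snippets avoid calling minimum on the GPU. Shapes and limits are illustrative.
import numpy as np

def limit_column_rms_np(W, rms_lim):
    col_rms = np.sqrt(np.mean(W * W, axis=0))
    rms_scale = rms_lim / col_rms
    return W * np.minimum(rms_scale, 1.0)   # only over-limit columns are scaled down

W = np.random.randn(100, 5) * np.array([0.1, 0.5, 1.0, 2.0, 4.0])
W_clipped = limit_column_rms_np(W, 1.0)
print(np.sqrt(np.mean(W_clipped**2, axis=0)))  # no column RMS above 1.0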
def fit(self, X):
    X = gnumpy.garray(X)
    lens = (X ** 2).sum(-1)  # precompute (squared) length of each vector
    if self._metric == 'angular':
        X /= gnumpy.sqrt(lens)[..., gnumpy.newaxis]  # normalize index vectors to unit length
        self.index = X  # np.ascontiguousarray(X, dtype=self._precision)
    elif self._metric == 'euclidean':
        self.index = X  # np.ascontiguousarray(X, dtype=self._precision)
        self.lengths = lens  # np.ascontiguousarray(lens, dtype=self._precision)
def _dist_euclidean(X, Y):
    """
    d_ij = ||x_i - y_j||
    """
    X = gnp.as_garray(X)
    Y = gnp.as_garray(Y)
    X_diag = (X * X).sum(axis=1)
    Y_diag = (Y * Y).sum(axis=1)
    return gnp.sqrt(-2 * X.dot(Y.T) + X_diag.reshape(-1, 1) + Y_diag.reshape(1, -1) + 1e-3)
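# NumPy check of the expansion used above:
# ||x_i - y_j||^2 = ||x_i||^2 - 2 <x_i, y_j> + ||y_j||^2,
# so the full pairwise distance matrix needs only one matrix product. The small
# epsilon keeps sqrt away from tiny negative values caused by floating-point
# cancellation. Array sizes are illustrative.
import numpy as np

def dist_euclidean_np(X, Y, eps=1e-3):
    X_diag = (X * X).sum(axis=1)
    Y_diag = (Y * Y).sum(axis=1)
    return np.sqrt(-2 * X.dot(Y.T) + X_diag[:, None] + Y_diag[None, :] + eps)

X = np.random.randn(3, 5)
Y = np.random.randn(4, 5)
D = dist_euclidean_np(X, Y)
brute = np.sqrt(((X[:, None, :] - Y[None, :, :])**2).sum(-1) + 1e-3)
print(np.allclose(D, brute))  # True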
def nrelu(data, wm, bias, sampling=False):
    """A noisy rectified linear unit.
    """
    suff = gpu.dot(data, wm) + bias
    if sampling:
        sample = suff + (gpu.sqrt(suff.logistic()) * gpu.randn(suff.shape))
        #sample = suff + gpu.randn(suff.shape)
        sample *= (sample > 0)
    else:
        sample = None
    suff *= (suff > 0)
    return suff, sample
def forward_prop(self, X, add_noise=False, compute_loss=False):
    """
    Compute the forward propagation step that maps the input data matrix X
    into the output.
    """
    self.mu = X.mean(axis=0)
    self.sigma = gnp.sqrt(((X - self.mu)**2).mean(axis=0))
    self.X_hat = (X - self.mu) / (self.sigma + 1e-10)
    self.Y = self.X_hat * self.params.gamma + self.params.beta
    return self.Y
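# This forward pass is the batch-normalization transform
# y = gamma * (x - mu) / (sigma + eps) + beta, computed per feature over the
# batch axis (without the running-statistics bookkeeping a full implementation
# would carry). A NumPy sketch with made-up gamma/beta, for illustration only:
import numpy as np

def batchnorm_forward_np(X, gamma, beta, eps=1e-10):
    mu = X.mean(axis=0)
    sigma = np.sqrt(((X - mu)**2).mean(axis=0))   # per-feature std over the batch
    X_hat = (X - mu) / (sigma + eps)
    return X_hat * gamma + beta

X = np.random.randn(32, 4) * 5.0 + 2.0
Y = batchnorm_forward_np(X, gamma=np.ones(4), beta=np.zeros(4))
print(Y.mean(axis=0).round(6), Y.std(axis=0).round(6))  # ~0 mean, ~1 std per feature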
def calculate_dt(v, delta_v, N_bodies, alpha):
    a_max = 0.
    for i in range(N_bodies):
        delta_v = gpu.garray(delta_v)
        a = gpu.sum(delta_v[i, :]**2)
        if a > a_max:
            a_max = a
            a_max_index = i
    v = gpu.garray(v)
    v_mag = gpu.sqrt(gpu.sum(v[a_max_index, :]**2))
    return alpha * v_mag / a_max
def rehu_trans(X, mode='ff'):
    """Compute feedforward and backprop for ReHu nonlinearity."""
    if (mode == 'ff'):
        M_quad = (X > 0.0)
        M_line = (X > 0.5)
        M_quad = M_quad - M_line
        F = (M_line * (X - 0.25)) + (M_quad * X**2.0)
    if (mode == 'bp'):
        M_quad = (X['A'] < 0.25)
        M_line = 1.0 - M_quad
        F = (2.0 * M_quad * gp.sqrt(X['A'])) + M_line
        F = F * X['dLdA']
    return F
def initParams(self):
    """
    Initialize parameters using 6/sqrt(fanin+fanout)
    """
    sizes = [self.inputDim] + self.layerSizes + [self.outputDim]
    scales = [gp.sqrt(6) / gp.sqrt(n + m) for n, m in zip(sizes[:-1], sizes[1:])]
    self.stack = [[gp.rand(m, n) * 2 * s - s, gp.zeros((m, 1))]
                  for n, m, s in zip(sizes[:-1], sizes[1:], scales)]
    if self.temporalLayer > 0:
        rs = sizes[self.temporalLayer]
        s = gp.sqrt(6) / rs
        # temporal layer stored at end of stack
        self.stack.append([gp.rand(rs, rs) * 2 * s - s, gp.zeros((2, 1))])
    if self.train:
        # TODO why store all deltas?
        # self.deltas = [gp.empty((s,self.mbSize)) for s in sizes[1:]]
        # NOTE if a temporal layer is used it's already added to stack so will have a grad
        self.grad = [[gp.empty(w.shape), gp.empty(b.shape)] for w, b in self.stack]
def euclidSimilar(a, b):
    global gpu
    res = 0
    if gpu == 1:
        # print "gpu"
        a = gnumpy.garray(a)
        b = gnumpy.garray(b)
        c = a - b
        res = gnumpy.sqrt(gnumpy.dot(c, c))
    else:
        a = numpy.array(a)
        b = numpy.array(b)
        c = a - b
        res = numpy.sqrt(numpy.dot(c, c))
    return 1.0 / (1 + res)
def rmssd(z, targets, predict=False, error=False, addon=0):
    """
    Root mean sum of squares.
    """
    if predict:
        return z
    n, m = z.shape
    err = z - targets
    per_sample = gpu.sqrt(gpu.sum(err**2, axis=1) + 1e-8)
    if error:
        # rec. error + first deriv
        return gpu.sum(per_sample) / n + addon, err / (n * per_sample[:, gpu.newaxis])
    else:
        # only return reconstruction error
        return gpu.sum(per_sample) / n + addon
def bound_weights(self, Wm, wt_bnd):
    """Bound L2 (row-wise) norm of the weights in Wm by wt_bnd.

    This returns a garray if passed a garray, and performs all ops on the GPU
    if that is the case. Otherwise, it returns a numpy array, or if something
    besides an ndarray/garray was passed, it crashes (probably).
    """
    EPS = 0.00000001
    # Compute L2 norm of weights inbound to each node in this layer
    w_norms = gp.sqrt(gp.sum(Wm**2, axis=1) + EPS)
    # Compute scales based on norms and the upperbound set by wt_bnd
    w_scales = wt_bnd / w_norms
    mask = (w_scales < 1.0)
    w_scales = (w_scales * mask) + (1.0 - mask)
    w_scales = w_scales[:, gp.newaxis]
    # Rescale weights to meet the bound set by wt_bnd
    Wm = Wm * w_scales
    return Wm
def computeStat(self):
    print 'Computing stats (mean and std)...'
    means = gp.zeros((self.numbatches, self.dim))
    variances = gp.zeros((self.numbatches, self.dim))
    i = 0
    while True:
        batch = self.cache.getOneBatch()
        if batch == None:
            break
        means[i] = batch.mean(axis=0)
        variances[i] = gp.std(batch, axis=0)**2
        i += 1
    assert (i == self.numbatches)
    mean = means.mean(axis=0)
    std = gp.sqrt(variances.mean(axis=0) + gp.std(means, axis=0)**2)
    mean_std = std.mean()
    std += (std == 0.0) * mean_std
    self.reset()
    print 'Finish stats computing'
    return mean, std + 1e-10
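# The combined std above follows the law of total variance (exact when all
# batches have the same size): total_var = mean(within-batch variances)
#                                          + variance(batch means).
# A NumPy check on synthetic data; batch count and dimensions are illustrative.
import numpy as np

rng = np.random.RandomState(0)
batches = [rng.randn(100, 3) * 2.0 + 5.0 for _ in range(8)]

means = np.stack([b.mean(axis=0) for b in batches])
variances = np.stack([b.std(axis=0)**2 for b in batches])
std_combined = np.sqrt(variances.mean(axis=0) + means.std(axis=0)**2)

all_data = np.concatenate(batches, axis=0)
print(np.allclose(std_combined, all_data.std(axis=0)))  # True for equal-size batches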
def unigram_partition(data_path, num_ensembles, model_name, method='none', train=True,
                      random_training_order=False, reverse_order=False):
    algo_name = 'unigram' + '_' + 'partition'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _, word_to_id = raw_data
    if reverse_order:
        train_data.reverse()
        valid_data.reverse()
        test_data.reverse()
        algo_name = 'reverse ' + algo_name
    eos_id = word_to_id['<eos>']
    case_weight_length = len(train_data) - 1
    train_case_weights = np.repeat(1.0 / case_weight_length, case_weight_length).tolist()
    train_sentence_list = reader.get_sentence_list(train_data, eos_id, reverse_order)
    if random_training_order:
        #train_sentence_list = reader.get_sentence_list(train_data, eos_id)
        perm = range(len(train_sentence_list))
        np.random.shuffle(perm)
        train_data = []
        for idx in perm:
            train_data += train_sentence_list[idx]
        train_sentence_list = reader.get_sentence_list(train_data, eos_id, reverse_order)
    num_sent = len(train_sentence_list)
    train_sentence_weights = np.repeat(1.0 / num_sent, num_sent).tolist()
    new_train_data = train_data
    FLAGS.model = model_name
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    alpha_t_list = []
    full_test_set_logits = []
    for i in range(len(test_data) - 1):
        full_test_set_logits.append(np.zeros((1, eval_config.vocab_size)))
    sentence_starters = []
    id_to_sentence_num_dict = {}
    for i in range(num_sent):
        if reverse_order:
            desired_id = train_sentence_list[i][-1]
        else:
            desired_id = train_sentence_list[i][0]
        if desired_id in id_to_sentence_num_dict:
            id_to_sentence_num_dict[desired_id].append(i)
        else:
            id_to_sentence_num_dict[desired_id] = [i]
            sentence_starters.append(desired_id)
    id_to_model = {}
    for idx in sentence_starters:
        id_to_model[idx] = [1]
    id_to_weight = {}
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    for iii in range(num_ensembles):
        with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
                m = PTBModel(is_training=True, config=config)
                sess.close()
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config)
                mtest = PTBModel(is_training=False, config=eval_config)
                m2 = PTBModel(is_training=False, config=eval_config)
            tf.initialize_all_variables().run()
            saver = tf.train.Saver()
            if iii > 0:
                np.savetxt(checkpoint_dir + 'test_set_probs_no_alpha.out',
                           np.squeeze(test_set_probs_no_alpha), delimiter=',')
            if random_training_order:
                new_folder = 'random training order ' + algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
            else:
                new_folder = algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
            checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            if iii > 0:
                for k, v in id_to_sentence_num_dict.items():
                    total_wt = 0
                    num_sentences = len(v)
                    for sent_idx in v:
                        total_wt += train_sentence_weights[sent_idx]
                    id_to_weight[k] = total_wt / num_sentences
                sorted_id_to_weight = sorted(id_to_weight.items(), key=operator.itemgetter(1), reverse=True)
                np.savetxt(checkpoint_dir + 'sorted_id_to_weight', sorted_id_to_weight, delimiter=',')
                new_train_data = []
                i = 0
                sent_included = 0
                while sent_included < np.floor(num_sent / 2):
                    start_key = sorted_id_to_weight[i][0]
                    sentence_additions = id_to_sentence_num_dict[start_key]
                    for idx in sentence_additions:
                        new_train_data += train_sentence_list[idx]
                        sent_included += 1
                    id_to_model[start_key].append(iii + 1)
                    i += 1
            if train:
                np.savetxt(checkpoint_dir + 'train_case_weights.out', train_case_weights, delimiter=',')
                np.savetxt(checkpoint_dir + 'train_sentence_weights.out', train_sentence_weights, delimiter=',')
                for i in range(config.max_max_epoch):
                    lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                    train_perplexity = run_epoch(session, m, new_train_data, m.train_op, verbose=False)
                    print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
                    valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
                    print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
                    if (i + 1) % 5 == 0 or (i + 1) == config.max_max_epoch:
                        saver.save(session, checkpoint_dir + 'model.ckpt', global_step=i + 1)
            else:
                ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(session, ckpt.model_checkpoint_path)
            if train:
                case_scores = output_training_set_error_for_boosting(session, m2, train_data, tf.no_op())
                score = sum(case_scores)
                norm = len(case_scores) * -1 * gpu.log(float(1.0) / eval_config.vocab_size)
                epsilon_t = (1 - (norm - score) / norm) / 2.0
                alpha_t = 0.5 * gpu.log((1 - epsilon_t) / epsilon_t)
                alpha_t_list.append(alpha_t)
                if iii == 0:
                    shutil.rmtree('simple-examples/ckpt/' + 'random training order ' + algo_name + '_' + model_name + '/' + 'alpha_t.out',
                                  ignore_errors=True)
                with open('simple-examples/ckpt/' + 'random training order ' + algo_name + '_' + model_name + '/' + 'alpha_t.out', 'ab') as f:
                    f.write(str(alpha_t))
                    f.write(',')
                train_case_weights = gpu.sqrt((1 - epsilon_t) / epsilon_t) * np.multiply(train_case_weights, np.asarray(case_scores))
                train_case_weights = np.ravel(normalize(np.asarray(train_case_weights).reshape(1, -1), norm='l1'))
                if method == 'stddev':
                    new_train_case_weights = reject_outliers(train_case_weights)
                elif method == 'sqrt':
                    new_train_case_weights = sqrt_norm(train_case_weights)
                else:
                    new_train_case_weights = train_case_weights
                start_idx = 0
                for i in range(len(train_sentence_list)):
                    this_sentence_length = len(train_sentence_list[i])
                    sentence_tokens = [v for v in new_train_case_weights[start_idx:(this_sentence_length + start_idx)]
                                       if np.isfinite(v)]
                    if len(sentence_tokens) == 0:
                        # note: the original assigned to an undefined name (this_sentence_weights)
                        train_sentence_weights[i] = 0
                    else:
                        train_sentence_weights[i] = np.mean([v for v in new_train_case_weights[start_idx:(this_sentence_length + start_idx)]
                                                             if np.isfinite(v)])
                    start_idx += this_sentence_length
                train_sentence_weights = np.ravel(normalize(np.asarray(train_sentence_weights).reshape(1, -1), norm='l1'))
                for k, v in id_to_sentence_num_dict.items():
                    total_wt = 0
                    num_sentences = len(v)
                    for sent_idx in v:
                        total_wt += train_sentence_weights[sent_idx]
                    id_to_weight[k] = total_wt / num_sentences
            test_set_probs = output_test_set_probs(session, mtest, test_data, tf.no_op(), partition=True)
            test_set_probs_no_alpha = test_set_probs
            np.savetxt(checkpoint_dir + 'test_set_probs_no_alpha.out', np.squeeze(test_set_probs_no_alpha), delimiter=',')
            train_set_probs = output_test_set_probs(session, m2, train_data, tf.no_op(), partition=True)
            train_set_probs_no_alpha = train_set_probs
            np.savetxt(checkpoint_dir + 'train_set_probs_no_alpha.out', np.squeeze(train_set_probs_no_alpha), delimiter=',')
            test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
            print("Test Perplexity: %.3f" % test_perplexity)
    if random_training_order:
        new_folder = 'random training order ' + algo_name + '_' + model_name
    else:
        new_folder = algo_name + '_' + model_name
    checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
    with open(checkpoint_dir + 'id_to_model.out', 'w') as f:
        for k, v in id_to_model.items():
            f.write(str(k) + ',' + ','.join(str(id_to_model[k])) + '\n')
    with open(checkpoint_dir + 'id_to_sent_num.out', 'w') as ff:
        for k, v in id_to_sentence_num_dict.items():
            ff.write(str(k) + ',' + ','.join(str(id_to_sentence_num_dict[k])) + '\n')
    print('Test PPL: ' + str(evaluate_unigram_partition(
        data=test_data, batch_size=1, num_steps=1, num_ensembles=num_ensembles, eos_id=eos_id,
        fp='simple-examples/ckpt/random training order unigram_partition_small/',
        probs_fn='test_set_probs_no_alpha.out')))
    print('Train PPL: ' + str(evaluate_unigram_partition(
        data=train_data, batch_size=1, num_steps=1, num_ensembles=num_ensembles, eos_id=eos_id,
        fp='simple-examples/ckpt/random training order unigram_partition_small/',
        probs_fn='train_set_probs_no_alpha.out')))
def columnRMS(W):
    return gnp.sqrt(gnp.mean(W * W, axis=0))
def sqrt(x):
    check_type(x)
    if is_np(x):
        return np.sqrt(x)
    else:
        return gp.sqrt(x)
def task_loss(self, Y, Z, A=None, task_loss_fn=None):
    # root mean square error
    if task_loss_fn is None:
        return gnp.sqrt(((Y - Z)**2).mean())
    else:
        return task_loss_fn(Y, Z, A)
def compute_kernel_matrix(self, x):
    x = x if isinstance(x, gnp.garray) else gnp.garray(x)
    x_norm = gnp.sqrt((x**2).sum(axis=1))
    x_norm = x_norm[:, gnp.newaxis] + x_norm[gnp.newaxis, :] + 1e-20
    return x.dot(x.T) / x_norm
def sampleStates(self, acts):
    if self.krizNoise:
        return self.activate(acts + gnp.randn(*acts.shape))
    tiny = 1e-30
    stddev = gnp.sqrt(acts.sigmoid() + tiny)
    return self.activate(acts + stddev * gnp.randn(*acts.shape))
def ABISS(data_path, num_ensembles, model_name, method='stddev', train=True,
          random_training_order=False):
    algo_name = method + ' ' + 'ABISS'
    raw_data = reader.ptb_raw_data(data_path)
    train_data, valid_data, test_data, _, word_to_id = raw_data
    eos_id = word_to_id['<eos>']
    case_weight_length = len(train_data) - 1
    train_case_weights = np.repeat(1.0 / case_weight_length, case_weight_length).tolist()
    train_sentence_list = reader.get_sentence_list(train_data, eos_id)
    if random_training_order:
        perm = range(len(train_sentence_list))
        np.random.shuffle(perm)
        train_data = []
        for idx in perm:
            train_data += train_sentence_list[idx]
        train_sentence_list = reader.get_sentence_list(train_data, eos_id)
    num_sent = len(train_sentence_list)
    train_sentence_weights = np.repeat(1.0 / num_sent, num_sent).tolist()
    new_train_data = reader.weighted_sentence_selection(train_sentence_list,
                                                        train_sentence_weights,
                                                        random_training_order)
    FLAGS.model = model_name
    config = get_config()
    eval_config = get_config()
    eval_config.batch_size = 1
    eval_config.num_steps = 1
    alpha_t_list = []
    full_test_set_logits = []
    for i in range(len(test_data) - 1):
        full_test_set_logits.append(np.zeros((1, eval_config.vocab_size)))
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)
    for iii in range(num_ensembles):
        with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as session:
            initializer = tf.random_uniform_initializer(-config.init_scale, config.init_scale)
            with tf.variable_scope("model", reuse=None, initializer=initializer):
                sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
                m = PTBModel(is_training=True, config=config)
                sess.close()
            with tf.variable_scope("model", reuse=True, initializer=initializer):
                mvalid = PTBModel(is_training=False, config=config)
                mtest = PTBModel(is_training=False, config=eval_config)
                #config.batch_size = 1
                m2 = PTBModel(is_training=False, config=eval_config)
            tf.initialize_all_variables().run()
            saver = tf.train.Saver()
            if random_training_order:
                new_folder = 'random training order' + algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
            else:
                new_folder = algo_name + '_' + model_name + '/' + 'ensemble' + str(iii + 1)
            checkpoint_dir = 'simple-examples/ckpt/' + new_folder + '/'
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            if train:
                np.savetxt(checkpoint_dir + 'train_case_weights.out', train_case_weights, delimiter=',')
                np.savetxt(checkpoint_dir + 'train_sentence_weights.out', train_sentence_weights, delimiter=',')
                for i in range(config.max_max_epoch):
                    lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
                    m.assign_lr(session, config.learning_rate * lr_decay)
                    print("Epoch: %d Learning rate: %.3f" % (i + 1, session.run(m.lr)))
                    train_perplexity = run_epoch(session, m, new_train_data, m.train_op, verbose=False)
                    print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))
                    valid_perplexity = run_epoch(session, mvalid, valid_data, tf.no_op())
                    print("Epoch: %d Valid Perplexity: %.3f" % (i + 1, valid_perplexity))
                    if (i + 1) % 5 == 0 or (i + 1) == config.max_max_epoch:
                        saver.save(session, checkpoint_dir + 'model.ckpt', global_step=i + 1)
            else:
                ckpt = tf.train.get_checkpoint_state(checkpoint_dir)
                if ckpt and ckpt.model_checkpoint_path:
                    saver.restore(session, ckpt.model_checkpoint_path)
            case_scores = output_training_set_error_for_boosting(session, m2, train_data, tf.no_op())
            score = sum(case_scores)
            norm = len(case_scores) * -1 * gpu.log(float(1.0) / eval_config.vocab_size)
            epsilon_t = (1 - (norm - score) / norm) / 2.0
            alpha_t = 0.5 * gpu.log((1 - epsilon_t) / epsilon_t)
            alpha_t_list.append(alpha_t)
            if iii == 0:
                shutil.rmtree('simple-examples/ckpt/' + algo_name + '_' + model_name + '/' + 'alpha_t.out',
                              ignore_errors=True)
            with open('simple-examples/ckpt/' + algo_name + '_' + model_name + '/' + 'alpha_t.out', 'ab') as f:
                f.write(str(alpha_t))
                f.write(',')
            train_case_weights = gpu.sqrt((1 - epsilon_t) / epsilon_t) * np.multiply(train_case_weights, np.asarray(case_scores))
            train_case_weights = np.ravel(normalize(np.asarray(train_case_weights).reshape(1, -1), norm='l1'))
            if method == 'stddev':
                new_train_case_weights = reject_outliers(train_case_weights)
            elif method == 'sqrt':
                new_train_case_weights = sqrt_norm(train_case_weights)
            start_idx = 0
            for i in range(len(train_sentence_list)):
                this_sentence_length = len(train_sentence_list[i])
                sentence_tokens = [v for v in new_train_case_weights[start_idx:(this_sentence_length + start_idx)]
                                   if np.isfinite(v)]
                if len(sentence_tokens) == 0:
                    # note: the original assigned to an undefined name (this_sentence_weights)
                    train_sentence_weights[i] = 0
                else:
                    train_sentence_weights[i] = np.mean([v for v in new_train_case_weights[start_idx:(this_sentence_length + start_idx)]
                                                         if np.isfinite(v)])
                start_idx += this_sentence_length
            train_sentence_weights = np.ravel(normalize(np.asarray(train_sentence_weights).reshape(1, -1), norm='l1'))
            new_train_data = reader.weighted_sentence_selection(train_sentence_list,
                                                                train_sentence_weights,
                                                                random_training_order)
            test_set_probs = output_test_set_probs(session, mtest, test_data, tf.no_op())
            for i in range(len(test_set_probs)):
                test_set_probs[i] = test_set_probs[i] * alpha_t
                full_test_set_logits[i] += test_set_probs[i]
            test_perplexity = run_epoch(session, mtest, test_data, tf.no_op())
            print("Test Perplexity: %.3f" % test_perplexity)
    alpha_t_sum = np.sum(alpha_t_list)
    for i in range(len(full_test_set_logits)):
        full_test_set_logits[i] = full_test_set_logits[i] / alpha_t_sum
    ensemble_perplexity = classify_ensemble(test_data, full_test_set_logits, 1, 1)
    print(ensemble_perplexity)
def output_and_cost(self, epoch, set_name='train'):
    self.timer_logger('output_and_cost {0}'.format(set_name), time.time())
    self.results['current'] = self.output(self.results['current'])
    if self.problem == 'regression' and self.clip_values == 1:
        # clip values into the [0,1] range
        self.results['current'] = (self.results['current'] * (self.results['current'] >= 0))
        self.results['current'] = (((self.results['current'] < 1) * self.results['current']) +
                                   (self.results['current'] > 1))
    if set_name != 'train':
        if set_name == 'no_label_test':
            if 'prediction_test' not in self.results:
                if self.problem == 'classification':
                    self.results['prediction_test'] = np.argmax(
                        self.results['current'].as_numpy_array(), axis=1)
                else:
                    self.results['prediction_test'] = self.results['current'].as_numpy_array()
            else:
                if self.problem == 'classification':
                    self.results['prediction_test'] = np.hstack([
                        self.results['prediction_test'],
                        np.argmax(self.results['current'].as_numpy_array(), axis=1)])
                else:
                    self.results['prediction_test'] = np.vstack([
                        self.results['prediction_test'],
                        self.results['current'].as_numpy_array()])
        elif set_name == 'cv_predict':
            if self.create_cv_predictions and set_name == 'cv_predict':
                if 'prediction_cv' not in self.results:
                    self.results['prediction_cv'] = self.results['current'].as_numpy_array()
                else:
                    self.results['prediction_cv'] = np.vstack([
                        self.results['prediction_cv'],
                        self.results['current'].as_numpy_array()])
        else:
            if self.problem == 'classification':
                self.set_error_by_epoch[set_name][epoch] += (np.sum(np.equal(
                    np.argmax(self.results['current'].as_numpy_array(), axis=1),
                    self.batch_y.T)))
            else:
                self.set_error_by_epoch[set_name][epoch] += gpu.sqrt(
                    gpu.sum(((self.results['current'] - self.batch_y)**2) *
                            float(self.batch.shape[0])) / float(self.y.shape[1]))
            if self.cost == 'auc':
                if self.problem == 'regression':
                    if set_name + ' roc_auc' not in self.results:
                        self.results[set_name + ' roc_auc'] = ([
                            np.matrix(self.results['current'].as_numpy_array()).T,
                            np.matrix(self.batch_y).T])
                    else:
                        self.results[set_name + ' roc_auc'] = [
                            np.hstack([self.results[set_name + ' roc_auc'][0],
                                       np.matrix(self.results['current'].as_numpy_array()).T]),
                            np.hstack([self.results[set_name + ' roc_auc'][1],
                                       np.matrix(self.batch_y).T])]
                else:
                    if set_name + ' roc_auc' not in self.results:
                        self.results[set_name + ' roc_auc'] = ([
                            np.matrix(self.results['current'].as_numpy_array()[:, 1]).T,
                            np.matrix(self.batch_y)])
                    else:
                        self.results[set_name + ' roc_auc'] = [
                            np.vstack([np.matrix(self.results[set_name + ' roc_auc'][0]),
                                       np.matrix(self.results['current'].as_numpy_array()[:, 1]).T]),
                            np.vstack([self.results[set_name + ' roc_auc'][1],
                                       np.matrix(self.batch_y)])]
    self.timer_logger('output_and_cost {0}'.format(set_name), time.time())