def applyDeltaWeights(self, dWList, dBList, updateOnlyLast, batchSize):
    if self.useRPROP:
        for i in reversed(xrange(self.NumberOfLayers - 1)):
            cp.rprop(self.Weights[i], dWList[i], self.DeltaWeightsOld[i],
                     self.WeightsLearnRate[i], self.cfg.finetune_cost)
            cp.rprop(self.Bias[i], dBList[i], self.DeltaBiasOld[i],
                     self.BiasLearnRate[i], self.cfg.finetune_cost)
            if updateOnlyLast:
                break
    else:
        for i in reversed(xrange(self.NumberOfLayers - 1)):
            W, B = self.Weights[i], self.Bias[i]
            dW, dWo = dWList[i], self.DeltaWeightsOld[i]
            dB, dBo = dBList[i], self.DeltaBiasOld[i]
            # blend in the previous update (momentum term): dW += momentum * dWo
            cp.apply_binary_functor(dW, dWo, cp.binary_functor.XPBY,
                                    self.cfg.finetune_momentum)
            cp.apply_binary_functor(dB, dBo, cp.binary_functor.XPBY,
                                    self.cfg.finetune_momentum)
            cp.learn_step_weight_decay(W, dW,
                                       self.cfg.finetune_learnrate / batchSize,
                                       self.cfg.finetune_cost)
            cp.learn_step_weight_decay(B, dB,
                                       self.cfg.finetune_learnrate / batchSize,
                                       self.cfg.finetune_cost)
            # remember the applied updates for the next momentum term
            cp.copy(dWo, dW)
            cp.copy(dBo, dB)
            if updateOnlyLast:
                break
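# For reference, a minimal numpy sketch of what the non-RPROP branch computes
# per layer (illustrative names; assumes learn_step_weight_decay performs
# W += lr * (dW - cost * W)):
import numpy as np

def momentum_weight_decay_step(W, dW, dW_old, lr, momentum, cost):
    """One momentum + weight-decay step; returns the updated weights and
    the update to remember for the next momentum term."""
    dW = dW + momentum * dW_old   # blend in the previous update (XPBY)
    W = W + lr * (dW - cost * W)  # gradient step with weight decay
    return W, dW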
def test(self, input_matrix, teacher_matrix):
    """Function to test the network

    @param input_matrix -- matrix consisting of input data to the network.
    @param teacher_matrix -- matrix consisting of labels of input data.
    """
    number_of_pictures = input_matrix.shape[-1]
    mse = 0
    squared_errors = cp.dev_matrix_cmf(self.neuron_layer[-1].deltas.h,
                                       self.neuron_layer[-1].deltas.w)
    for batch in xrange(number_of_pictures / self.batch_size):
        index_begin = self.batch_size * batch
        index_end = index_begin + self.batch_size
        self.neuron_layer[0].activations = cp.push(
            input_matrix[:, index_begin:index_end]
            .astype('float32').copy('F'))
        teachbatch = cp.push(
            teacher_matrix[:, index_begin:index_end]
            .astype('float32').copy('F'))
        for i in xrange(self.number_of_layers):
            self.weight_layer[i].forward()
        # calculate the error at the output layer (without this, deltas
        # would still hold stale values from the last training batch)
        cp.apply_binary_functor(self.neuron_layer[-1].deltas, teachbatch,
                                cp.binary_functor.COPY)
        cp.apply_binary_functor(self.neuron_layer[-1].deltas,
                                self.neuron_layer[-1].activations,
                                cp.binary_functor.SUBTRACT)
        cp.apply_binary_functor(squared_errors, self.neuron_layer[-1].deltas,
                                cp.binary_functor.COPY)
        cp.apply_scalar_functor(squared_errors, cp.scalar_functor.SQUARE)
        mse += cp.sum(squared_errors)
        teachbatch.dealloc()
    print "MSE: ", (mse / number_of_pictures)
    squared_errors.dealloc()
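# A host-side reference for the accumulated error (sketch; predictions and
# teacher are full matrices with one sample per column):
import numpy as np

def batched_mse(predictions, teacher):
    """Sum of squared output errors, averaged over the number of samples,
    matching the quantity printed by the GPU loop above."""
    return np.sum((teacher - predictions) ** 2) / float(teacher.shape[-1])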
def updateLayer(self, layernum, sample=True):
    L = self.layers[layernum]
    if layernum == 0:
        self.downPass(layernum + 1, sample=sample)
    if layernum == len(self.layers) - 1:
        self.upPass(layernum - 1, sample)
    if 0 < layernum < len(self.layers) - 1:
        hi = self.layers[layernum + 1]
        lo = self.layers[layernum - 1]
        wlo = self.weights[layernum - 1]
        whi = self.weights[layernum]
        cp.prod(L.act, whi.mat, hi.act, 'n', 'n')
        cp.matrix_plus_col(L.act, whi.bias_lo)
        tmp = L.act.copy()
        cp.prod(L.act, wlo.mat, lo.act, 't', 'n')
        cp.matrix_plus_col(L.act, wlo.bias_hi)
        # add parts from above/below
        cp.apply_binary_functor(L.act, tmp, cp.binary_functor.AXPBY, 0.5, 0.5)
        tmp.dealloc()
        L.nonlinearity()
        if sample:
            L.sample()
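# In numpy terms, the middle-layer update averages the top-down and
# bottom-up inputs before applying the nonlinearity (a sketch assuming
# sigmoid units; names are illustrative):
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def update_middle_layer(w_lo, w_hi, bias_from_above, bias_from_below,
                        act_below, act_above):
    top_down = np.dot(w_hi, act_above) + bias_from_above[:, None]
    bottom_up = np.dot(w_lo.T, act_below) + bias_from_below[:, None]
    return sigmoid(0.5 * (top_down + bottom_up))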
def delta_hidden(self, weight, knownDerivative, netInput):
    deltaLo = cp.dev_tensor_float_cm([weight.shape[0], netInput.shape[1]])
    cp.prod(deltaLo, weight, knownDerivative, 'n', 'n')
    # derivative of the sigmoid, evaluated on the layer's activations
    deriv = netInput.copy()
    cp.apply_scalar_functor(deriv, cp.scalar_functor.DSIGM)
    cp.apply_binary_functor(deltaLo, deriv, cp.binary_functor.MULT)
    deriv.dealloc()
    return deltaLo
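# The same computation in numpy (sketch; assumes DSIGM evaluates the
# sigmoid derivative x * (1 - x) on the already-squashed activations):
import numpy as np

def delta_hidden_np(weight, known_derivative, net_input):
    delta_lo = np.dot(weight, known_derivative)
    return delta_lo * net_input * (1.0 - net_input)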
def delta_output(self, calculated, correct):
    derivative = cp.dev_tensor_float_cm([calculated.shape[0],
                                         correct.shape[1]])
    h = cp.dev_tensor_float_cm(derivative.shape)
    cp.copy(derivative, calculated)
    cp.apply_scalar_functor(derivative, cp.scalar_functor.DSIGM)
    cp.copy(h, correct)
    cp.apply_binary_functor(h, calculated, cp.binary_functor.SUBTRACT)
    cp.apply_binary_functor(derivative, h, cp.binary_functor.MULT)
    h.dealloc()
    return derivative
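# Equivalent numpy sketch: output delta of a squared-error loss through a
# sigmoid (again assuming DSIGM computes x * (1 - x)):
import numpy as np

def delta_output_np(calculated, correct):
    return calculated * (1.0 - calculated) * (correct - calculated)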
def backward(self):
    """Backward pass: calculate the deltas of the lower layer,
    then update the weights."""
    cp.prod(self.source.deltas, self.weight, self.target.deltas, 't', 'n')
    h = cp.dev_matrix_cmf(self.source.activations.h,
                          self.source.activations.w)
    cp.apply_binary_functor(h, self.source.activations,
                            cp.binary_functor.COPY)
    self.source.d_nonlinearity(h)
    cp.apply_binary_functor(self.source.deltas, h, cp.binary_functor.MULT)
    h.dealloc()
    self.weight_update()
def update_stats(self, batch):
    vmin = cp.dev_tensor_float(batch.shape[0])
    vmax = cp.dev_tensor_float(batch.shape[0])
    mean = cp.dev_tensor_float(batch.shape[0])
    mean2 = cp.dev_tensor_float(batch.shape[0])
    map(lambda x: cp.fill(x, 0), [mean, mean2])
    cp.reduce_to_col(mean, batch)
    cp.reduce_to_col(mean2, batch, cp.reduce_functor.ADD_SQUARED)
    cp.reduce_to_col(vmin, batch, cp.reduce_functor.MIN)
    cp.reduce_to_col(vmax, batch, cp.reduce_functor.MAX)
    if "N" in self.__dict__:
        self.N += batch.shape[1]
        cp.apply_binary_functor(self.mean, mean, cp.binary_functor.ADD)
        cp.apply_binary_functor(self.mean2, mean2, cp.binary_functor.ADD)
        cp.apply_binary_functor(self.min, vmin, cp.binary_functor.MIN)
        # note: the maximum must be updated with vmax, not vmin
        cp.apply_binary_functor(self.max, vmax, cp.binary_functor.MAX)
        mean.dealloc()
        mean2.dealloc()
        vmin.dealloc()
        vmax.dealloc()
    else:
        self.N = batch.shape[1]
        self.mean = mean
        self.mean2 = mean2
        self.min = vmin
        self.max = vmax
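# The same running statistics in numpy, as a reference (direct translation;
# a dict stands in for the object's attributes):
import numpy as np

def update_stats_np(state, batch):
    """Accumulate per-feature sum, sum of squares, min and max over the
    sample columns of batch; state is None on the first call."""
    if state is None:
        return {'N': batch.shape[1],
                'mean': batch.sum(axis=1),
                'mean2': (batch ** 2).sum(axis=1),
                'min': batch.min(axis=1),
                'max': batch.max(axis=1)}
    state['N'] += batch.shape[1]
    state['mean'] += batch.sum(axis=1)
    state['mean2'] += (batch ** 2).sum(axis=1)
    state['min'] = np.minimum(state['min'], batch.min(axis=1))
    state['max'] = np.maximum(state['max'], batch.max(axis=1))
    return state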
def delta_outputSoftMax(self, calculated, correct):
    derivative = calculated.copy()
    cp.apply_scalar_functor(derivative, cp.scalar_functor.EXP)
    sums = cp.dev_tensor_float(calculated.shape[1])
    cp.fill(sums, 0)
    cp.reduce_to_row(sums, derivative, cp.reduce_functor.ADD)
    cp.apply_scalar_functor(sums, cp.scalar_functor.ADD,
                            0.1 / derivative.shape[0])
    rv = cp.transposed_view(derivative)
    cp.matrix_divide_col(rv, sums)
    cp.apply_binary_functor(derivative, correct,
                            cp.binary_functor.AXPBY, -1., 1.)
    sums.dealloc()
    return derivative
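# In numpy terms this computes the cross-entropy gradient of a smoothed
# softmax (sketch; the additive constant mirrors the normalizer smoothing
# above):
import numpy as np

def delta_output_softmax_np(calculated, correct):
    """Softmax over each sample column with a smoothed normalizer,
    then the gradient correct - softmax(calculated)."""
    e = np.exp(calculated)
    sums = e.sum(axis=0) + 0.1 / calculated.shape[0]
    return correct - e / sums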
def backward(self, output, teacher, indices, batchSize, updateOnlyLast,
             batch_idx):
    deltaWeights = []
    deltaBias = []
    derivative = []
    if self.cfg.finetune_softmax:
        derivative.append(self.delta_outputSoftMax(output[-1], teacher))
    else:
        derivative.append(self.delta_output(output[-1], teacher))
    for i in reversed(xrange(1, self.NumberOfLayers - 1)):
        derivative.append(
            self.delta_hidden(self.Weights[i], derivative[-1], output[i]))
    derivative.reverse()

    # delta weights
    for i in reversed(xrange(self.NumberOfLayers - 1)):
        deltaWeights.append(
            self.calculateDeltaWeights(derivative[i], output[i],
                                       self.Weights[i]))
    deltaWeights.reverse()

    # delta bias
    for i in xrange(self.NumberOfLayers - 1):
        self.createFilled(deltaBias, self.Bias[i].size, 1, 0)
        cp.reduce_to_col(deltaBias[-1], derivative[i])

    # weight update
    if self.cfg.finetune_online_learning and not self.useRPROP:
        self.applyDeltaWeights(deltaWeights, deltaBias, updateOnlyLast,
                               batchSize)
    elif (self.cfg.finetune_online_learning and self.useRPROP
          and batch_idx % 16 == 0):
        self.applyDeltaWeights(self.dWeights, self.dBias, updateOnlyLast,
                               batchSize)
        map(lambda x: cp.fill(x, 0), self.dWeights)
        map(lambda x: cp.fill(x, 0), self.dBias)
    else:
        for i in xrange(self.NumberOfLayers - 1):
            cp.apply_binary_functor(self.dWeights[i], deltaWeights[i],
                                    cp.binary_functor.ADD)
            cp.apply_binary_functor(self.dBias[i], deltaBias[i],
                                    cp.binary_functor.ADD)

    da = lambda x: x.dealloc()
    map(da, deltaWeights)
    map(da, deltaBias)
    map(da, derivative)
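# Note the three update regimes dispatched above: plain online updates after
# every batch, RPROP updates applied only every 16th batch (RPROP relies on
# gradient signs, which are too noisy per batch), and accumulation into
# self.dWeights / self.dBias for batch learning, where applyDeltaWeights is
# presumably invoked by the surrounding training loop at the end of an epoch.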
def finalize_stats(self):
    """ use N, mean and mean2 to generate data for normalization """
    # mean := mean / N
    cp.apply_scalar_functor(self.mean, cp.scalar_functor.MULT, 1. / self.N)
    sqmean = self.mean.copy()
    cp.apply_scalar_functor(sqmean, cp.scalar_functor.SQUARE)

    # mean2 := mean2 / N - squared mean (the variance)
    cp.apply_scalar_functor(self.mean2, cp.scalar_functor.MULT, 1. / self.N)
    cp.apply_binary_functor(self.mean2, sqmean, cp.binary_functor.SUBTRACT)

    # std is the square root of the variance
    cp.apply_scalar_functor(self.mean2, cp.scalar_functor.ADD,
                            0.01)  # numerical stability
    cp.apply_scalar_functor(self.mean2, cp.scalar_functor.SQRT)
    self.std = self.mean2
    sqmean.dealloc()

    # negate mean (so we can add it to normalize a matrix)
    cp.apply_scalar_functor(self.mean, cp.scalar_functor.MULT, -1.)
    self.negative_mean = self.mean

    # calculate range
    cp.apply_binary_functor(self.max, self.min, cp.binary_functor.SUBTRACT)
    cp.apply_scalar_functor(self.max, cp.scalar_functor.MAX, 1.)
    self.range = self.max
    cp.apply_scalar_functor(self.range, cp.scalar_functor.ADD,
                            0.01)  # numerical stability

    # calculate negative min
    cp.apply_scalar_functor(self.min, cp.scalar_functor.MULT, -1.)
    self.negative_min = self.min

    assert not cp.has_nan(self.negative_mean)
    assert not cp.has_inf(self.negative_mean)
    assert not cp.has_nan(self.std)
    assert not cp.has_inf(self.std)
    assert not cp.has_nan(self.negative_min)
    assert not cp.has_inf(self.range)
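# A minimal numpy sketch of how these finalized statistics would typically
# be applied (the usage is an assumption; negative_mean and std are
# per-feature vectors, batch holds one sample per column):
import numpy as np

def normalize_np(batch, negative_mean, std):
    """Zero-center with the (already negated) mean, then scale by std."""
    return (batch + negative_mean[:, None]) / std[:, None]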
def get_partition_function(self):
    """Estimate the log partition function with annealed importance
    sampling, annealing the inverse temperature beta from 0 to 1."""
    tmp = cp.dev_tensor_float_cm([self.cfg.chains, 1])
    tmp2 = cp.dev_tensor_float_cm([self.num_hids, self.cfg.chains])
    steps = self.cfg.steps
    beta = 1.0 / steps
    beta_old = 0
    for step in xrange(steps):
        # accumulate the importance weights for the current samples:
        # r += log p_beta(v) - log p_beta_old(v)
        self.p_k(beta_old, tmp, tmp2,
                 lambda x: cp.apply_binary_functor(
                     self.r, x, cp.binary_functor.SUBTRACT))
        self.p_k(beta, tmp, tmp2,
                 lambda x: cp.apply_binary_functor(
                     self.r, x, cp.binary_functor.ADD))
        # sample v_i at the current temperature
        self.sample_markov_chains(beta, step)
        # increase beta (linear schedule; a piecewise schedule with finer
        # steps early on is a possible alternative)
        beta_old = beta
        beta += 1.0 / steps
    # multiply r by the partition function of the baseline RBM
    # (an addition, since r is kept in log space)
    self.r_ = self.r.np
    self.partition_baserate = \
        (np.log(1 + np.exp(self.baserate_bias_))).sum() \
        + self.num_hids * np.log(2)
    self.r_ += self.partition_baserate
    tmp.dealloc()
    tmp2.dealloc()
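# The baseline term factorizes because the base-rate RBM has zero weights:
# log Z_A = sum_j log(1 + exp(b_j)) + num_hids * log(2). A small numpy
# restatement of the expression used above:
import numpy as np

def log_partition_baserate(baserate_bias, num_hids):
    """Log partition function of an RBM with zero weights: each visible
    unit contributes log(1 + exp(bias)), each hidden unit log(2)."""
    return np.log(1.0 + np.exp(baserate_bias)).sum() + num_hids * np.log(2.0)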
def getErr(self, layernum, orig_data):
    cp.apply_binary_functor(self.layers[layernum].act, orig_data,
                            cp.binary_functor.SUBTRACT)
    sqerr = cp.norm2(self.layers[layernum].act) ** 2
    return sqerr / (self.layers[layernum].size * self.cfg.batchsize)
import cuv_python as cp

C = cp.dev_tensor_float_cm([2048, 2048])  # column-major tensor
A = cp.dev_tensor_float_cm([2048, 2048])
B = cp.dev_tensor_float_cm([2048, 2048])
cp.fill(C, 0)               # fill with some defined values, not really necessary here
cp.sequence(A)
cp.sequence(B)
cp.apply_binary_functor(B, A, cp.binary_functor.MULT)  # elementwise multiplication
B *= A                      # operators also work (elementwise)
cp.prod(C, A, B, 'n', 't')  # matrix multiplication, C = A * B^T
C = cp.prod(A, B.T)         # numpy-like form, allocates a new matrix for the result
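# A quick host-side sanity check against numpy (the .np property, used in
# other snippets here, is assumed to copy a device tensor back to host):
import numpy as np

diff = np.abs(C.np - np.dot(A.np, B.np.T)).max()
print "max abs deviation from numpy:", diff  # should be tiny relative to the entries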
class MLP:
    """A Multi-Layer Perceptron."""

    def __init__(self, neurons, batch_size):
        """Constructor

        @param neurons -- array of sizes of layers.
        @param batch_size -- size of batch being used for training.
        """
        self.number_of_layers = len(neurons) - 1
        self.batch_size = batch_size
        self.neuron_layer = []
        self.weight_layer = []
        for i in xrange(self.number_of_layers + 1):
            dim1 = neurons[i]
            self.neuron_layer.append(neuron_layer(dim1, self.batch_size))
        for i in xrange(self.number_of_layers):
            self.weight_layer.append(weight_layer(self.neuron_layer[i],
                                                  self.neuron_layer[i + 1]))

    def train(self, input_matrix, teacher_matrix, number_of_epochs):
        """Function to train the network

        @param input_matrix -- matrix consisting of input data to the network.
        @param teacher_matrix -- matrix consisting of labels of input data.
        @param number_of_epochs -- number of rounds the network is to be trained.
        """
        number_of_pictures = input_matrix.shape[-1]
        squared_errors = cp.dev_matrix_cmf(self.neuron_layer[-1].deltas.h,
                                           self.neuron_layer[-1].deltas.w)
        for r in xrange(number_of_epochs):
            print "Epoch ", r + 1, "/", number_of_epochs
            mse = 0
            for batch in xrange(number_of_pictures / self.batch_size):
                index_begin = self.batch_size * batch
                index_end = self.batch_size + index_begin

                # push input and teacher batches to GPU memory
                self.neuron_layer[0].activations = cp.push(
                    input_matrix[:, index_begin:index_end]
                    .astype('float32').copy('F'))
                teachbatch = cp.push(
                    teacher_matrix[:, index_begin:index_end]
                    .astype('float32').copy('F'))

                # forward pass
                for i in xrange(self.number_of_layers):
                    self.weight_layer[i].forward()

                # calculate error at output layer
                cp.apply_binary_functor(self.neuron_layer[-1].deltas,
                                        teachbatch, cp.binary_functor.COPY)
                cp.apply_binary_functor(self.neuron_layer[-1].deltas,
                                        self.neuron_layer[-1].activations,
                                        cp.binary_functor.SUBTRACT)
                cp.apply_binary_functor(squared_errors,
                                        self.neuron_layer[-1].deltas,
                                        cp.binary_functor.COPY)
                cp.apply_scalar_functor(squared_errors,
                                        cp.scalar_functor.SQUARE)
                mse += cp.sum(squared_errors)

                # backward pass
                for i in xrange(self.number_of_layers):
                    self.weight_layer[self.number_of_layers - i - 1].backward()

                # don't wait for the garbage collector
                teachbatch.dealloc()
                self.neuron_layer[0].activations.dealloc()

            print "MSE: ", (mse / number_of_pictures)
        squared_errors.dealloc()
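# A hypothetical end-to-end run with random data (shapes, sizes and the
# one-hot encoding are made up for illustration; neuron_layer and
# weight_layer from the same example are assumed to be in scope):
import numpy as np

np.random.seed(0)
X = np.random.rand(784, 512).astype('float32')  # 512 sample columns
Y = np.zeros((10, 512), dtype='float32')        # one-hot teacher matrix
Y[np.random.randint(10, size=512), np.arange(512)] = 1

mlp = MLP([784, 128, 10], batch_size=64)
mlp.train(X, Y, number_of_epochs=5)
mlp.test(X, Y)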