def visActProb(self):
    # negative phase
    super(DiscriminativeRBM, self).visActProb()
    self.v.apply_sigmoid()
    cm.dot(self.cW, self.h, target = self.c)
    self.c.add_col_vec(self.cb)
    softmax(self.c)
def _sample_h(self, v, x, sample=False, x_is_bias=False):
    # updates self.h
    # self.h = cm.empty((v.shape[0], self.output_dim))
    if x_is_bias:
        # Bias is precalculated
        self.h.assign(x)
    else:
        cm.dot(x, self.bg, self.h)
    self.h.add_dot(v, self.wg)
    # This is about 100 times faster than calling 'add_row_vec' to add biases.
    ones_cut = self._ones.get_col_slice(0, v.shape[0])
    self.h.add_dot(ones_cut.T, self.bhg)
    self.h.apply_sigmoid2(self.h)
    if sample:
        # Sample random values
        sampled = cm.empty((v.shape[0], self.output_dim))
        sampled.fill_with_rand()
        # Sample values of hiddens
        sampled.less_than(self.h, self.h)
def acceleration(self): #this sets self.hActProbs and self.normalizedVisMB and self.sqColLens self.hidActProbs(vis = self.negVis) cm.dot(self.factToHid, self.hActProbs, target = self.tempFactMB) self.tempFactMB.mult(-1) self.tempFactMB.mult(self.factResponses) cm.dot(self.visToFact, self.tempFactMB, target = self.normalizedAccel) #rename some things to be like Marc'Aurelio's code: normcoeff = self.tempRow2 lengthsq = self.tempRow #these next few lines repeat some work, but it is too confusing to cache all this stuff at the moment self.sqColLens.mult(1.0/self.numVis, target = lengthsq) lengthsq.add(small) #self.tempRow is what Marc'Aurelio calls lengthsq cm.sqrt(lengthsq, target = normcoeff) normcoeff.mult(lengthsq) #now self.tempRow2 has what Marc'Aurelio calls normcoeff normcoeff.reciprocal() self.normalizedAccel.mult(self.negVis, target = self.tempVisMB) self.tempVisMB.sum(axis=0, target = self.tempRow3) #this tempRow stuff is getting absurd self.tempRow3.mult(-1.0/self.numVis) self.negVis.mult_by_row(self.tempRow3, target = self.tempVisMB) self.normalizedAccel.mult_by_row(lengthsq, target = self.accel) self.accel.add(self.tempVisMB) self.accel.mult_by_row(normcoeff) #quadratic in v term contribution to gradient self.accel.add(self.negVis) self.accel.mult(2) #all parts before this point have a 2 show up because of differentiation #vis bias contribution self.accel.add_col_mult(self.visBias, -1)
def get_specrad(Ac):
    """Get the spectral radius of Ac using the power method."""
    m_size = Ac.shape[0]

    x = np.random.normal(0, 1, (m_size, 1))
    x = x / np.linalg.norm(x)
    x = cm.CUDAMatrix(x)
    y = cm.empty((m_size, 1))
    diff = 200
    eps = 1e-3
    b = 1e10
    c = 1e9
    max_its = 1e6

    n_its = 0

    while diff > eps and n_its < max_its:
        cm.dot(Ac, x, target=y)
        norm = y.euclid_norm()
        y.divide(norm, target=x)
        a = cm.dot(y.T, x).asarray()
        c = cm.dot(x.T, x).asarray()
        diff = np.abs(a - b)
        b = float(a)
        n_its += 1

    specrad = float(a / c)
    print 'Spectral radius:', specrad, 'Number of iterations:', n_its
    return specrad
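# get_specrad estimates the spectral radius (largest absolute eigenvalue) by power
# iteration run entirely on the GPU. The following is a minimal usage sketch that
# checks the estimate against NumPy on a small symmetric matrix (symmetry keeps the
# dominant eigenvalue real so the power method converges cleanly); the size and the
# comparison are illustrative assumptions, not part of the original code.
import numpy as np
import cudamat as cm

cm.cublas_init()

n = 500
W = np.random.randn(n, n).astype(np.float32)
A = (W + W.T) / 2                      # symmetric test matrix
A_gpu = cm.CUDAMatrix(A)

rho_gpu = get_specrad(A_gpu)
rho_np = np.max(np.abs(np.linalg.eigvalsh(A)))
print(abs(rho_gpu - rho_np))           # should be small, up to float32 error

cm.shutdown()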
def rbmHtoV(m, X):
    """Convey data from the hidden layer to the visible layer."""
    cm.cublas_init()

    # copy data to GPU
    data = cm.CUDAMatrix(cm.reformat(X))
    weight = cm.CUDAMatrix(cm.reformat(m.weight))
    biasV = cm.CUDAMatrix(cm.reformat(m.biasV))

    nCase = X.shape[0]
    nVis = biasV.asarray().size
    VisActP = cm.CUDAMatrix(np.zeros((nCase, nVis)))

    if m.type == "BB":
        cm.dot(data, weight.T, target = VisActP)
        VisActP.add_row_vec(biasV)
        VisActP.apply_sigmoid()
    elif m.type == "BG":
        cm.dot(data, weight.T, target = VisActP)
        VisActP.add_row_vec(biasV)
    elif m.type == "GB":
        pass

    result = VisActP.asarray()

    # free device memory
    data.free_device_memory()
    weight.free_device_memory()
    biasV.free_device_memory()
    VisActP.free_device_memory()

    cm.shutdown()

    return result
def ExactZ_binary_binary(model):
    assert len(model.layer) == 2, 'Only implemented for RBMs.'
    input_layer = model.layer[0]
    hidden_layer = model.layer[1]
    edge = model.edge[0]
    w = edge.params['weight']
    a = hidden_layer.params['bias']
    b = input_layer.params['bias']
    numvis, numhid = w.shape
    batchsize = 2**numvis
    input_layer.AllocateBatchsizeDependentMemory(batchsize)
    hidden_layer.AllocateBatchsizeDependentMemory(batchsize)
    all_inputs = GetAll(numvis)
    w_ais = cm.CUDAMatrix(np.zeros((1, batchsize)))
    input_layer.sample.overwrite(all_inputs)
    cm.dot(w.T, input_layer.sample, target=hidden_layer.state)
    hidden_layer.state.add_col_vec(a)
    cm.log_1_plus_exp(hidden_layer.state)
    w_ais.add_sums(hidden_layer.state, axis=0)
    w_ais.add_dot(b.T, input_layer.state)
    offset = float(w_ais.asarray().max())
    w_ais.subtract(offset)
    cm.exp(w_ais)
    z = offset + np.log(w_ais.asarray().sum())
    return z
def tests():
    a = np.random.rand(300, 500)
    b = np.random.rand(500, 300)
    start = timer()
    c = np.dot(a, b)
    nptime = timer() - start
    print('nptime', nptime)

    # shapes chosen so they are consistent with the (1000, 1000) output buffer z
    x = np.array(np.random.rand(1000, 1500), dtype='float32', order='F')
    y = np.array(np.random.rand(1500, 1000), dtype='float32', order='F')
    z = np.zeros((1000, 1000), order='F', dtype='float32')

    stream = cuda.stream()
    dx = cuda.to_device(x)
    dy = cuda.to_device(y)
    dz = cuda.to_device(z)
    start = timer()
    # C (m x n) = A (m x k) . B (k x n), so m = 1000, n = 1000, k = 1500
    blas.gemm('N', 'N', 1000, 1000, 1500, 1.0, dx, dy, 0.0, dz)
    cutime = timer() - start
    print('cutime', cutime)
    # dz.copy_to_host(z)
    print(dz[0])

    c = np.ones((1000, 1000), order='F', dtype='float32')
    print(c.shape)
    dc = cuda.to_device(c)
    # blockDim = (256,256)
    # gridDim = (((1000 + blockDim[0]-1)/blockDim[0]),((1000 + blockDim[1]-1)/blockDim[1]))
    blockDim = (30, 30)
    gridDim = (((c.shape[0] + blockDim[0]) - 1) / blockDim[0],
               ((c.shape[1] + blockDim[1]) - 1) / blockDim[1])
    start = timer()
    mtanh[gridDim, blockDim, stream](dc)
    tantime = timer() - start
    print('tantime', tantime)

    dc.copy_to_host(c, stream=stream)
    stream.synchronize()
    print(c)

    y = cm.CUDAMatrix(np.ones((1000, 1000)))
    start = timer()
    cm.tanh(y)
    cmtan = timer() - start
    print('cmtan', cmtan)

    x = cm.CUDAMatrix(np.random.rand(1000, 1500))
    y = cm.CUDAMatrix(np.random.rand(1500, 1000))
    start = timer()
    cm.dot(x, y)
    cmtime = timer() - start
    print('cmtime', cmtime)
def visActProbs(self, recomputeDynamicBias):
    if recomputeDynamicBias:
        self.updateDynamicVisBias()
    cm.dot(self.visToHid, self.hActs, target = self.negVis)
    self.negVis.add(self.dynamicVisBias)
    self.negVis.add_col_vec(self.visBias)
def hidActProb(self, vis, target):
    # positive phase
    # print self.W.shape
    # print vis.shape
    # print target.shape
    cm.dot(self.W.T, vis, target = target)
    target.add_col_vec(self.hb)
    target.apply_sigmoid()
def transform(self, v, h):
    """
    Parameters:
        v : the visible input activation
        h : the target to write the hidden activation
    """
    cm.dot(self.W.T, v, target = h)
    h.add_col_vec(self.hidden_bias)
    h.apply_sigmoid()
def reverse_transform(self, h, v):
    """
    Parameters:
        h : the hidden activation
        v : the target to write the visible activation
    """
    cm.dot(self.W, h, target = v)
    v.add_col_vec(self.visible_bias)
    v.apply_sigmoid()
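# transform / reverse_transform above implement one deterministic up-down pass of an
# RBM-style encoder. A minimal usage sketch follows; the TinyRBM container, the shapes,
# and the column-major layout (one example per column, implied by cm.dot(self.W.T, v, ...))
# are assumptions made for illustration and are not part of the original source.
import numpy as np
import cudamat as cm

cm.cublas_init()

n_vis, n_hid, n_cases = 784, 256, 128

class TinyRBM(object):
    pass

rbm = TinyRBM()
rbm.W = cm.CUDAMatrix(0.01 * np.random.randn(n_vis, n_hid))
rbm.hidden_bias = cm.CUDAMatrix(np.zeros((n_hid, 1)))
rbm.visible_bias = cm.CUDAMatrix(np.zeros((n_vis, 1)))

v = cm.CUDAMatrix(np.random.rand(n_vis, n_cases))   # one example per column
h = cm.empty((n_hid, n_cases))
v_rec = cm.empty((n_vis, n_cases))

# up pass, then down pass, reusing the functions defined above
transform(rbm, v, h)
reverse_transform(rbm, h, v_rec)

recon_err = np.mean((v.asarray() - v_rec.asarray()) ** 2)
print(recon_err)

cm.shutdown()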
def rmatvec(x):
    if isinstance(x, np.ndarray):
        x.resize((x.size, 1))
        x_gpu = CUDAMatrix(x)
        return cudamat.dot(a_gpu.transpose(), x_gpu).asarray()
    elif isinstance(x, CUDAMatrix):
        x_gpu = x
        return cudamat.dot(a_gpu.transpose(), x_gpu)
    else:
        raise ValueError('Expected CUDAMatrix or ndarray')
def hidActProbsRBM(self, vis = None):
    """ targ had better be on the gpu or None """
    if vis == None:
        vis = self.vis
    targ = self.hActProbsRBM
    cm.dot(self.visToHid.T, vis, target = targ)
    targ.add_col_vec(self.hidBiasRBM)
    self.hNetInputsRBM.assign(targ)  # needed later for Hamiltonian computation
    targ.apply_sigmoid()
def hidActProbs(self, targ = None, vis = None):
    """ targ had better be on the gpu or None """
    if targ == None:
        targ = self.hActProbs
    if vis == None:
        vis = self.vis
    cm.dot(self.visToHid.T, vis, target = targ)
    targ.add_col_vec(self.hidBias)
    targ.apply_sigmoid()
def forward_p_single(self, single_z):
    self.single_z = single_z
    self.activation_func.apply(self.single_z)
    cm.dot(self.single_z, self.weights, self.next_single_z)
    if self.use_bias:
        self.biases.mult(self.activation_func.apply_scalar(1), self.active_biases)
        self.next_single_z.add_row_vec(self.active_biases)
    return self.next_single_z
def backward_p(self, next_delta):
    # Compute weights grad.
    cm.dot(self.z.T, next_delta, self.weights_grad)
    # Compute biases grad.
    if self.use_bias:
        next_delta.sum(0, self.biases_grad)
    if self.level != 1:
        cm.dot(next_delta, self.weights.T, self.my_delta)
        self.activation_func.mult_with_derivative(self.my_delta, self.z)
    return self.my_delta
def ComputeUp(self, train=False, step=0, maxsteps=0): """ Computes the state of a layer, given the state of its incoming neighbours. Args: train: True if this computation is happening during training, False during evaluation. step: Training step. maxsteps: Maximum number of steps that will be taken (Some hyperparameters may depend on this.) """ logging.debug('ComputeUp in %s', self.name) self.dirty = False if self.is_input: self.GetData() else: for i, edge in enumerate(self.incoming_edge): if edge in self.outgoing_edge: continue inputs = self.incoming_neighbour[i].state if edge.conv: if i == 0: self.ConvolveUp(inputs, edge, self.state) else: self.AddConvoleUp(inputs, edge, self.state) else: w = edge.params['weight'] factor = edge.proto.up_factor if i == 0: cm.dot(w.T, inputs, target=self.state) if factor != 1: self.state.mult(factor) else: self.state.add_dot(w.T, inputs, mult=factor) b = self.params['bias'] if self.replicated_neighbour is None: self.state.add_col_vec(b) else: self.state.add_dot(b, self.replicated_neighbour.NN) self.ApplyActivation() if self.hyperparams.dropout: if train and maxsteps - step >= self.hyperparams.stop_dropout_for_last: # Randomly set states to zero. self.mask.fill_with_rand() self.mask.greater_than(self.hyperparams.dropout_prob) self.state.mult(self.mask) else: # Produce expected output. self.state.mult(1.0 - self.hyperparams.dropout_prob)
def UpdateStatesGPU(self, sType, _raaW, _raaB, _raaX, _raaY, _baaY, rDropout=0, bSample=False):
    # Compute the scale factor to compensate for dropout so that
    # average activations remain the same
    rScale = 1 / (1 - rDropout)

    # Compute activations
    cudamat.dot(_raaX, _raaW, target=_raaY)
    _raaY = _raaY.mult(rScale)
    _raaY.add_row_vec(_raaB)

    # Depending on the activation type...
    if (sType == "Logistic"):
        # Compute the logistic function
        _raaY.apply_sigmoid(_raaY)
    elif (sType == "Linear"):
        # Compute output layer states
        pass
    elif (sType == "HyperbolicTangent"):
        # Compute output layer states
        _raaY.apply_tanh(_raaY)

    # If stochastic binary states are required...
    if (bSample):
        # Depending on the activation type...
        if (sType == "Logistic"):
            # Sample output layer states
            _baaY.fill_with_rand()
            _baaY.less_than(_raaY)
        elif (sType == "Linear"):
            # Sample output layer states
            _baaY.fill_with_randn()
            _baaY.add(_raaY)
        elif (sType == "HyperbolicTangent"):
            # Sample output layer states
            _baaY.fill_with_rand()
            _baaY.mult(2)
            _baaY.subtract(1)
            _baaY.less_than(_raaY)
def hidNetInpts(self, recomputeDynamicBias = True, targ = None, vis = None):
    """ targ had better be on the gpu or None """
    if recomputeDynamicBias:
        self.updateDynamicHidBias()
    if targ == None:
        targ = self.hActProbs
    if vis == None:
        vis = self.vis
    cm.dot(self.visToHid.T, vis, target = targ)
    targ.add(self.dynamicHidBias)
    targ.add_col_vec(self.hidBias)
def test_T_field():
    m = 256
    n = 128
    cm1 = np.array(np.random.rand(n, m) * 10, dtype=np.float32, order='F')
    cm2 = np.array(np.random.rand(m, 1) * 10, dtype=np.float32, order='F')
    gm1 = cm.CUDAMatrix(cm1)
    gm2 = cm.CUDAMatrix(cm2)

    # test dot
    gm = cm.dot(gm2.T, gm1.T)
    c = np.dot(cm2.T, cm1.T)
    gm.copy_to_host()
    assert np.max(np.abs(gm.numpy_array - c)) < 10**-2, \
        "Error in CUDAMatrix.dot with TransposedCUDAMatrix exceeded threshold"

    # test add_dot
    cm3 = np.array(np.random.rand(1, n) * 10, dtype=np.float32, order='F')
    gm3 = cm.CUDAMatrix(cm3)
    gm3.add_dot(gm2.T, gm1.T)
    c = cm3 + np.dot(cm2.T, cm1.T)
    gm3.copy_to_host()
    assert np.max(np.abs(gm3.numpy_array - c)) < 10**-2, \
        "Error in CUDAMatrix.add_dot TransposedCUDAMatrix exceeded threshold"

    # test add_sums
    gm2.add_sums(gm1.T, axis = 1)
    c = cm2 + np.atleast_2d(cm1.sum(0)).T
    gm2.copy_to_host()
    assert np.max(np.abs(gm2.numpy_array - c)) < 10**-2, \
        "Error in CUDAMatrix.add_sums TransposedCUDAMatrix exceeded threshold"
def negative_free_energy(self, gpu_data):
    """
    Computes the negative free-energy.
    Outputs a reference to a pre-allocated GPU variable containing the result.
    """
    cm.dot(self.W, gpu_data, self.gpu_h)
    self.gpu_h.add_col_vec(self.c)
    # to avoid memory creation, using gpu_h
    # and gpu_h_sample for these computations
    cm.exp(self.gpu_h, self.gpu_h_sample)
    self.gpu_h_sample.add(1.)
    cm.log(self.gpu_h_sample, self.gpu_h)
    self.gpu_h.sum(axis=0, target=self.gpu_negative_free_energy)
    self.gpu_negative_free_energy.add_dot(self.b.T, gpu_data)
    return self.gpu_negative_free_energy
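# For reference, the routine above computes the binary-RBM negative free energy
# -F(v) = b^T v + sum_j log(1 + exp(W v + c)_j) for each column of gpu_data.
# Below is a CPU sketch of the same quantity, handy for checking the GPU version on
# small inputs; the shapes follow the cm.dot(self.W, gpu_data, ...) convention
# (W is hidden x visible, examples are columns) and are assumptions about the class.
import numpy as np

def negative_free_energy_cpu(W, b, c, v):
    """CPU reference for the GPU routine above.

    W: (n_hidden, n_visible), b: (n_visible, 1), c: (n_hidden, 1),
    v: (n_visible, n_cases) with one example per column.
    Returns a length-n_cases vector of -F(v)."""
    act = W.dot(v) + c                               # (n_hidden, n_cases)
    softplus = np.log1p(np.exp(act)).sum(axis=0)     # hidden-unit contribution
    return (b.T.dot(v)).ravel() + softplus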
def test(self, dev_test, dev_lbl):
    # forward pass
    cm.dot(self.w_w1.T, dev_test, target = self.h)
    self.h.add_col_vec(self.w_b1)
    self.h.apply_sigmoid()

    cm.dot(self.w_w2.T, self.h, target = self.out)
    self.out.add_col_vec(self.w_b2)
    self.out.apply_sigmoid()

    # compute error
    self.out.subtract(dev_lbl)
    print "Testing misclassification rate: " + str(np.mean(np.abs(self.out.asarray()) > 0.5))
def rbm_update(self,gpu_data): # Positive phase cm.dot(self.W,gpu_data,self.gpu_h) self.gpu_h.add_col_vec(self.c) self.gpu_h.apply_sigmoid() self.dW.mult(self.momentum) self.dc.mult(self.momentum) self.db.mult(self.momentum) self.dW.add_dot(self.gpu_h,gpu_data.T) self.dc.add_sums(self.gpu_h,axis=1,mult=1.) self.db.add_sums(gpu_data,axis=1,mult=1.) if self.use_persistent_chain: cm.dot(self.W,self.gpu_x_sample,self.gpu_h) self.gpu_h.add_col_vec(self.c) self.gpu_h.apply_sigmoid() for it in range(self.n_gibbs_steps): self.gpu_h_sample.fill_with_rand() self.gpu_h_sample.less_than(self.gpu_h) # Down pass cm.dot(self.W.T,self.gpu_h_sample,self.gpu_x) self.gpu_x.add_col_vec(self.b) self.gpu_x.apply_sigmoid() self.gpu_x_sample.fill_with_rand() self.gpu_x_sample.less_than(self.gpu_x) # Up pass cm.dot(self.W,self.gpu_x_sample,self.gpu_h) self.gpu_h.add_col_vec(self.c) self.gpu_h.apply_sigmoid() self.dW.subtract_dot(self.gpu_h,self.gpu_x_sample.T) self.dc.add_sums(self.gpu_h,axis=1,mult=-1.) self.db.add_sums(self.gpu_x_sample,axis=1,mult=-1.) # Update RBM self.W.add_mult(self.dW,alpha=self.learning_rate/self.minibatch_size) self.c.add_mult(self.dc,alpha=self.learning_rate/self.minibatch_size) self.b.add_mult(self.db,alpha=self.learning_rate/self.minibatch_size) #if self.print_first_row: # gpu_data.copy_to_host() # print gpu_data.numpy_array[:,0] # self.gpu_x.copy_to_host() # print self.gpu_x.numpy_array[:,0] # Compute reconstruction error self.gpu_x.subtract(gpu_data) err = self.gpu_x.euclid_norm() err = err**2 err /= self.gpu_x.shape[1] return err
def CDStats(self, vis, normalizedVis, hid, posPhase):
    multiplier = 1.0 if posPhase else -1.0
    self.dhidBias.add_sums(hid, 1, mult = multiplier)
    self.dvisBias.add_sums(vis, 1, mult = multiplier)

    cm.dot(self.factToHid, hid, target = self.tempFactMB)
    self.tempFactMB.mult(self.factResponses)
    # I modified cudamat's add_dot to take a multiplier
    # need to multiply by 0.5 to make finite diffs agree
    # self.dfactToHid.add_dot(self.factResponsesSq, hid.T, mult = 0.5*multiplier)
    if posPhase:
        self.dvisToFact.add_dot(normalizedVis, self.tempFactMB.T)
    else:
        self.dvisToFact.subtract_dot(normalizedVis, self.tempFactMB.T)
def calculate_snprank(self, gamma, usegpu):
    """Runs the SNPrank algorithm on the input data, using gamma as the damping
    factor. usegpu enables GPU computing (via the CUDAMat library) for the matrix
    multiplication. Returns the SNPrank scores and the diagonal (main effect) of
    the original GAIN matrix."""
    # A GAIN matrix is an NxN matrix
    m, n = self.GAIN.shape
    if m != n:
        raise ValueError("Input is not an NxN matrix")

    # Vector of column sums
    colsum = self.GAIN.sum(axis=0)
    # Get indices of c vector that are not zero
    colsum_nzidx = colsum.nonzero()[0]

    D = zeros((n, n))
    T_nz = ones(n)

    # Where a column doesn't sum to 0, the diagonal in D
    # ought to be the reciprocal of the column sum.
    # Likewise T_nz ought to be 1-gamma rather than 1.
    for i in colsum_nzidx:
        D[i][i] = 1 / colsum[i]
        T_nz[i] = 1 - gamma

    T = zeros((n, n))
    if usegpu:
        import cudamat as cm
        # initialize CUDAMat
        cm.init()
        # Copy GAIN and D matrices to GPU
        G_gpu = cm.CUDAMatrix(self.GAIN)
        D_gpu = cm.CUDAMatrix(D)
        # Do matrix multiplication on the GPU
        GD_prod = cm.dot(G_gpu, D_gpu)
        # Transition matrix
        T = (gamma * GD_prod.asarray()) + \
            (self.GAIN.diagonal().reshape(n, 1) * T_nz) / self.GAIN.trace()
    else:
        # Transition matrix
        T = (gamma * dot(self.GAIN, D)) + \
            (self.GAIN.diagonal().reshape(n, 1) * T_nz) / self.GAIN.trace()

    # r is an arbitrary vector, which we initialize to 1/n
    r = (ones(n)).reshape(n, 1) / n

    # Cutoff for matrix convergence
    threshold = 10**(-4)

    # Multiply r by T until r converges to within the threshold
    while True:
        r_old, r = r, normalize(dot(T, r))
        if all(abs(r - r_old) < threshold):
            break

    return r.reshape(1, n)[0], self.GAIN.diagonal()
def AccumulateDeriv(self, edge, deriv):
    """Accumulate the derivative w.r.t the outputs of this layer.

    A layer needs to compute derivatives w.r.t its outputs. These outputs may have
    been connected to lots of other nodes through outgoing edges. This method adds
    up the derivatives contributed by each outgoing edge. It gets derivatives w.r.t
    the inputs at the other end of its outgoing edge.

    Args:
        edge: The edge which is sending the derivative.
        deriv: The derivative w.r.t the inputs at the other end of this edge.
    """
    if self.is_input:
        return
    if self.dirty:  # If some derivatives have already been received.
        self.deriv.add_dot(edge.params["weight"], deriv)
    else:  # Receiving derivative for the first time.
        cm.dot(edge.params["weight"], deriv, target=self.deriv)
        self.dirty = True
def ComputeGradientGPU(self, raaE): # For each layer... for iLayer in range(self.iLayers-1,-1,-1): # Measure the layer input # (iSamples, iFeatures) = self.oaStates[iLayer].raaX.shape (iFeatures, iSamples) = self.oaStates[iLayer].raaX.shape # Compute the gradient of error with respect to weight # self.oaStates[iLayer].raaWg = numpy.dot(self.oaStates[iLayer].raaX.T, raaE) self.oaStates[iLayer].raaWg = cudamat.dot(self.oaStates[iLayer].raaX, raaE.T) # Compute gradient of error with respect to bias # self.oaStates[iLayer].raBg = numpy.sum(raaE,0) self.oaStates[iLayer].raBg = raaE.sum(1) # If error is needed for next layer... if(iLayer>0): # Backpropagate the error # raaE = numpy.dot(raaE,self.oaLayers[iLayer].raaW.T) raaE = cudamat.dot(self.oaLayers[iLayer].raaW.T, raaE) # Compute the sample count for prior layer # iSamples = raaE.shape[0]*self.oaLayers[iLayer].iDecimation iSamples = raaE.shape[1]*self.oaLayers[iLayer].iDecimation # Undecimate error # raaE = numpy.reshape(raaE,(iSamples,-1)) iSize = numpy.prod(raaE.shape) iN = iSize//iSamples raaE.reshape((iN,iSamples)) # Compute deferred hadamard product with derivative so shapes match # raaE = raaE*self.oaStates[iLayer].raaD raaE.mult(self.oaStates[iLayer].raaD) # Get the serialized gradient vector raG = self.GetGradientVector() # Return gradient and error metrics # return((raG, rError, rRmse)) return(raG)
def pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False):
    """
    Compute the pairwise euclidean distance between matrices a and b.

    Parameters
    ----------
    a : np.ndarray (n, f)
        first matrix
    b : np.ndarray (m, f)
        second matrix
    returnAsGPU : boolean, optional (default False)
        if True, returns a cudamat matrix still on the GPU, else returns an np.ndarray
    squared : boolean, optional (default False)
        if True, return the squared euclidean distance matrix

    Returns
    -------
    c : (n x m) np.ndarray or cudamat.CUDAMatrix
        pairwise euclidean distance matrix
    """
    # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m).
    # First compute in c_GPU the squared euclidean distance and return its
    # square root. At each cell [i,j] of c, we want to have
    # sum{k in range(f)} ( (a[i,k] - b[j,k])^2 ). We know that
    # (a-b)^2 = a^2 - 2ab + b^2. Thus we want to have in each cell of c:
    # sum{k in range(f)} ( a[i,k]^2 - 2a[i,k]b[j,k] + b[j,k]^2 ).
    a_GPU = cudamat.CUDAMatrix(a)
    b_GPU = cudamat.CUDAMatrix(b)

    # Multiply a by b transpose to obtain in each cell [i,j] of c the
    # value sum{k in range(f)} ( a[i,k]b[j,k] )
    c_GPU = cudamat.dot(a_GPU, b_GPU.transpose())
    # multiply by -2 to have sum{k in range(f)} ( -2a[i,k]b[j,k] )
    c_GPU.mult(-2)

    # Compute the vectors of the sums of squared elements.
    a_GPU = cudamat.pow(a_GPU, 2).sum(axis=1)
    b_GPU = cudamat.pow(b_GPU, 2).sum(axis=1)

    # Add the vectors to each column (respectively row) of c.
    # sum{k in range(f)} ( a[i,k]^2 - 2a[i,k]b[j,k] )
    c_GPU.add_col_vec(a_GPU)
    # sum{k in range(f)} ( a[i,k]^2 - 2a[i,k]b[j,k] + b[j,k]^2 )
    c_GPU.add_row_vec(b_GPU.transpose())

    if not squared:
        c_GPU = cudamat.sqrt(c_GPU)

    if returnAsGPU:
        return c_GPU
    else:
        return c_GPU.asarray()
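# A quick way to sanity-check pairwiseEuclideanGPU is to compare it against a CPU
# reference on small random inputs. The sketch below assumes cudamat has already been
# initialized with cudamat.cublas_init(); the shapes and tolerance are arbitrary
# illustrative choices, not part of the original code.
import numpy as np
import cudamat
from scipy.spatial.distance import cdist

cudamat.cublas_init()

a = np.random.rand(64, 16).astype(np.float32)
b = np.random.rand(32, 16).astype(np.float32)

d_gpu = pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False)
d_cpu = cdist(a, b, metric='euclidean')

# float32 GPU arithmetic, so use a loose tolerance
print(np.max(np.abs(d_gpu - d_cpu)) < 1e-3)

cudamat.shutdown()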
def backProp(self, error):
    # print 'back propagation'
    self.dW[self.H-1].add_dot(self.vis[self.H-1], error.T)
    # print 'self.vis'
    # self.vis[self.H-1].copy_to_host()
    # print self.vis[self.H-1].numpy_array
    # print 'self.dW'
    # self.dW[self.H-1].copy_to_host()
    # print self.dW[self.H-1].numpy_array
    # print 'error 2'
    # error.copy_to_host()
    # print error.numpy_array
    self.db[self.H-1].add_sums(error, axis = 1)

    for i in list(reversed(range(self.H-1))):
        delta = cm.empty((self.W[i+1].shape[0], error.shape[1]))
        cm.dot(self.W[i+1], error, target = delta)  # delta : 2000*256
        learn.mult_by_sigmoid_deriv(delta, self.vis[i+1])
        self.dW[i].add_dot(self.vis[i], delta.T)
        self.db[i].add_sums(delta, axis = 1)
        error = delta
def hidActProbs(self, targ = None, vis = None):
    """ targ had better be on the gpu or None """
    if targ == None:
        targ = self.hActProbs
    if vis == None:
        vis = self.vis
    # recall that self.acceleration calls self.hidActProbs
    normalizeInputData(vis, self.tempVisMB, self.sqColLens, self.tempRow, self.normalizedVisMB)

    # cm.dot(self.visToFact.T, vis, target = self.factResponses)
    cm.dot(self.visToFact.T, self.normalizedVisMB, target = self.factResponses)
    self.factResponses.mult(self.factResponses, target = self.factResponsesSq)
    cm.dot(self.factToHid.T, self.factResponsesSq, target = targ)
    targ.add_col_vec(self.hidBias)
    self.hNetInputs.assign(targ)  # needed later in Hamiltonian computation
    targ.apply_sigmoid()
def test_dot():
    m = 128
    k = 256
    n = 64
    a = np.array(np.random.randn(m, k) * 10, dtype=np.float32, order='F')
    b = np.array(np.random.randn(k, n) * 10, dtype=np.float32, order='F')
    c = np.array(np.random.randn(m, n) * 10, dtype=np.float32, order='F')
    alpha = 2.
    beta = 0.3
    r = beta * c + alpha * np.dot(a, b)

    m1 = cm.CUDAMatrix(a)
    m2 = cm.CUDAMatrix(b)
    m3 = cm.CUDAMatrix(c)
    m3 = cm.dot(m1, m2, target=m3, alpha=alpha, beta=beta)
    m3.copy_to_host()

    assert np.max(np.abs(r - m3.numpy_array)) < 10**-2, \
        "Error in CUDAMatrix.dot exceeded threshold"
def Test(): A = np.float32(np.random.randn(*(2000, 2000))) A = np.complex64(np.ones((2000, 2000)) + 1j * np.ones((2000, 2000))) AT = A.T.copy() A_32 = A #np.float32(A) AT_32 = AT #np.float32(AT) T = ClassTimeIt.ClassTimeIt() # create two random matrices and copy them to the GPU g_A0 = cm.CUDAMatrix(A) g_AT0 = cm.CUDAMatrix(AT) # perform calculations on the GPU P0 = cm.dot(g_AT0, g_A0).asarray() #d = cm.sum(axis = 0) T.timeit("GPU0") del (g_AT0, g_A0) #T.reinit() # copy d back to the host (CPU) and print g_A1 = gpuarray.to_gpu(A) g_AT1 = gpuarray.to_gpu(AT) #time.sleep(5) #T.timeit("tranf0") g_P1 = culinalg.dot(g_AT1, g_A1) P1 = g_P1.get() #T.timeit("tranf1") T.timeit("GPU1") np_P = np.dot(AT, A) T.timeit("np") #print g_P-np_P print(np.max(np_P - P0)) print(np.max(np_P - P1))
def search(indices, feat, feature_map, ID_map): feature_table = None ID_table = None indices = indices[0][0:2] #print indices #print feature_map[0][0],ID_map[0][0:4] for category in indices: if feature_table is None: #print category if (feature_map[category]): feature_table = np.copy(feature_map[category]) if (ID_map[category]): ID_table = np.copy(ID_map[category]) else: if (feature_map[category]): feature_table = np.vstack( (feature_table, feature_map[category])) if (ID_map[category]): ID_table = np.hstack((ID_table, ID_map[category])) #print feature_table[1] a = cm.CUDAMatrix(feat) #print feat c = cm.CUDAMatrix(feature_table) d = cm.dot(c, a) e = d.asarray() #print e ind = np.argsort(-e, axis=0) ind = ind[0:100] #print ind ID_result = ID_table[ind] ''' for index in ind: if ID_result is None: ID_result = np.copy(ID_map[index]) else: ID_result = np.hstack((ID_result, ID_map[index])) ''' return ID_result
def setVariables(self):
    n, m, r = self.n, self.m, self.rank
    self.G_gpu = cm.CUDAMatrix(self.G)
    self.W_gpu = cm.CUDAMatrix(self.W)
    self.X_gpu = cm.CUDAMatrix(self.X)

    self.XTX_gpu = cm.dot(self.X_gpu.T, self.X_gpu)
    self.XTXpos_gpu = cm.empty((m, m))
    self.XTX_gpu.greater_than(0, target=self.XTXpos_gpu)
    self.XTXpos_gpu.mult(self.XTX_gpu)
    self.XTXneg_gpu = cm.empty((m, m))
    self.XTXpos_gpu.subtract(self.XTX_gpu, target=self.XTXneg_gpu)

    self.XTXnegW_gpu = cm.empty((m, r))
    self.XTXposW_gpu = cm.empty((m, r))
    self.GWT_gpu = cm.empty((m, m))
    self.update1_gpu = cm.empty((m, r))
    self.update2_gpu = cm.empty((m, r))
    self.GTG_gpu = cm.empty((r, r))
    self.XTXnegG_gpu = cm.empty((m, r))
    self.XTXposG_gpu = cm.empty((m, r))
def feed_forward(self,input=None): #optionally allow passing input as an argument if input is not None: self.input = input for index,l in enumerate(self.layer): if(index == 0): input = self.input else: input = self.layer[index-1].output l.input = input #print(str(index) + " " + str(l.weights.shape) + " " + str(l.input.shape)) l.weighted_sums = cm.dot(l.weights,l.input) #apply activation function if(l.activation == 'squash'): pass #TODO: write kernal for this #l.output = l.weighted_sums / (1+np.abs(l.weighted_sums)) elif(l.activation == 'sigmoid'): l.output = l.weighted_sums.apply_sigmoid() #elif(l.activation == 'linear_rectifier'): # l.output = np.maximum(0,l.weighted_sums) else: #base case is linear l.output = l.weighted_sums #if(l.dropout is not None and self.train == True): # if(l.dropout == 0.5): # l.output = l.output*np.random.randint(0,2,l.output.shape); # else: # l.output = l.output*np.random.binomial(1,l.dropout,l.output.shape); #elif(l.dropout is not None and self.train == False): # l.output = l.output*(1.0 - l.dropout); self.output = self.layer[len(self.layer)-1].output self.output.copy_to_host() self.output = self.output.numpy_array self.output = self.output[0:-1,:]
def compute_energy_mcRBM_visual(self, data, normdata, energy, VF, FH, bias_cov, bias_vis, w_mean, bias_mean, t1, t2, t6, feat, featsq, feat_mean, length, lengthsq, normcoeff, small, num_vis): # normalize input data vectors data.mult(data, target=t6) # DxP (nr input dims x nr samples) t6.sum(axis=0, target=lengthsq) # 1xP lengthsq.mult(0.5, target=energy) # energy of quadratic regularization term lengthsq.mult(1. / num_vis) # normalize by number of components (like std) lengthsq.add(small) # small prevents division by 0 cmt.sqrt(lengthsq, target=length) length.reciprocal(target=normcoeff) # 1xP data.mult_by_row(normcoeff, target=normdata) # normalized data ## potential # covariance contribution cmt.dot(VF.T, normdata, target=feat) # HxP (nr factors x nr samples) feat.mult(feat, target=featsq) # HxP cmt.dot(FH.T, featsq, target=t1) # OxP (nr cov hiddens x nr samples) t1.mult(-0.5) t1.add_col_vec(bias_cov) # OxP cmt.exp(t1) # OxP t1.add(1, target=t2) # OxP cmt.log(t2) t2.mult(-1) energy.add_sums(t2, axis=0) # mean contribution cmt.dot(w_mean.T, data, target=feat_mean) # HxP (nr mean hiddens x nr samples) feat_mean.add_col_vec(bias_mean) # HxP cmt.exp(feat_mean) feat_mean.add(1) cmt.log(feat_mean) feat_mean.mult(-1) energy.add_sums(feat_mean, axis=0) # visible bias term data.mult_by_col(bias_vis, target=t6) t6.mult(-1) # DxP energy.add_sums(t6, axis=0) # 1xP # kinetic data.mult(data, target=t6) energy.add_sums(t6, axis=0, mult=.5)
def compute_output(self, gpu_data): """ Computes p(y|x). Puts the result in self.gpu_p_y_given_x. """ cm.dot(self.W, gpu_data, self.gpu_act_from_x) self.gpu_act_from_x.add_col_vec(self.c) for c in range(self.n_classes): cm.dot(self.U, self.gpu_target_vectors.slice(c, c + 1), self.gpu_act_from_y) # to avoid memory creation, using gpu_h # and gpu_h_sample for these computations self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y, target=self.gpu_h) cm.exp(self.gpu_h, self.gpu_h_sample) self.gpu_h_sample.add(1.) cm.log(self.gpu_h_sample, self.gpu_h) self.gpu_h.sum(axis=0, target=self.gpu_negative_free_energy_for_y) cm.dot(self.d.T, self.gpu_target_vectors.slice(c, c + 1), target=self.gpu_bias_from_y) self.gpu_negative_free_energy_for_y.add_col_vec( self.gpu_bias_from_y) self.gpu_negative_free_energy_for_y.transpose( target=self.gpu_negative_free_energy.slice(c, c + 1)) # Subtracting mean for more stable softmax computation self.gpu_negative_free_energy.sum( axis=1, target=self.gpu_mean_negative_free_energy) self.gpu_mean_negative_free_energy.divide(-self.n_classes) self.gpu_negative_free_energy.add_col_vec( self.gpu_mean_negative_free_energy) cm.exp(self.gpu_negative_free_energy, target=self.gpu_negative_free_energy) self.gpu_negative_free_energy.sum(axis=1, target=self.gpu_p_y_given_x_norm) for c in range(self.n_classes): self.gpu_negative_free_energy.slice(c, c + 1).divide( self.gpu_p_y_given_x_norm, target=self.gpu_p_y_given_x.slice(c, c + 1)) self.gpu_p_y_given_x.transpose(target=self.gpu_p_y_given_x_trans)
def ff(x0_cpu): data_size = x0_cpu.shape[1] x_l0 = cm.empty((num_input, data_size)) x_l0.assign(cm.CUDAMatrix(x0_cpu)) x_l1 = cm.empty((num_hid, data_size)) cm.dot(w1.T, x_l0, target=x_l1) x_l1.add_col_vec(b1) x_l1.apply_sigmoid() x_l2 = cm.empty((num_hid, data_size)) del x_l0 cm.dot(w2.T, x_l1, target=x_l2) x_l2.add_col_vec(b2) x_l2.apply_sigmoid() x_l3 = cm.empty((num_hid, data_size)) del x_l1 cm.dot(w3.T, x_l2, target=x_l3) x_l3.add_col_vec(b3) x_l3.apply_sigmoid() x_l4 = cm.empty((num_hid, data_size)) del x_l2 cm.dot(w4.T, x_l3, target=x_l4) x_l4.add_col_vec(b4) x_l4.apply_sigmoid() x_l5 = cm.empty((num_hid, data_size)) del x_l3 cm.dot(w5.T, x_l4, target=x_l5) x_l5.add_col_vec(b5) x_l5.apply_sigmoid() x_output = cm.empty((num_output, data_size)) del x_l4 tmp_x_output = cm.empty((num_output, data_size)) tmp_x_output_sums = cm.empty((1, data_size)) cm.dot(wo.T, x_l5, target=tmp_x_output) tmp_x_output.add_col_vec(bo) cm.exp(tmp_x_output) tmp_x_output.sum(axis=0, target=tmp_x_output_sums) tmp_x_output_sums.reciprocal() tmp_x_output.mult_by_row(tmp_x_output_sums) x_output.assign(tmp_x_output) x_output.mult_by_col(state_prior_gpu_rec) cm.log(x_output) x_output.mult(1. / np.log(10)) xo = x_output.asarray() return xo
import cudamat as cm
import numpy as np

cm.cuda_set_device(0)
cm.cublas_init()

t = np.load('/home/scw4750/frelam_20161027/get_feature/data/feature_0w-5w.npy')
t.dtype = '<f'

feat = t[0:40000]
print t
a = cm.CUDAMatrix(feat)
c = cm.dot(a, a.T)
e = cm.sqrt(c)
e = e.asarray()
# e.dtype = 'float'
print len(e)

dioa = None
for index, item in enumerate(e):
    if dioa is None:
        temp = np.array(item[index])
        dioa = np.copy(temp)
    else:
        temp = np.array(item[index])
        dioa = np.vstack((dioa, temp))

feat = t[40000:50000]
a = cm.CUDAMatrix(feat)
c = cm.dot(a, a.T)
e_2 = cm.sqrt(c)
e_2 = e_2.asarray()
print len(e_2)
for index, item in enumerate(e_2):
def heatup(duration):
    """Heat-up the GPU for a while so it enters full-performance mode"""
    t1 = time.time()
    while time.time() - t1 < duration:
        cmt.dot(cmt.empty((200, 200)), cmt.empty((200, 200)))
def bench_dot(X, Y, col, row):
    cmt.dot(X.T, Y)
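# heatup and bench_dot are meant to be driven by a small timing harness. The following
# is a minimal sketch of such a driver; the matrix size, repetition count, and the use
# of time.time for wall-clock timing are assumptions (the original benchmark driver is
# not shown here), and bench_dot ignores its col/row arguments.
import time
import numpy as np
import cudamat as cmt

cmt.cublas_init()

n = 2048
X = cmt.CUDAMatrix(np.random.rand(n, n))
Y = cmt.CUDAMatrix(np.random.rand(n, n))

heatup(2)  # spin the GPU up to full clocks first

reps = 50
t0 = time.time()
for _ in range(reps):
    bench_dot(X, Y, None, None)
# GPU calls may return before the work finishes; pulling one result back to the
# host forces completion so the timing is not overly optimistic.
cmt.dot(X.T, Y).asarray()
elapsed = time.time() - t0
print('approx time per dot: %.4f s' % (elapsed / (reps + 1)))

cmt.shutdown()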
def visActProbs(self):
    cm.dot(self.visToHid, self.hActs, target=self.negVis)
    self.negVis.add_col_vec(self.visBias)
def run(self, iterations):
    for i in range(0, iterations):
        # update H
        cm.dot(self.W_gpu.T, self.X_gpu, target=self.WTX_gpu)
        cm.dot(self.W_gpu.T, self.W_gpu, target=self.WTW_gpu)
        cm.dot(self.WTW_gpu, self.H_gpu, target=self.WTWH_gpu)
        self.H_gpu.mult(self.WTX_gpu).divide(self.WTWH_gpu)

        # update W
        cm.dot(self.X_gpu, self.H_gpu.T, target=self.XHT_gpu)
        cm.dot(self.W_gpu, self.H_gpu, target=self.WH_gpu)
        cm.dot(self.WH_gpu, self.H_gpu.T, target=self.WHHT_gpu)
        self.W_gpu.mult(self.XHT_gpu).divide(self.WHHT_gpu)

        # test for convergence
        if (i % self.niter_test_conv == 0) and self.checkConvergence():
            print "NMF converged after %i iterations" % i
            break
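# The run method above applies the standard multiplicative NMF updates,
#   H <- H * (W^T X) / (W^T W H)   and   W <- W * (X H^T) / (W H H^T),
# with every product kept on the GPU. Below is a self-contained sketch of the same
# updates without the surrounding class; the sizes, iteration count, and buffer names
# are illustrative assumptions only.
import numpy as np
import cudamat as cm

cm.cublas_init()

n, m, r = 500, 300, 20
X = cm.CUDAMatrix(np.random.rand(n, m))
W = cm.CUDAMatrix(np.random.rand(n, r))
H = cm.CUDAMatrix(np.random.rand(r, m))

# preallocated temporaries, mirroring the *_gpu buffers used by the class
WTX, WTW, WTWH = cm.empty((r, m)), cm.empty((r, r)), cm.empty((r, m))
XHT, WH, WHHT = cm.empty((n, r)), cm.empty((n, m)), cm.empty((n, r))

for i in range(200):
    # H <- H * (W^T X) / (W^T W H)
    cm.dot(W.T, X, target=WTX)
    cm.dot(W.T, W, target=WTW)
    cm.dot(WTW, H, target=WTWH)
    H.mult(WTX).divide(WTWH)
    # W <- W * (X H^T) / (W H H^T)
    cm.dot(X, H.T, target=XHT)
    cm.dot(W, H, target=WH)
    cm.dot(WH, H.T, target=WHHT)
    W.mult(XHT).divide(WHHT)

# reconstruction error, computed on the host
print(np.linalg.norm(X.asarray() - W.asarray().dot(H.asarray())))

cm.shutdown()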
def run(self, iterations): for i in range(0,iterations): # F = XG(G.T G)^-1 cm.dot(self.G_gpu.T, self.G_gpu, target=self.GTG_gpu) try: self.GTGpinv_gpu = cm.CUDAMatrix(np.linalg.inv( self.GTG_gpu.asarray())) except LinAlgError: self.GTGpinv_gpu = cm.CUDAMatrix(np.linalg.pinv( self.GTG_gpu.asarray())) cm.dot(self.X_gpu, self.G_gpu, target=self.XG_gpu) cm.dot(self.XG_gpu, self.GTGpinv_gpu, target=self.F_gpu) # preparation and calculation of the matrix separations cm.dot(self.X_gpu.T, self.F_gpu, target=self.XTF_gpu) cm.dot(self.F_gpu.T, self.F_gpu, target=self.FTF_gpu) self.XTF_gpu.greater_than(0, target=self.XTFgreater_gpu) self.XTF_gpu.mult(self.XTFgreater_gpu, target=self.XTFpos_gpu) self.XTFpos_gpu.subtract(self.XTF_gpu, target=self.XTFneg_gpu) self.FTF_gpu.greater_than(0, target=self.FTFgreater_gpu) self.FTF_gpu.mult(self.FTFgreater_gpu, target=self.FTFpos_gpu) self.FTFpos_gpu.subtract(self.FTF_gpu, target=self.FTFneg_gpu) # compute the G update cm.dot(self.G_gpu, self.FTFpos_gpu, target=self.GFTFpos_gpu) cm.dot(self.G_gpu, self.FTFneg_gpu, target=self.GFTFneg_gpu) self.XTFpos_gpu.add(self.GFTFneg_gpu) self.XTFneg_gpu.add(self.GFTFpos_gpu) self.XTFpos_gpu.add_scalar(10**-9) self.XTFneg_gpu.add_scalar(10**-9) self.XTFpos_gpu.divide(self.XTFneg_gpu) cm.sqrt(self.XTFpos_gpu) self.G_gpu.mult(self.XTFpos_gpu) # test for convergence if (i % self.niter_test_conv == 0) and self.checkConvergence(): print "NMF converged after %i iterations" % i break
def get_gradient(self, x, n_updates=1): """Use Gibbs sampling to estimate the contrastive divergence gradient. - x: a cuda matrix having different variables on different columns and observations on the rows (context) - n_updates: number of CD iterations. Default value: 1 Returns a tuple (dw, dbv, dbh, da, db) that contains the gradients of the weights and the biases of the visibles and the hidden respectively and the autoregressive gradients da and db. This is not the true gradient anymore as I didn't explicitly divide by n for the gradients that are based on sums over n datapoints. The BPTT gradient with respect to the reservoir recurrent and input weight is computed as well. """ # useful quantities n = x.shape[0] w, a, b, bv, bh = self.wg, self.ag, self.bg, self.bvg, self.bhg # Pre-calculate dynamic biases. dynamic_h = cm.empty((n, self.output_dim)) dynamic_v = cm.empty((n, self.visible_dim)) cm.dot(x, self.ag, dynamic_v) cm.dot(x, self.bg, dynamic_h) # first update of the hidden units for the data term self._sample_h(self.v, dynamic_h, sample=False, x_is_bias=True) # n updates of both v and h for the model term self.h_data = cm.empty(self.h.shape) self.v_data = cm.empty(self.v.shape) self.h_data.assign(self.h) self.v_data.assign(self.v) #self._sample_h(self.v, dynamic_h, sample=True, x_is_bias=True) for i in range(n_updates): self._stochastic_h() self._sample_v(self.h, dynamic_v, x_is_bias=True) self._sample_h(self.v, dynamic_h, sample=False, x_is_bias=True) # Is preallocating really that "bad" for for example data_term? # find dw dw = cm.empty(self.w.shape) cm.dot(self.v_data.T, self.h_data, dw) dw.subtract_dot(self.v.T, self.h) # find da d_v = cm.empty(self.v.shape) # TODO: perhaps this is inefficient... da = cm.empty(self.a.shape) self.v_data.subtract(self.v, d_v) cm.dot(x.T, d_v, da) # find db d_h = cm.empty(self.h.shape) # TODO: perhaps this is inefficient... # TODO: I should probably just compute the gradient with respect to the # biases once and use that for both updating matrix b and the biases # itself. db = cm.empty(self.b.shape) self.h_data.subtract(self.h, d_h) cm.dot(x.T, d_h, db) # find dbv dbv = cm.empty((1, self.visible_dim)) self.v_data.sum(axis=0, target=dbv) dbv.add_sums(self.v, axis=0, mult=-1.0) # Subtract sum # find dbh dbh = cm.empty((1, self.output_dim)) self.h_data.sum(axis=0, target=dbh) dbh.add_sums(self.h, axis=0, mult=-1.0) # Subtract sum #### BPTT code #### # TODO: Some of the computations above should be combined with the # gradient calculation here. d_reservoir = cm.empty((self.context_dim, n)) # Do some transposes because get_col_slice is faster than get_row_slice. 
x_T = x.transpose() d_h_T = d_h.transpose() d_v_T = d_v.transpose() # Pre-calculate the tanh derivatives dtanh = cm.empty(x_T.shape) x_T.apply_dtanh(target=dtanh) # Last state gets no gradient information from the future drt = d_reservoir.get_col_slice(n - 1, n) drt.assign(0) # Main BPTT loop for i in range(n - 1, 0, -1): drt = d_reservoir.get_col_slice(i, i + 1) dr_pre_t = d_reservoir.get_col_slice(i - 1, i) d_vt = d_v_T.get_col_slice(i, i + 1) d_ht = d_h_T.get_col_slice(i, i + 1) # Add visible component # TODO: I could actually pre-calculate this outside the loop drt.add_dot(self.ag, d_vt) # Add hidden component drt.add_dot(self.bg, d_ht) # Mult with derivative drt.mult(dtanh.get_col_slice(i, i + 1)) # Backpropagate cm.dot(self.reservoir.w.T, drt, dr_pre_t) d_vt = d_v_T.get_col_slice(0, 1) d_ht = d_h_T.get_col_slice(0, 1) dr_pre_t = d_reservoir.get_col_slice(0, 1) # Add visible component dr_pre_t.add_dot(self.ag, d_vt) # Add hidden component dr_pre_t.add_dot(self.bg, d_ht) # Mult with derivative dr_pre_t.mult(dtanh.get_col_slice(0, 1)) # Compute weight derivatives dw_res = cm.empty(self.reservoir.w.shape) dw_res_in = cm.empty(self.reservoir.w_in.shape) # dw_res <- d_reservoir * x(t-1) # The first state has obviously no previous state so we can ignore it. cm.dot(d_reservoir.get_col_slice(1, n), x_T.get_col_slice(0, n - 1).T, target=dw_res) # dw_res_in <- d_reservoir * v cm.dot(d_reservoir, self.v_data, target=dw_res_in) ################### return (dw, dbv, dbh, da, db, dw_res, dw_res_in)
def get_CD_gradient(self, x, n_updates=1): """Use Gibbs sampling to estimate the contrastive divergence gradient. - x: a cuda matrix having different variables on different columns and observations on the rows (context) - n_updates: number of CD iterations. Default value: 1 Returns a tuple (dw, dbv, dbh, da, db) that contains the gradients of the weights and the biases of the visibles and the hidden respectively and the autoregressive gradients da and db. This is not the true gradient anymore as I didn't explicitly divide by n for the gradients that are based on sums over n datapoints. """ # useful quantities n = x.shape[0] w, a, b, bv, bh = self.wg, self.ag, self.bg, self.bvg, self.bhg # Pre-calculate dynamic biases. dynamic_h = cm.empty((n, self.output_dim)) dynamic_v = cm.empty((n, self.visible_dim)) cm.dot(x, self.ag, dynamic_v) cm.dot(x, self.bg, dynamic_h) # first update of the hidden units for the data term self._sample_h(self.v, dynamic_h, sample=False, x_is_bias=True) # n updates of both v and h for the model term # TODO: I set things back to sutskever's way of sampling but should # really compare it to Ben's method some time. self.h_data = cm.empty(self.h.shape) self.v_data = cm.empty(self.v.shape) self.h_data.assign(self.h) self.v_data.assign(self.v) for i in range(n_updates): self._stochastic_h() self._sample_v(self.h, dynamic_v, x_is_bias=True) self._sample_h(self.v, dynamic_h, sample=False, x_is_bias=True) # Is preallocating really that "bad" for for example data_term? # find dw dw = cm.empty(self.w.shape) cm.dot(self.v_data.T, self.h_data, dw) dw.subtract_dot(self.v.T, self.h) # find da temp = cm.empty(self.v.shape) # TODO: perhaps this is inefficient... da = cm.empty(self.a.shape) self.v_data.subtract(self.v, temp) cm.dot(x.T, temp, da) # find db temp = cm.empty(self.h.shape) # TODO: perhaps this is inefficient... db = cm.empty(self.b.shape) self.h_data.subtract(self.h, temp) cm.dot(x.T, temp, db) # find dbv dbv = cm.empty((1, self.visible_dim)) self.v_data.sum(axis=0, target=dbv) dbv.add_sums(self.v, axis=0, mult=-1.0) # Subtract sum # find dbh dbh = cm.empty((1, self.output_dim)) self.h_data.sum(axis=0, target=dbh) dbh.add_sums(self.h, axis=0, mult=-1.0) # Subtract sum return (dw, dbv, dbh, da, db)
for epoch in range(epochs): for xt, yt in mdp.utils.progressinfo(zip(x[0:n_train_samples], y[0:n_train_samples])): batch_size = xt.shape[0] / 2 state = crbm.reservoir.simulate(cm.CUDAMatrix(xt)) crbm.v = cm.CUDAMatrix(xt[:batch_size, :]) crbm.train(state.get_row_slice(0, batch_size), decay=0, epsilon=.001, momentum=.9) crbm.v = cm.CUDAMatrix(xt[batch_size:, :]) crbm.train(state.get_row_slice(batch_size, xt.shape[0]), decay=0, epsilon=.001, momentum=.9) print 'epoch', epoch, 'finished' error = 0 for xt, yt in mdp.utils.progressinfo(zip(x[0:n_train_samples], y[0:n_train_samples])): state = crbm.reservoir.simulate(cm.CUDAMatrix(xt)) v = cm.CUDAMatrix(sp.random.normal(0, 1, (xt.shape))) crbm.v = v n = xt.shape[0] dynamic_h = cm.empty((n, crbm.output_dim)) dynamic_v = cm.empty((n, crbm.visible_dim)) cm.dot(state, crbm.ag, dynamic_v) cm.dot(state, crbm.bg, dynamic_h) for i in range(25): crbm._sample_h(crbm.v, dynamic_h, sample=True, x_is_bias=True) crbm._sample_v(crbm.h, dynamic_v, x_is_bias=True) error += sp.mean((crbm.v.asarray() - xt) ** 2) print error / n_train_samples # Evaluate reconstruction error
def encode(self):
    cm.dot(self.visToHid.T, self.inp, target = self.hid)
def decode(self):
    cm.dot(self.hidToVis.T, self.hid, target = self.out)
import numpy as np
import cudamat as cm

cm.cublas_init()

# create two random matrices and copy them to the GPU
a = cm.CUDAMatrix(np.random.rand(32, 256))
b = cm.CUDAMatrix(np.random.rand(256, 32))

# perform calculations on the GPU
c = cm.dot(a, b)
d = c.sum(axis = 0)

# copy d back to the host (CPU) and print
print d.asarray()
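# A small variation on the example above: when the same product is computed repeatedly
# (as in the training loops elsewhere in this section), preallocating the output with
# cm.empty and passing it as target avoids allocating a new GPU matrix on every call.
# This is a sketch of that pattern, not a required change to the example.
import numpy as np
import cudamat as cm

cm.cublas_init()

a = cm.CUDAMatrix(np.random.rand(32, 256))
b = cm.CUDAMatrix(np.random.rand(256, 32))
c = cm.empty((32, 32))          # reused output buffer

for _ in range(100):
    cm.dot(a, b, target=c)      # writes into c instead of allocating a new matrix

print(c.asarray().sum())

cm.shutdown()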
def forward_prop(self, x, thetas):
    num_thetas = len(thetas)

    # add ones to end
    cm.dot(x, self.layer_expand_mask[0], self.activ_layers_temp[0])
    cm.dot(self.activ_layers[0], self.clear_vec, self.activ_layers[0])
    self.activ_layers[0].add(self.activ_layers_temp[0])

    cm.dot(self.activ_layers[0], thetas[0].T, self.z[1])

    for i in range(1, num_thetas):
        cm.dot(self.z[i].apply_sigmoid(), self.layer_expand_mask[i], self.activ_layers_temp[i])
        cm.dot(self.activ_layers[i], self.clear_vec2, self.activ_layers[i])
        self.activ_layers[i].add(self.activ_layers_temp[i])
        cm.dot(self.activ_layers[i], thetas[i].T, self.z[i + 1])

    self.z[num_thetas].apply_sigmoid(self.activ_layers[num_thetas])
    # print self.activ_layers[num_thetas].asarray()

    return self.activ_layers[num_thetas], self.activ_layers
W1 = np.array(W1, order="F")
#a = np.array(a, order="C")
W2 = hfl_data['W2']
#W2 = np.array(W2, order="F")
W3 = hfl_data['W3']
#W3 = np.array(W3, order="F")

#a = cm.CUDAMatrix(feat_mat)
b = cm.CUDAMatrix(W1.T)
a1 = cm.dot(a.transpose(), b)
#a1 = cm.dot(a, b)
a1.mult(1.7159)
a1.mult(2.0 / 3.0)
a1 = a1.asarray()
a1 = np.tanh(a1)
a1 = a1 * 0.5  # accounting for dropout
a1 = np.c_[np.ones(num_candidates), a1]  # adding bias
a = cm.CUDAMatrix(a1.T)
b = cm.CUDAMatrix(W2)
def _calculate_moments_ns(self, x, ws, quick=False): """Calculate moments based on the weights and samples. We also calculate and save MI, TC, additivity, and the value of the objective. Note it is assumed that <X_i^2> = 1! """ m = {} # Dictionary of moments eps = 10**-8 if self.gpu: y = cm.empty((self.n_samples, self.m)) wc = cm.CUDAMatrix(ws) cm.dot(x, wc.T, target=y) # + noise, but it is included analytically del wc tmp_sum = np.einsum( 'lj,lj->j', y.asarray(), y.asarray()) # TODO: Should be able to do on gpu... else: y = x.dot(ws.T) tmp_sum = np.einsum('lj,lj->j', y, y) m["uj"] = ( 1 - self.eps**2) * tmp_sum / self.n_samples + self.eps**2 * np.sum( ws**2, axis=1) #if quick and np.max(m["uj"]) >= 1.: # return False if self.gpu: tmp = cm.empty((self.nv, self.m)) cm.dot(x.T, y, target=tmp) tmp_dot = tmp.asarray() del tmp del y else: tmp_dot = x.T.dot(y) m["rho"] = ( 1 - self.eps** 2) * tmp_dot.T / self.n_samples + self.eps**2 * ws # m by nv m["ry"] = ws.dot(m["rho"].T) # normalized covariance of Y m["Y_j^2"] = self.yscale**2 / (1. - m["uj"] + eps) np.fill_diagonal(m["ry"], 1) m["invrho"] = 1. / (1. - m["rho"]**2 + eps) m["rhoinvrho"] = m["rho"] * m["invrho"] m["Qij"] = np.dot(m['ry'], m["rhoinvrho"]) m["Qi"] = np.einsum('ki,ki->i', m["rhoinvrho"], m["Qij"]) #m["Qi-Si^2"] = np.einsum('ki,ki->i', m["rhoinvrho"], m["Qij"]) m["Si"] = np.sum(m["rho"] * m["rhoinvrho"], axis=0) # This is the objective, a lower bound for TC m["TC"] = np.sum(np.log(1 + m["Si"])) \ - 0.5 * np.sum(np.log(1 - m["Si"]**2 + m["Qi"]+eps)) \ + 0.5 * np.sum(np.log(1 - m["uj"]+eps)) if not quick: m["MI"] = -0.5 * np.log1p(-m["rho"]**2) m["X_i Y_j"] = m["rho"].T * np.sqrt(m["Y_j^2"]) m["X_i Z_j"] = np.linalg.solve(m["ry"], m["rho"]).T m["X_i^2 | Y"] = ( 1. - np.einsum('ij,ji->i', m["X_i Z_j"], m["rho"])).clip(1e-6) m['I(Y_j ; X)'] = 0.5 * np.log(m["Y_j^2"]) - 0.5 * np.log( self.yscale**2) m['I(X_i ; Y)'] = -0.5 * np.log(m["X_i^2 | Y"]) m["TCs"] = m["MI"].sum(axis=1) - m['I(Y_j ; X)'] m["TC_no_overlap"] = m["MI"].max(axis=0).sum( ) - m['I(Y_j ; X)'].sum( ) # A direct calculation of TC where each variable is in exactly one group. m["TC_direct"] = m['I(X_i ; Y)'].sum() - m[ 'I(Y_j ; X)'] # A direct calculation of TC. Should be upper bound for "TC", "TC_no_overlap" m["additivity"] = (m["MI"].sum(axis=0) - m['I(X_i ; Y)']).sum() return m
def getW(self):
    return cm.dot(self.X_gpu, self.W_gpu).asarray()
def costAndGrad(self, data, labels=None): T = data.shape[1] self.setViews(T) if self.temporalLayer > 0: stack = self.stack[:-1] wt, _ = self.stack[-1] if self.train: grad = self.grad[:-1] dwt, _ = self.grad[-1] else: stack = self.stack if self.train: grad = self.grad # forward prop self.hActs[0].assign(cm.CUDAMatrix(data)) i = 1 for w, b in stack: cm.dot(w, self.hActs[i - 1], self.hActs[i]) self.hActs[i].add_col_vec(b) # forward prop through time if i == self.temporalLayer: for t in xrange(1, T): self.hActs[i].minmax(0.0, self.maxAct, col=t - 1) cm.mvdot_col_slice(wt, self.hActs[i], t - 1, self.hActs[i], t, beta=1.0) self.hActs[i].minmax(0.0, self.maxAct, col=T - 1) if i <= self.numLayers and i != self.temporalLayer: # hard relu self.hActs[i].maximum(0.0) i += 1 # Subtract max activation self.hActs[-1].max(axis=0, target=self.rowVec) self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs) # Softmax cm.exp(self.probs) self.probs.sum(axis=0, target=self.rowVec) cm.pow(self.rowVec, -1.0, target=self.rowVec) self.probs.mult_by_row(self.rowVec) self.probs.copy_to_host() if not self.train: return ctc.decode_best_path( self.probs.numpy_array.astype(np.float64)) cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype( np.float64), labels, blank=0) if skip: return cost, self.grad, skip self.deltasC.assign(cm.CUDAMatrix(deltas)) # back prop i = self.numLayers deltasIn, deltasOut = self.deltasC, self.deltasOut for w, b in reversed(stack): # compute gradient cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0]) deltasIn.sum(axis=1, target=grad[i][1]) # compute next layer deltas if i > 0: cm.dot(w.T, deltasIn, target=deltasOut) # backprop through time if i == self.temporalLayer: self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad) self.deltaTemp.assign(0.0) for t in xrange(T - 1, 0, -1): # Add in temporal delta cm.mvdot_col_slice(wt.T, self.deltaTemp, t, deltasOut, t, beta=1.0) # Push through activation fn deltasOut.mult_slice(t, self.tmpGrad, t) self.deltaTemp.set_single_col(t - 1, deltasOut, t) # Accumulate temporal gradient cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt) cm.mvdot_col_slice(wt.T, self.deltaTemp, 0, deltasOut, 0, beta=1.0) deltasOut.mult_slice(0, self.tmpGrad, 0) if i > 0 and i != self.temporalLayer: self.hActs[i].sign(target=self.tmpGrad) deltasOut.mult(self.tmpGrad) if i == self.numLayers: deltasIn = self.deltasIn deltasIn, deltasOut = deltasOut, deltasIn i -= 1 return cost, self.grad, skip
def run(self, iterations): for i in range(0,iterations): cm.dot(self.XTXneg_gpu, self.W_gpu, target=self.XTXnegW_gpu) cm.dot(self.XTXpos_gpu, self.W_gpu, target=self.XTXposW_gpu) # Update G cm.dot(self.G_gpu, self.W_gpu.T, target=self.GWT_gpu) # G *= np.sqrt((XTXposW + np.dot(GWT, XTXnegW)) # /(XTXnegW+np.dot(GWT, XTXposW))) cm.dot(self.GWT_gpu, self.XTXnegW_gpu, target=self.update1_gpu) cm.dot(self.GWT_gpu, self.XTXposW_gpu, target=self.update2_gpu) self.update1_gpu.add(self.XTXposW_gpu) self.update2_gpu.add(self.XTXnegW_gpu) self.update2_gpu.add_scalar(10**-9) self.update1_gpu.divide(self.update2_gpu) cm.sqrt(self.update1_gpu) self.G_gpu.mult(self.update1_gpu) # Update W cm.dot(self.G_gpu.T, self.G_gpu, target=self.GTG_gpu) #W *= np.sqrt((np.dot(XTXpos, G) + np.dot(XTXnegW, GTG)) # / (np.dot(XTXneg, G) # + np.dot(XTXposW, GTG))) cm.dot(self.XTXpos_gpu, self.G_gpu, target=self.XTXposG_gpu) cm.dot(self.XTXneg_gpu, self.G_gpu, target=self.XTXnegG_gpu) cm.dot(self.XTXnegW_gpu, self.GTG_gpu, target=self.update1_gpu) cm.dot(self.XTXposW_gpu, self.GTG_gpu, target=self.update2_gpu) self.update1_gpu.add(self.XTXposG_gpu) self.update2_gpu.add(self.XTXnegG_gpu) self.update2_gpu.add_scalar(10**-9) self.update1_gpu.divide(self.update2_gpu) cm.sqrt(self.update1_gpu) self.W_gpu.mult(self.update1_gpu) # test for convergence if (i % self.niter_test_conv == 0) and self.checkConvergence(): print "NMF converged after %i iterations" % i break
def train(self): ''' Main train function : modified version of the original train function. Additions : GPU selection (useful for multi-GPU machines) Saving the sum of the square of the data for post-processing Visible data are saved Data samples are permuted for training Weights are saved every 100 training epochs Training energy is visualized every 100 training epochs NOTE : anneal learning rate used in the initial code, is NOT used here! ''' #plt.ion() f1 = plt.figure() ax1 = f1.add_subplot(111) #ax2 = f1.add_subplot(122) #plt.show() cmt.cuda_set_device(self.gpuId) cmt.cublas_init() cmt.CUDAMatrix.init_random(1) np.random.seed(self.npRandSeed) prng = RandomState(self.npRandState) ################################################################ ##################### CHANGE PATH ############################## # Move to current experiment path: os.chdir(self.saveDir) # Get current path: os.getcwd() self.plotsDir = 'plots' #self.probabilitiesDir = 'p_all' if not os.path.isdir(self.plotsDir): os.makedirs(self.plotsDir) if not os.path.isdir(self.plotsDir + '/energy'): os.makedirs(self.plotsDir + '/energy') #if not os.path.isdir(self.probabilitiesDir): # os.makedirs(self.probabilitiesDir) if not os.path.isdir('weights'): os.makedirs('weights') d = self.d.astype(np.float32) print("visible size: ", d.shape) dsq = np.square(d) lsq = np.sum(dsq, axis=0) with open('lsqComplete.pkl', 'wb') as pklFile: cPickle.dump(lsq, pklFile) del dsq, lsq # Save visible data : visData = d np.savez('visData.npz', data=d, obsKeys=self.obsKeys, epochTime=self.epochTime) with open('visData.txt', 'w') as f: f.write("\n Dataset : %s" % (self.dataFilename)) f.write("\n visData size: %s " % str(visData.shape)) f.write("\n visData type: %s " % str(visData.dtype)) f.write("\n \n visData Range: %s " % str(np.max(visData, axis=0) - np.min(visData, axis=0))) f.write("\n \n visData min: %s " % str(np.min(visData, axis=0))) f.write("\n \n visData max: %s " % str(np.max(visData, axis=0))) f.write("\n \n visData mean: %s " % str(np.mean(visData, axis=0))) f.write("\n \n visData std: %s " % str(np.std(visData, axis=0))) f.close() del visData #if not needed for computing the latent states permIdx = prng.permutation(d.shape[0]) d = d[permIdx, :] #subsetting train and test datasets #trainPerc = 0.7 #trainSampNum = int(np.ceil(trainPerc*d.shape[0])) #trainSampNum = int(np.floor(trainSampNum/self.batch_size)*self.batch_size) #testSampNum = int(d.shape[0]-trainSampNum-1) # The test dataset is not used at the moment, it can be used as # a validation set to check for overfitting. 
To use it, uncomment # all the variables with 'test' in their name #~ d_test = d[trainSampNum+1:,:] #d = d[:trainSampNum,:] #obsKeys = self.obsKeys[:trainSampNum] totnumcases = d.shape[0] num_vis = d.shape[1] num_batches = int(totnumcases / self.batch_size) print("num_batches: ", num_batches) dev_dat = cmt.CUDAMatrix(d.T) # VxP #~ test_dat = cmt.CUDAMatrix(d_test.T) del d, self.d, self.epochTime, self.obsKeys # training parameters (as in the original code by Ranzato) epsilon = self.epsilon epsilonVF = 2 * epsilon epsilonFH = 0.02 * epsilon epsilonb = 0.02 * epsilon epsilonw_mean = 0.2 * epsilon epsilonb_mean = 0.1 * epsilon weightcost_final = self.weightcost_final # HMC setting hmc_step_nr = self.hmc_step_nr hmc_step = 0.01 hmc_target_ave_rej = self.hmc_target_ave_rej hmc_ave_rej = hmc_target_ave_rej # initialize weights VF = cmt.CUDAMatrix( np.array(0.02 * prng.randn(num_vis, self.num_fac), dtype=np.float32, order='F')) # VxH if self.apply_mask == 0: FH = cmt.CUDAMatrix( np.array(np.eye(self.num_fac, self.num_hid_cov), dtype=np.float32, order='F')) # HxO else: dd = loadmat( 'your_FHinit_mask_file.mat' ) # see CVPR2010paper_material/topo2D_3x3_stride2_576filt.mat for an example FH = cmt.CUDAMatrix(np.array(dd["FH"], dtype=np.float32, order='F')) bias_cov = cmt.CUDAMatrix( np.array(2.0 * np.ones((self.num_hid_cov, 1)), dtype=np.float32, order='F')) bias_vis = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F')) w_mean = cmt.CUDAMatrix( np.array(0.05 * prng.randn(num_vis, self.num_hid_mean), dtype=np.float32, order='F')) # VxH bias_mean = cmt.CUDAMatrix( np.array(-2.0 * np.ones((self.num_hid_mean, 1)), dtype=np.float32, order='F')) # initialize variables to store derivatives VFinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, self.num_fac)), dtype=np.float32, order='F')) FHinc = cmt.CUDAMatrix( np.array(np.zeros((self.num_fac, self.num_hid_cov)), dtype=np.float32, order='F')) bias_covinc = cmt.CUDAMatrix( np.array(np.zeros((self.num_hid_cov, 1)), dtype=np.float32, order='F')) bias_visinc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, 1)), dtype=np.float32, order='F')) w_meaninc = cmt.CUDAMatrix( np.array(np.zeros((num_vis, self.num_hid_mean)), dtype=np.float32, order='F')) bias_meaninc = cmt.CUDAMatrix( np.array(np.zeros((self.num_hid_mean, 1)), dtype=np.float32, order='F')) # initialize temporary storage data = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP normdata = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP negdataini = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP feat = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, self.batch_size)), dtype=np.float32, order='F')) featsq = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, self.batch_size)), dtype=np.float32, order='F')) negdata = cmt.CUDAMatrix( np.array(prng.randn(num_vis, self.batch_size), dtype=np.float32, order='F')) old_energy = cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) new_energy = cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) energy = cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) gradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP normgradient = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) # VxP thresh = 
cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) feat_mean = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_mean, self.batch_size)), dtype=np.float32, order='F')) vel = cmt.CUDAMatrix( np.array(prng.randn(num_vis, self.batch_size), dtype=np.float32, order='F')) length = cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) # 1xP lengthsq = cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) # 1xP normcoeff = cmt.CUDAMatrix( np.array(np.zeros((1, self.batch_size)), dtype=np.float32, order='F')) # 1xP # commented to avoid computing the energy on test data #~ data_test = cmt.CUDAMatrix( np.array(np.empty((num_vis, testSampNum)), dtype=np.float32, order='F')) # Vxtest_batch #~ normdata_test = cmt.CUDAMatrix( np.array(np.empty((num_vis, testSampNum)), dtype=np.float32, order='F')) # Vxtest_batch #~ length_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) # 1xtest_batch #~ lengthsq_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) # 1xtest_batch #~ normcoeff_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) # 1xtest_batch #~ vel_test = cmt.CUDAMatrix( np.array(prng.randn(num_vis, testSampNum), dtype=np.float32, order='F')) #~ feat_test = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, testSampNum)), dtype=np.float32, order='F')) #~ featsq_test = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, testSampNum)), dtype=np.float32, order='F')) #~ feat_mean_test = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_mean, testSampNum)), dtype=np.float32, order='F')) #~ energy_test = cmt.CUDAMatrix( np.array(np.zeros((1, testSampNum)), dtype=np.float32, order='F')) if self.apply_mask == 1: # this used to constrain very large FH matrices only allowing to change values in a neighborhood dd = loadmat('your_FHinit_mask_file.mat') mask = cmt.CUDAMatrix( np.array(dd["mask"], dtype=np.float32, order='F')) normVF = 1 small = 0.5 # other temporary vars t1 = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_cov, self.batch_size)), dtype=np.float32, order='F')) t2 = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_cov, self.batch_size)), dtype=np.float32, order='F')) t3 = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, self.batch_size)), dtype=np.float32, order='F')) t4 = cmt.CUDAMatrix( np.array(np.empty((1, self.batch_size)), dtype=np.float32, order='F')) t5 = cmt.CUDAMatrix( np.array(np.empty((1, 1)), dtype=np.float32, order='F')) t6 = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) t7 = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.batch_size)), dtype=np.float32, order='F')) t8 = cmt.CUDAMatrix( np.array(np.empty((num_vis, self.num_fac)), dtype=np.float32, order='F')) t9 = cmt.CUDAMatrix( np.array(np.zeros((self.num_fac, self.num_hid_cov)), dtype=np.float32, order='F')) t10 = cmt.CUDAMatrix( np.array(np.empty((1, self.num_fac)), dtype=np.float32, order='F')) t11 = cmt.CUDAMatrix( np.array(np.empty((1, self.num_hid_cov)), dtype=np.float32, order='F')) # commented to avoid computing the energy on test data #~ t1_test = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_cov, testSampNum)), dtype=np.float32, order='F')) #~ t2_test = cmt.CUDAMatrix( np.array(np.empty((self.num_hid_cov, testSampNum)), dtype=np.float32, order='F')) #~ t3_test = cmt.CUDAMatrix( np.array(np.empty((self.num_fac, testSampNum)), dtype=np.float32, order='F')) #~ t4_test = 
#~ t5_test = cmt.CUDAMatrix( np.array(np.empty((1, 1)), dtype=np.float32, order='F'))
#~ t6_test = cmt.CUDAMatrix( np.array(np.empty((num_vis, testSampNum)), dtype=np.float32, order='F'))

meanEnergy = np.zeros(self.num_epochs)
minEnergy = np.zeros(self.num_epochs)
maxEnergy = np.zeros(self.num_epochs)
#~ meanEnergy_test = np.zeros(self.num_epochs)
#~ minEnergy_test = np.zeros(self.num_epochs)
#~ maxEnergy_test = np.zeros(self.num_epochs)

# start training
for epoch in range(self.num_epochs):
    print "Epoch " + str(epoch)

    # anneal learning rates as found in the original code -
    # uncomment if you wish to use annealing!
    #~ epsilonVFc = epsilonVF/max(1,epoch/20)
    #~ epsilonFHc = epsilonFH/max(1,epoch/20)
    #~ epsilonbc = epsilonb/max(1,epoch/20)
    #~ epsilonw_meanc = epsilonw_mean/max(1,epoch/20)
    #~ epsilonb_meanc = epsilonb_mean/max(1,epoch/20)
    # no annealing is used in our experiments because learning
    # was stopping too early
    epsilonVFc = epsilonVF
    epsilonFHc = epsilonFH
    epsilonbc = epsilonb
    epsilonw_meanc = epsilonw_mean
    epsilonb_meanc = epsilonb_mean

    weightcost = weightcost_final
    if epoch <= self.startFH:
        epsilonFHc = 0
    if epoch <= self.startwd:
        weightcost = 0

    # commented to avoid computing the energy on test data
    #~ data_test = test_dat
    #~ data_test.mult(data_test, target = t6_test) # DxP
    #~ t6_test.sum(axis = 0, target = lengthsq_test) # 1xP
    #~ lengthsq_test.mult(1./num_vis) # normalize by number of components (like std)
    #~ lengthsq_test.add(small) # small avoids division by 0
    #~ cmt.sqrt(lengthsq_test, target = length_test)
    #~ length_test.reciprocal(target = normcoeff_test) # 1xP
    #~ data_test.mult_by_row(normcoeff_test, target = normdata_test) # normalized data

    for batch in range(num_batches):
        # get current minibatch
        data = dev_dat.slice(batch * self.batch_size, (batch + 1) * self.batch_size)  # DxP (nr dims x nr samples)

        # normalize input data
        data.mult(data, target=t6)  # DxP
        t6.sum(axis=0, target=lengthsq)  # 1xP
        lengthsq.mult(1. / num_vis)  # normalize by number of components (like std)
        lengthsq.add(small)  # small avoids division by 0
        cmt.sqrt(lengthsq, target=length)
        length.reciprocal(target=normcoeff)  # 1xP
        data.mult_by_row(normcoeff, target=normdata)  # normalized data

        ## compute positive sample derivatives
        # covariance part
        cmt.dot(VF.T, normdata, target=feat)  # HxP (nr facs x nr samples)
        feat.mult(feat, target=featsq)  # HxP
        cmt.dot(FH.T, featsq, target=t1)  # OxP (nr cov hiddens x nr samples)
        t1.mult(-0.5)
        t1.add_col_vec(bias_cov)  # OxP
        t1.apply_sigmoid(target=t2)  # OxP
        cmt.dot(featsq, t2.T, target=FHinc)  # HxO
        cmt.dot(FH, t2, target=t3)  # HxP
        t3.mult(feat)
        cmt.dot(normdata, t3.T, target=VFinc)  # VxH
        t2.sum(axis=1, target=bias_covinc)
        bias_covinc.mult(-1)
        # visible bias
        data.sum(axis=1, target=bias_visinc)
        bias_visinc.mult(-1)
        # mean part
        cmt.dot(w_mean.T, data, target=feat_mean)  # HxP (nr mean hiddens x nr samples)
        feat_mean.add_col_vec(bias_mean)  # HxP
        feat_mean.apply_sigmoid()  # HxP
        feat_mean.mult(-1)
        cmt.dot(data, feat_mean.T, target=w_meaninc)
        feat_mean.sum(axis=1, target=bias_meaninc)

        # HMC sampling: draw an approximate sample from the model
        if self.doPCD == 0:  # CD-1 (set negative data to current training samples)
            hmc_step, hmc_ave_rej = self.draw_HMC_samples(
                data, negdata, normdata, vel, gradient, normgradient,
                new_energy, old_energy, VF, FH, bias_cov, bias_vis, w_mean,
                bias_mean, hmc_step, hmc_step_nr, hmc_ave_rej,
                hmc_target_ave_rej, t1, t2, t3, t4, t5, t6, t7, thresh, feat,
                featsq, self.batch_size, feat_mean, length, lengthsq,
                normcoeff, small, num_vis)
        else:  # PCD-1 (use previous negative data as starting point for chain)
            negdataini.assign(negdata)
            hmc_step, hmc_ave_rej = self.draw_HMC_samples(
                negdataini, negdata, normdata, vel, gradient, normgradient,
                new_energy, old_energy, VF, FH, bias_cov, bias_vis, w_mean,
                bias_mean, hmc_step, hmc_step_nr, hmc_ave_rej,
                hmc_target_ave_rej, t1, t2, t3, t4, t5, t6, t7, thresh, feat,
                featsq, self.batch_size, feat_mean, length, lengthsq,
                normcoeff, small, num_vis)

        # compute derivatives at the negative samples
        # normalize input data
        negdata.mult(negdata, target=t6)  # DxP
        t6.sum(axis=0, target=lengthsq)  # 1xP
        lengthsq.mult(1. / num_vis)  # normalize by number of components (like std)
        lengthsq.add(small)
        cmt.sqrt(lengthsq, target=length)
        length.reciprocal(target=normcoeff)  # 1xP
        negdata.mult_by_row(normcoeff, target=normdata)  # normalized data

        # covariance part
        cmt.dot(VF.T, normdata, target=feat)  # HxP
        feat.mult(feat, target=featsq)  # HxP
        cmt.dot(FH.T, featsq, target=t1)  # OxP
        t1.mult(-0.5)
        t1.add_col_vec(bias_cov)  # OxP
        t1.apply_sigmoid(target=t2)  # OxP
        FHinc.subtract_dot(featsq, t2.T)  # HxO
        FHinc.mult(0.5)
        cmt.dot(FH, t2, target=t3)  # HxP
        t3.mult(feat)
        VFinc.subtract_dot(normdata, t3.T)  # VxH
        bias_covinc.add_sums(t2, axis=1)
        # visible bias
        bias_visinc.add_sums(negdata, axis=1)
        # mean part
        cmt.dot(w_mean.T, negdata, target=feat_mean)  # HxP
        feat_mean.add_col_vec(bias_mean)  # HxP
        feat_mean.apply_sigmoid()  # HxP
        w_meaninc.add_dot(negdata, feat_mean.T)
        bias_meaninc.add_sums(feat_mean, axis=1)

        # update parameters
        VFinc.add_mult(VF.sign(), weightcost)  # L1 regularization
        VF.add_mult(VFinc, -epsilonVFc / self.batch_size)
        # normalize columns of VF: normalize by running average of their norm
        VF.mult(VF, target=t8)
        t8.sum(axis=0, target=t10)
        cmt.sqrt(t10)
        t10.sum(axis=1, target=t5)
        t5.copy_to_host()
        normVF = .95 * normVF + (.05 / self.num_fac) * t5.numpy_array[0, 0]  # estimate norm
        t10.reciprocal()
        VF.mult_by_row(t10)
        VF.mult(normVF)
        bias_cov.add_mult(bias_covinc, -epsilonbc / self.batch_size)
        bias_vis.add_mult(bias_visinc, -epsilonbc / self.batch_size)
        if epoch > self.startFH:
            FHinc.add_mult(FH.sign(), weightcost)  # L1 regularization
            FH.add_mult(FHinc, -epsilonFHc / self.batch_size)  # update
            # set to 0 negative entries in FH
            FH.greater_than(0, target=t9)
            FH.mult(t9)
            if self.apply_mask == 1:
                FH.mult(mask)
            # normalize columns of FH: L1 norm set to 1 in each column
            FH.sum(axis=0, target=t11)
            t11.reciprocal()
            FH.mult_by_row(t11)
        w_meaninc.add_mult(w_mean.sign(), weightcost)
        w_mean.add_mult(w_meaninc, -epsilonw_meanc / self.batch_size)
        bias_mean.add_mult(bias_meaninc, -epsilonb_meanc / self.batch_size)

    if self.verbose == 1:
        print ("VF: " + '%3.2e' % VF.euclid_norm()
               + ", DVF: " + '%3.2e' % (VFinc.euclid_norm() * (epsilonVFc / self.batch_size))
               + ", FH: " + '%3.2e' % FH.euclid_norm()
               + ", DFH: " + '%3.2e' % (FHinc.euclid_norm() * (epsilonFHc / self.batch_size))
               + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm()
               + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm() * (epsilonbc / self.batch_size))
               + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm()
               + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm() * (epsilonbc / self.batch_size))
               + ", wm: " + '%3.2e' % w_mean.euclid_norm()
               + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm() * (epsilonw_meanc / self.batch_size))
               + ", bm: " + '%3.2e' % bias_mean.euclid_norm()
               + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm() * (epsilonb_meanc / self.batch_size))
               + ", step: " + '%3.2e' % hmc_step
               + ", rej: " + '%3.2e' % hmc_ave_rej)
        with open('terminal.txt', 'a') as f:
            f.write('\n' + "epoch: %s" % str(epoch)
                    + ", VF: " + '%3.2e' % VF.euclid_norm()
                    + ", DVF: " + '%3.2e' % (VFinc.euclid_norm() * (epsilonVFc / self.batch_size))
                    + ", FH: " + '%3.2e' % FH.euclid_norm()
                    + ", DFH: " + '%3.2e' % (FHinc.euclid_norm() * (epsilonFHc / self.batch_size))
                    + ", bias_cov: " + '%3.2e' % bias_cov.euclid_norm()
                    + ", Dbias_cov: " + '%3.2e' % (bias_covinc.euclid_norm() * (epsilonbc / self.batch_size))
                    + ", bias_vis: " + '%3.2e' % bias_vis.euclid_norm()
                    + ", Dbias_vis: " + '%3.2e' % (bias_visinc.euclid_norm() * (epsilonbc / self.batch_size))
                    + ", wm: " + '%3.2e' % w_mean.euclid_norm()
                    + ", Dwm: " + '%3.2e' % (w_meaninc.euclid_norm() * (epsilonw_meanc / self.batch_size))
                    + ", bm: " + '%3.2e' % bias_mean.euclid_norm()
                    + ", Dbm: " + '%3.2e' % (bias_meaninc.euclid_norm() * (epsilonb_meanc / self.batch_size))
                    + ", step: " + '%3.2e' % hmc_step
                    + ", rej: " + '%3.2e' % hmc_ave_rej)
        sys.stdout.flush()

    # compute the energy on the training data (comment these lines out to skip it)
    self.compute_energy_mcRBM_visual(data, normdata, energy, VF, FH, bias_cov,
                                     bias_vis, w_mean, bias_mean, t1, t2, t6,
                                     feat, featsq, feat_mean, length, lengthsq,
                                     normcoeff, small, num_vis)
    energy.copy_to_host()
    meanEnergy[epoch] = np.mean(energy.numpy_array)
    minEnergy[epoch] = np.min(energy.numpy_array)
    maxEnergy[epoch] = np.max(energy.numpy_array)

    # commented to avoid computing the energy on test data
    #~ self.compute_energy_mcRBM_visual(data_test,normdata_test,energy_test,VF,FH,bias_cov,bias_vis,w_mean,bias_mean,t1_test,t2_test,t6_test,feat_test,featsq_test,feat_mean_test,length_test,lengthsq_test,normcoeff_test,small,num_vis)
    #~ energy_test.copy_to_host()
    #~ meanEnergy_test[epoch] = np.mean(energy_test.numpy_array)
    #~ minEnergy_test[epoch] = np.min(energy_test.numpy_array)
    #~ maxEnergy_test[epoch] = np.max(energy_test.numpy_array)

    ax1.cla()
    ax1.plot(range(epoch), meanEnergy[0:epoch])
    ax1.plot(range(epoch), maxEnergy[0:epoch])
    ax1.plot(range(epoch), minEnergy[0:epoch])
    if np.mod(epoch, 100) == 0:
        #f1.savefig(output_folder + str(epoch)+'_'+'fig.png')
        f1.savefig(self.plotsDir + '/energy/energyAt_%s.png' % str(epoch))

    # back-up every once in a while
    if np.mod(epoch, 100) == 0:
        VF.copy_to_host()
        FH.copy_to_host()
        bias_cov.copy_to_host()
        w_mean.copy_to_host()
        bias_mean.copy_to_host()
        bias_vis.copy_to_host()
        savemat("./weights/ws_temp%s" % str(epoch),
                {'VF': VF.numpy_array, 'FH': FH.numpy_array,
                 'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,
                 'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array,
                 'epoch': epoch})

    # uncomment if computing the energy in order to store its evolution throughout training
    #~ savemat(self.refDir + '/' + "training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean), {'meanEnergy':meanEnergy,'meanEnergy_test':meanEnergy_test,'maxEnergy': maxEnergy, 'maxEnergy_test': maxEnergy_test, 'minEnergy': minEnergy, 'minEnergy_test': minEnergy_test, 'epoch':epoch})
    #savemat("training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean), {'meanEnergy':meanEnergy, 'maxEnergy': maxEnergy, 'minEnergy': minEnergy, 'epoch':epoch})

    # in order to stop the training gracefully, create an empty file
    # named 'stop_now' in the folder containing the experiment
    # configuration file
    if os.path.isfile('stop_now'):
        break

# final back-up
VF.copy_to_host()
FH.copy_to_host()
bias_cov.copy_to_host()
bias_vis.copy_to_host()
w_mean.copy_to_host()
bias_mean.copy_to_host()
savemat("ws_fac%s" % str(self.num_fac) + "_cov%s" % str(self.num_hid_cov) + "_mean%s" % str(self.num_hid_mean),
        {'VF': VF.numpy_array, 'FH': FH.numpy_array,
         'bias_cov': bias_cov.numpy_array, 'bias_vis': bias_vis.numpy_array,
         'w_mean': w_mean.numpy_array, 'bias_mean': bias_mean.numpy_array,
         'epoch': epoch})

# uncomment if computing the energy in order to store its evolution throughout training
#~ savemat(self.refDir + '/' + "training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean), {'meanEnergy':meanEnergy,'meanEnergy_test':meanEnergy_test,'maxEnergy': maxEnergy, 'maxEnergy_test': maxEnergy_test, 'minEnergy': minEnergy, 'minEnergy_test': minEnergy_test, 'epoch':epoch})
savemat("training_energy_" + str(self.num_fac) + "_cov" + str(self.num_hid_cov) + "_mean" + str(self.num_hid_mean),
        {'meanEnergy': meanEnergy, 'maxEnergy': maxEnergy,
         'minEnergy': minEnergy, 'epoch': epoch})

# Compute states if desired:
# normalise data for covariance hidden:
#dsq = np.square(visData)
#lsq = np.sum(dsq, axis=0)
#lsq /= visData.shape[1]
#lsq += np.spacing(1)
#l = np.sqrt(lsq)
#normD = visData/l
#logisticArg_c = (-0.5*np.dot(FH.numpy_array.T, np.square(np.dot(VF.numpy_array.T, normD.T))) + bias_cov.numpy_array).T
#p_hc = logisticFunc(logisticArg_c)
#logisticArg_m = np.dot(visData, w_mean.numpy_array) + bias_mean.numpy_array.T
#p_hm = logisticFunc(logisticArg_m)
#p_all = np.concatenate((p_hc, p_hm), axis=1)
#savemat(self.probabilitiesDir + '/pAll_%i.mat' % epoch, mdict={'p_all':p_all})

with open('done', 'w') as doneFile:
    doneFile.write(datetime.strftime(datetime.now(), '%d/%m/%Y %H:%M:%S'))
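The per-sample normalization used throughout the training loop divides each data column by the square root of its mean squared component plus `small`; the same sequence of calls appears in the positive phase, the negative phase, and the HMC sampler. Below is a minimal NumPy sketch of what those cudamat calls compute; the helper name is hypothetical and is shown only for reference.

import numpy as np

def normalize_columns(data, small=0.5):
    """data: num_vis x num_samples array (the VxP layout used above)."""
    lengthsq = np.square(data).sum(axis=0) / data.shape[0] + small  # 1xP, as lengthsq + small
    normcoeff = 1.0 / np.sqrt(lengthsq)                             # 1xP, as normcoeff
    return data * normcoeff[None, :]                                # as mult_by_row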
def run(self, iterations):
    for i in range(0, iterations):
        # update W
        cm.dot(self.W_gpu, self.H_gpu, target=self.WH_gpu)
        cm.dot(self.X_gpu, self.H_gpu.T, target=self.XHT_gpu)
        cm.dot(self.WH_gpu, self.H_gpu.T, target=self.WHHT_gpu)
        self.WHHT_gpu.add(self.sparseW)
        self.W_gpu.mult(self.XHT_gpu).divide(self.WHHT_gpu)

        # normalize W
        # slower correct version: W_gpu.sum(0, target=rowsum_gpu)
        cm.dot(self.nones_gpu, self.W_gpu, target=self.Wrowsum_gpu)
        self.W_gpu.div_by_row(self.Wrowsum_gpu)

        # update H
        cm.dot(self.W_gpu.T, self.X_gpu, target=self.WTX_gpu)
        cm.dot(self.W_gpu.T, self.WH_gpu, target=self.WTWH_gpu)
        self.WTWH_gpu.add(self.sparseH)
        self.H_gpu.mult(self.WTX_gpu).divide(self.WTWH_gpu)

        # test for convergence
        if (i % self.niter_test_conv == 0) and self.checkConvergence():
            print "NMF converged after %i iterations" % i
            break
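For reference, here is a plain NumPy sketch of one iteration of the same multiplicative NMF updates. The function name and scalar arguments are illustrative (sparse_w and sparse_h stand in for self.sparseW and self.sparseH), and the small eps term is added only to avoid division by zero; it is not in the GPU code above.

import numpy as np

def nmf_step(X, W, H, sparse_w=0.0, sparse_h=0.0, eps=1e-9):
    WH = W.dot(H)
    # update W with the multiplicative rule, plus the additive sparsity term
    W = W * (X.dot(H.T) / (WH.dot(H.T) + sparse_w + eps))
    # normalize W so each column sums to 1 (the ones-vector dot product above does this)
    W = W / (W.sum(axis=0, keepdims=True) + eps)
    # update H, reusing WH computed before the W update, as in the GPU loop
    H = H * (W.T.dot(X) / (W.T.dot(WH) + sparse_h + eps))
    return W, H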
out = cm.empty((dim_out, batch_size))
delta = cm.empty((num_hid, batch_size))

# Train neural network.
start_time = time.time()
for epoch in range(num_epochs):
    print("Epoch %i" % (epoch + 1))
    err = []

    for batch in range(num_batches):
        # get current minibatch
        inp = dev_train.slice(batch * batch_size, (batch + 1) * batch_size)
        target = dev_lbl.slice(batch * batch_size, (batch + 1) * batch_size)

        # forward pass
        cm.dot(w_w1.T, inp, target=h)
        h.add_col_vec(w_b1)
        h.apply_sigmoid()
        cm.dot(w_w2.T, h, target=out)
        out.add_col_vec(w_b2)
        out.apply_sigmoid()

        # back prop errors
        out.subtract(target)  # compute error

        # gradients for w_w2 and w_b2
        wu_w2.add_dot(h, out.T, beta=momentum)
        wu_b2.add_sums(out, axis=1, beta=momentum)
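The snippet above stops while accumulating the layer-2 gradients. As a reference only, here is a self-contained NumPy sketch of one full training step for a two-layer sigmoid network of this shape; all names are hypothetical, and the momentum buffers and epsilon/batch_size scaling are stated assumptions about how such a step is usually completed, not the original script's continuation.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def train_step(inp, target, w1, b1, w2, b2, wu, epsilon=0.1, momentum=0.9):
    """inp: num_vis x P, target: dim_out x P; wu is a dict of momentum buffers."""
    # forward pass
    h = sigmoid(w1.T.dot(inp) + b1)      # num_hid x P
    out = sigmoid(w2.T.dot(h) + b2)      # dim_out x P
    # backprop: output error, then hidden delta through the sigmoid derivative
    d_out = out - target
    d_hid = w2.dot(d_out) * h * (1.0 - h)
    # momentum-averaged gradients (analogous to add_dot(..., beta=momentum) above)
    wu['w2'] = momentum * wu['w2'] + h.dot(d_out.T)
    wu['b2'] = momentum * wu['b2'] + d_out.sum(axis=1, keepdims=True)
    wu['w1'] = momentum * wu['w1'] + inp.dot(d_hid.T)
    wu['b1'] = momentum * wu['b1'] + d_hid.sum(axis=1, keepdims=True)
    # parameter updates, scaled by the minibatch size
    P = inp.shape[1]
    w2 -= (epsilon / P) * wu['w2']
    b2 -= (epsilon / P) * wu['b2']
    w1 -= (epsilon / P) * wu['w1']
    b1 -= (epsilon / P) * wu['b1']
    return out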