def ApplyActivation(self):
    state = self.state
    if self.activation == deepnet_pb2.Hyperparams.LOGISTIC:
        cm.sigmoid(state)
    elif self.activation == deepnet_pb2.Hyperparams.TANH:
        cm.tanh(state)
    elif self.activation == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR:
        state.greater_than(0, target=self.temp)
        state.mult(self.temp)
    elif self.activation == deepnet_pb2.Hyperparams.RECTIFIED_LINEAR_SMOOTH:
        cm.log_1_plus_exp(state)
    elif self.activation == deepnet_pb2.Hyperparams.LINEAR:
        pass
    elif self.activation == deepnet_pb2.Hyperparams.SOFTMAX:
        # Numerically stable column-wise softmax: subtract the per-column max
        # before exponentiating, then normalize by the per-column sum.
        state.max(axis=0, target=self.temp)
        state.add_row_mult(self.temp, -1)
        cm.exp(state)
        state.sum(axis=0, target=self.temp)
        self.temp.reciprocal()
        state.mult_by_row(self.temp)
    elif self.activation == deepnet_pb2.Hyperparams.REPLICATED_SOFTMAX:
        state.max(axis=0, target=self.temp)
        state.add_row_mult(self.temp, -1)
        cm.exp(state)
        state.sum(axis=0, target=self.temp)
        self.NN.divide(self.temp, target=self.temp)
        state.mult_by_row(self.temp)
    else:
        raise Exception('Unknown activation')
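# A minimal NumPy sketch (illustrative, not part of the original code) of the
# max-subtraction softmax pattern used in the SOFTMAX branch above; handy as a
# CPU reference when testing the GPU path.
import numpy as np

def softmax_columns_np(state):
    """Column-wise softmax, stabilized by subtracting the per-column max."""
    shifted = state - state.max(axis=0, keepdims=True)
    e = np.exp(shifted)
    return e / e.sum(axis=0, keepdims=True)

# Usage: each column of the result sums to 1.
_x = np.random.randn(5, 3).astype(np.float32)
assert np.allclose(softmax_columns_np(_x).sum(axis=0), 1.0, atol=1e-5)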
def maskedSingleSoftmax(netInputs, tempMatrix, sMask, notSMask, onesCol, tempRow):
    """
    We now assume we have a single k-way softmax and some number of Gaussian
    units. So we only want to apply the softmax activation function to the
    first k rows of netInputs.
    """
    assert onesCol.shape[0] == netInputs.shape[0]
    assert tempRow.shape[1] == netInputs.shape[1]
    assert tempRow.shape[0] == onesCol.shape[1]
    assert netInputs.shape == tempMatrix.shape == sMask.shape == notSMask.shape
    # c is a very large negative constant: masked-out rows get net input c, so
    # they contribute essentially zero probability mass after exponentiation.
    c = num.finfo(num.float32).min / 16
    assert num.exp(c + 200) == 0.0
    netInputs.mult(sMask, target=tempMatrix)
    tempMatrix.add_mult(notSMask, c)
    tempMatrix.max(axis=0, target=tempRow)
    # onesCol must contain all ones: this subtracts the per-column max from
    # every row via the outer product onesCol * tempRow.
    tempMatrix.subtract_dot(onesCol, tempRow)
    cm.exp(tempMatrix)
    tempMatrix.sum(axis=0, target=tempRow)
    tempRow.reciprocal()
    tempMatrix.mult_by_row(tempRow)
    netInputs.mult(notSMask)
    tempMatrix.mult(sMask)
    netInputs.add(tempMatrix)
def ExactZ_binary_binary(model):
    assert len(model.layer) == 2, 'Only implemented for RBMs.'
    input_layer = model.layer[0]
    hidden_layer = model.layer[1]
    edge = model.edge[0]
    w = edge.params['weight']
    a = hidden_layer.params['bias']
    b = input_layer.params['bias']
    numvis, numhid = w.shape
    batchsize = 2**numvis
    input_layer.AllocateBatchsizeDependentMemory(batchsize)
    hidden_layer.AllocateBatchsizeDependentMemory(batchsize)
    all_inputs = GetAll(numvis)
    w_ais = cm.CUDAMatrix(np.zeros((1, batchsize)))
    input_layer.sample.overwrite(all_inputs)
    # Unnormalized log probability of every visible configuration v:
    # sum_j log(1 + exp(w^T v + a)_j) + b^T v.
    cm.dot(w.T, input_layer.sample, target=hidden_layer.state)
    hidden_layer.state.add_col_vec(a)
    cm.log_1_plus_exp(hidden_layer.state)
    w_ais.add_sums(hidden_layer.state, axis=0)
    w_ais.add_dot(b.T, input_layer.sample)  # was input_layer.state, which is never written
    # log-sum-exp over all 2**numvis configurations.
    offset = float(w_ais.asarray().max())
    w_ais.subtract(offset)
    cm.exp(w_ais)
    z = offset + np.log(w_ais.asarray().sum())
    return z
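# ExactZ_binary_binary relies on a helper GetAll that is not shown here. A
# plausible sketch, assuming it returns a numvis x 2**numvis float32 array
# whose columns enumerate every binary visible configuration:
import numpy as np

def GetAll_sketch(numvis):
    batchsize = 2 ** numvis
    all_inputs = np.zeros((numvis, batchsize), dtype=np.float32)
    for i in range(batchsize):
        for bit in range(numvis):
            all_inputs[bit, i] = (i >> bit) & 1  # bit `bit` of column index i
    return all_inputs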
def log(self, x):
    # Note: despite its name, this computes the logistic function
    # 1 / (1 + exp(-x)) into self.tmp1, not a logarithm.
    self.tmp1.assign(x)
    self.tmp1.mult(-1)
    cm.exp(self.tmp1, target=self.tmp1)
    self.tmp1.add(1)
    cm.pow(self.tmp1, -1)
def compute_output(self, gpu_data):
    """
    Computes p(y|x). Puts the result in self.gpu_p_y_given_x.
    """
    cm.dot(self.W, gpu_data, self.gpu_act_from_x)
    self.gpu_act_from_x.add_col_vec(self.c)
    for c in range(self.n_classes):
        cm.dot(self.U, self.gpu_target_vectors.slice(c, c + 1), self.gpu_act_from_y)
        # to avoid memory creation, using gpu_h
        # and gpu_h_sample for these computations
        self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y, target=self.gpu_h)
        cm.exp(self.gpu_h, self.gpu_h_sample)
        self.gpu_h_sample.add(1.)
        cm.log(self.gpu_h_sample, self.gpu_h)
        self.gpu_h.sum(axis=0, target=self.gpu_negative_free_energy_for_y)
        cm.dot(self.d.T, self.gpu_target_vectors.slice(c, c + 1),
               target=self.gpu_bias_from_y)
        self.gpu_negative_free_energy_for_y.add_col_vec(self.gpu_bias_from_y)
        self.gpu_negative_free_energy_for_y.transpose(
            target=self.gpu_negative_free_energy.slice(c, c + 1))

    # Subtracting mean for more stable softmax computation
    self.gpu_negative_free_energy.sum(axis=1,
                                      target=self.gpu_mean_negative_free_energy)
    self.gpu_mean_negative_free_energy.divide(-self.n_classes)
    self.gpu_negative_free_energy.add_col_vec(self.gpu_mean_negative_free_energy)
    cm.exp(self.gpu_negative_free_energy, target=self.gpu_negative_free_energy)
    self.gpu_negative_free_energy.sum(axis=1, target=self.gpu_p_y_given_x_norm)
    for c in range(self.n_classes):
        self.gpu_negative_free_energy.slice(c, c + 1).divide(
            self.gpu_p_y_given_x_norm,
            target=self.gpu_p_y_given_x.slice(c, c + 1))
    self.gpu_p_y_given_x.transpose(target=self.gpu_p_y_given_x_trans)
def HMCSample(self, hActs=None):
    if hActs is None:
        hActs = self.hActs
    epsilon = self.hmcStepSize
    if self.stepSizeIsMean:
        # draw epsilon from an exponential distribution with the given mean
        epsilon = -self.hmcStepSize * num.log(1.0 - num.random.rand())
    self.negVis.assign(self.vis)
    # sample a velocity and temporal direction
    self.vel.fill_with_randn()
    timeDir = 2 * num.random.randint(2) - 1
    self.Hamiltonian(self.prevHamil)
    # half-step
    self.acceleration()  # updates self.accel
    self.vel.add_mult(self.accel, -0.5 * timeDir * epsilon)
    self.negVis.add_mult(self.vel, timeDir * epsilon)
    # full leap-frog steps
    for s in range(self.hmcSteps - 1):
        self.acceleration()
        self.vel.add_mult(self.accel, -timeDir * epsilon)
        self.negVis.add_mult(self.vel, timeDir * epsilon)
    # final half-step
    self.acceleration()
    self.vel.add_mult(self.accel, -0.5 * timeDir * epsilon)
    self.negVis.add_mult(self.vel, timeDir * epsilon)
    self.Hamiltonian(self.hamil)
    # compute rejections: accept when rand() < exp(prevHamil - hamil)
    self.prevHamil.subtract(self.hamil, target=self.thresh)
    cm.exp(self.thresh)
    self.tempRow.fill_with_rand()
    # tempRow entries are 0 for reject and 1 for accept
    self.tempRow.less_than(self.thresh, target=self.tempRow)
    self.tempRow.copy_to_host()
    rejRate = 1 - self.tempRow.numpy_array.sum() / float(self.mbsz)
    self.negVis.mult_by_row(self.tempRow)  # zero out rejected columns
    negate(self.tempRow)  # tempRow entries are now 1 for reject and 0 for accept
    self.vis.mult_by_row(self.tempRow, target=self.tempVisMB)
    self.negVis.add(self.tempVisMB)
    smoothing = 0.9
    self.runningAvRej = smoothing * self.runningAvRej + (1.0 - smoothing) * rejRate
    # Adapt the step size toward the target rejection rate. As written, the
    # step size always changes unless it hits the max or min bound.
    if self.runningAvRej < self.targetRejRate:
        self.hmcStepSize = min(self.hmcStepSize * 1.01, self.maxStepSize)
    else:
        self.hmcStepSize = max(self.hmcStepSize * 0.99, self.minStepSize)
def costAndGrad(self, data, labels):
    batchSize = data.shape[1]
    self.setViews(batchSize)

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))
    i = 1
    for w, b in self.stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        if i <= len(self.layerSizes):
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)
    self.deltasC.assign(cm.CUDAMatrix(deltas))
    if skip:
        return cost, self.grad, skip

    # back prop
    nl = len(self.layerSizes)
    i = nl
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(self.stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=self.grad[i][0])
        deltasIn.sum(axis=1, target=self.grad[i][1])
        # compute next layer deltas
        if i > 0:
            self.hActs[i].sign(target=self.tmpGrad)
            cm.dot(w.T, deltasIn, target=deltasOut)
            deltasOut.mult(self.tmpGrad)
        if i == nl:
            deltasIn = self.deltasIn
        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
def logOnePlusExp(x, temp, targ=None):
    """
    When this function is done, targ (or x, if targ is None) should contain
    log(1+exp(x)). We clobber the value of temp. We compute log(1+exp(x)) as
    x + log(1+exp(-x)), which will hopefully be more finite-precision friendly.
    """
    assert x.shape == temp.shape
    x.mult(-1, target=temp)
    cm.exp(temp)
    temp.add(1)
    cm.log(temp)
    x.add(temp, target=targ)
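# A small float32 demonstration (illustrative, not from the original source) of
# why logOnePlusExp rewrites log(1+exp(x)) as x + log(1+exp(-x)): the naive
# form overflows for large x, while the rewritten form stays finite.
import numpy as np

x = np.float32(100.0)
naive = np.log1p(np.exp(x))        # exp(100) overflows float32 -> inf
stable = x + np.log1p(np.exp(-x))  # exp(-100) underflows harmlessly -> 100.0
print(naive, stable)               # inf 100.0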
def negative_free_energy(self, gpu_data):
    """
    Computes the negative free-energy. Outputs a reference to a
    pre-allocated GPU variable containing the result.
    """
    cm.dot(self.W, gpu_data, self.gpu_h)
    self.gpu_h.add_col_vec(self.c)
    # to avoid memory creation, using gpu_h
    # and gpu_h_sample for these computations
    cm.exp(self.gpu_h, self.gpu_h_sample)
    self.gpu_h_sample.add(1.)
    cm.log(self.gpu_h_sample, self.gpu_h)
    self.gpu_h.sum(axis=0, target=self.gpu_negative_free_energy)
    self.gpu_negative_free_energy.add_dot(self.b.T, gpu_data)
    return self.gpu_negative_free_energy
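# NumPy cross-check (a sketch under the same parameter naming as above: W is
# hiddens x visibles, b the visible bias, c the hidden bias) of the RBM
# negative free energy computed there: -F(v) = b^T v + sum_j log(1 + exp(W v + c)_j).
import numpy as np

def negative_free_energy_np(W, b, c, data):
    act = W @ data + c  # hidden pre-activations, one column per sample
    return np.log1p(np.exp(act)).sum(axis=0, keepdims=True) + b.T @ data

W = np.random.randn(8, 5)
b = np.random.randn(5, 1)
c = np.random.randn(8, 1)
v = (np.random.rand(5, 3) > 0.5).astype(np.float64)
print(negative_free_energy_np(W, b, c, v))  # one value per sample, shape (1, 3)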
def test_exp():
    m = 256
    n = 128
    a = np.array(np.random.randn(m, n), dtype=np.float32, order='F')
    b = np.array(np.random.randn(m, n), dtype=np.float32, order='F')
    c = np.exp(a)

    m1 = cm.CUDAMatrix(a)
    m2 = cm.CUDAMatrix(b)
    cm.exp(m1, target=m2)  # out-of-place
    cm.exp(m1)             # in-place
    m1.copy_to_host()
    m2.copy_to_host()

    assert np.max(np.abs(c - m1.numpy_array)) < 10**-4, \
        "Error in cudamat.exp exceeded threshold"
    assert np.max(np.abs(c - m2.numpy_array)) < 10**-4, \
        "Error in cudamat.exp exceeded threshold"
def singleSoftmax(netInputs, tempRow):
    """
    We modify netInputs in place to hold the softmax activation probabilities
    and compute them in a numerically stable way.
    """
    assert tempRow.shape[1] == netInputs.shape[1]
    # Subtract the per-column max before exponentiating; this formulation is
    # faster than the subtract_dot alternative and needs no tempCol parameter.
    netInputs.max(axis=0, target=tempRow)
    tempRow.mult(-1)
    netInputs.add_row_vec(tempRow)
    cm.exp(netInputs)
    netInputs.sum(axis=0, target=tempRow)
    tempRow.reciprocal()
    netInputs.mult_by_row(tempRow)
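# Hedged usage sketch for singleSoftmax (assumes a CUDA device and that
# cudamat is importable as cm, as elsewhere in this file).
import numpy as np
import cudamat as cm

cm.cublas_init()
netInputs = cm.CUDAMatrix(np.random.randn(10, 4).astype(np.float32))
tempRow = cm.empty((1, 4))
singleSoftmax(netInputs, tempRow)       # modifies netInputs in place
print(netInputs.asarray().sum(axis=0))  # each column now sums to ~1
cm.cublas_shutdown()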
def compute_energy_mcRBM_visual(self, data, normdata, energy, VF, FH, bias_cov,
                                bias_vis, w_mean, bias_mean, t1, t2, t6, feat,
                                featsq, feat_mean, length, lengthsq, normcoeff,
                                small, num_vis):
    # normalize input data vectors
    data.mult(data, target=t6)               # DxP (nr input dims x nr samples)
    t6.sum(axis=0, target=lengthsq)          # 1xP
    lengthsq.mult(0.5, target=energy)        # energy of quadratic regularization term
    lengthsq.mult(1. / num_vis)              # normalize by number of components (like std)
    lengthsq.add(small)                      # small prevents division by 0
    cmt.sqrt(lengthsq, target=length)
    length.reciprocal(target=normcoeff)      # 1xP
    data.mult_by_row(normcoeff, target=normdata)  # normalized data

    ## potential
    # covariance contribution
    cmt.dot(VF.T, normdata, target=feat)     # HxP (nr factors x nr samples)
    feat.mult(feat, target=featsq)           # HxP
    cmt.dot(FH.T, featsq, target=t1)         # OxP (nr cov hiddens x nr samples)
    t1.mult(-0.5)
    t1.add_col_vec(bias_cov)                 # OxP
    cmt.exp(t1)                              # OxP
    t1.add(1, target=t2)                     # OxP
    cmt.log(t2)
    t2.mult(-1)
    energy.add_sums(t2, axis=0)
    # mean contribution
    cmt.dot(w_mean.T, data, target=feat_mean)  # HxP (nr mean hiddens x nr samples)
    feat_mean.add_col_vec(bias_mean)         # HxP
    cmt.exp(feat_mean)
    feat_mean.add(1)
    cmt.log(feat_mean)
    feat_mean.mult(-1)
    energy.add_sums(feat_mean, axis=0)
    # visible bias term
    data.mult_by_col(bias_vis, target=t6)
    t6.mult(-1)                              # DxP
    energy.add_sums(t6, axis=0)              # 1xP
    # kinetic
    data.mult(data, target=t6)
    energy.add_sums(t6, axis=0, mult=.5)
def softmax(eta):
    temp = cm.empty((1, eta.shape[1]))
    # Subtract the per-column max first; exponentiating the raw inputs is a
    # potential numerical problem.
    eta.max(axis=0, target=temp)
    temp.mult(-1)
    eta.add_row_vec(temp)
    cm.exp(eta)
    eta.sum(axis=0, target=temp)
    temp.reciprocal()
    eta.mult_by_row(temp)
def project_words_gpu(projection_matrix, similarity_matrix, kernel_name, hyperparam):
    import cudamat as cm
    if kernel_name == "poly":
        k = cm.pow(cm.CUDAMatrix(similarity_matrix), hyperparam)
    elif kernel_name == 'rbf':
        k = cm.exp(cm.pow(cm.CUDAMatrix(1 - similarity_matrix), 2).mult(-hyperparam))
    else:
        raise NotImplementedError(f'{kernel_name} not yet implemented for GPU')
    return cm.dot(k, cm.CUDAMatrix(projection_matrix)).asarray()
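# A CPU reference (a sketch, not part of the original API) matching the GPU
# kernels in project_words_gpu, useful for testing on small inputs.
import numpy as np

def project_words_cpu(projection_matrix, similarity_matrix, kernel_name, hyperparam):
    if kernel_name == "poly":
        k = similarity_matrix ** hyperparam
    elif kernel_name == 'rbf':
        k = np.exp(-hyperparam * (1 - similarity_matrix) ** 2)
    else:
        raise NotImplementedError(f'{kernel_name} not yet implemented')
    return k @ projection_matrix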
def costAndGrad(self, data, labels=None, sentence=None):
    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        stack = self.stack[:-2]
        wtf, _ = self.stack[-2]
        wtb, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-2]
            dwtf, _ = self.grad[-2]
            dwtb, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    # TODO copy to device here
    self.hActs[0].assign(cm.CUDAMatrix(data))
    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            self.hActsFor.assign(self.hActs[i])
            self.hActsBack.assign(self.hActs[i])
            self.hActsFor.minmax(0.0, self.maxAct, col=0)
            self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
            for t in xrange(1, T):
                cm.mvdot_col_slice(wtf, self.hActsFor, t - 1,
                                   self.hActsFor, t, beta=1.0)
                self.hActsFor.minmax(0.0, self.maxAct, col=t)
                cm.mvdot_col_slice(wtb, self.hActsBack, T - t,
                                   self.hActsBack, T - t - 1, beta=1.0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
            self.hActsFor.add(self.hActsBack, target=self.hActs[i])
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        probs = self.probs.numpy_array
        return probs

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)
    if self.reg > 0:
        self.regcost = 0.0
        for w, b in self.stack:
            rc = (self.reg / 2.0) * (w.euclid_norm()**2)
            self.regcost += rc
            cost = cost + rc

    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        # gradient for w
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        if self.reg > 0:
            grad[i][0].add_mult(w, alpha=self.reg)
        # gradient for b
        deltasIn.sum(axis=1, target=grad[i][1])
        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)
        # backprop through time
        if i == self.temporalLayer:
            self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
            self.hActsBack.within(0.0, self.maxAct, target=self.tmpGradBack)
            self.deltasFor.assign(deltasOut)
            self.deltasBack.assign(deltasOut)
            self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
            self.deltasBack.mult_slice(0, self.tmpGradBack, 0)
            for t in xrange(1, T):
                # Add in temporal delta
                cm.mvdot_col_slice(wtf.T, self.deltasFor, T - t,
                                   self.deltasFor, T - t - 1, beta=1.0)
                cm.mvdot_col_slice(wtb.T, self.deltasBack, t - 1,
                                   self.deltasBack, t, beta=1.0)
                # Push through activation fn
                self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor, T - t - 1)
                self.deltasBack.mult_slice(t, self.tmpGradBack, t)
            # Accumulate temporal gradient
            cm.dot(self.deltasFor.get_col_slice(1, T),
                   self.hActsFor.get_col_slice(0, T - 1).T, target=dwtf)
            cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                   self.hActsBack.get_col_slice(1, T).T, target=dwtb)
            # Accumulate next layer deltas
            self.deltasFor.add(self.deltasBack, target=deltasOut)
        if i > 0 and i != self.temporalLayer:
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)
        if i == self.numLayers:
            deltasIn = self.deltasIn
        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    if self.reg > 0:
        if self.temporalLayer > 0:
            dwtf.add_mult(wtf, alpha=self.reg)
            dwtb.add_mult(wtb, alpha=self.reg)

    return cost, self.grad, skip
def draw_HMC_samples(self, data, negdata, normdata, vel, gradient, normgradient,
                     new_energy, old_energy, VF, FH, bias_cov, bias_vis, w_mean,
                     bias_mean, hmc_step, hmc_step_nr, hmc_ave_rej,
                     hmc_target_ave_rej, t1, t2, t3, t4, t5, t6, t7, thresh,
                     feat, featsq, batch_size, feat_mean, length, lengthsq,
                     normcoeff, small, num_vis):
    vel.fill_with_randn()
    negdata.assign(data)
    self.compute_energy_mcRBM(negdata, normdata, vel, old_energy, VF, FH,
                              bias_cov, bias_vis, w_mean, bias_mean, t1, t2, t6,
                              feat, featsq, feat_mean, length, lengthsq,
                              normcoeff, small, num_vis)
    self.compute_gradient_mcRBM(negdata, normdata, VF, FH, bias_cov, bias_vis,
                                w_mean, bias_mean, t1, t2, t3, t4, t6, feat,
                                featsq, feat_mean, gradient, normgradient,
                                length, lengthsq, normcoeff, small, num_vis)
    # half step
    vel.add_mult(gradient, -0.5 * hmc_step)
    negdata.add_mult(vel, hmc_step)
    # full leap-frog steps
    for ss in range(hmc_step_nr - 1):
        # re-evaluate the gradient
        self.compute_gradient_mcRBM(negdata, normdata, VF, FH, bias_cov,
                                    bias_vis, w_mean, bias_mean, t1, t2, t3, t4,
                                    t6, feat, featsq, feat_mean, gradient,
                                    normgradient, length, lengthsq, normcoeff,
                                    small, num_vis)
        # update variables
        vel.add_mult(gradient, -hmc_step)
        negdata.add_mult(vel, hmc_step)
    # final half-step
    self.compute_gradient_mcRBM(negdata, normdata, VF, FH, bias_cov, bias_vis,
                                w_mean, bias_mean, t1, t2, t3, t4, t6, feat,
                                featsq, feat_mean, gradient, normgradient,
                                length, lengthsq, normcoeff, small, num_vis)
    vel.add_mult(gradient, -0.5 * hmc_step)
    # compute new energy
    self.compute_energy_mcRBM(negdata, normdata, vel, new_energy, VF, FH,
                              bias_cov, bias_vis, w_mean, bias_mean, t1, t2, t6,
                              feat, featsq, feat_mean, length, lengthsq,
                              normcoeff, small, num_vis)
    # rejection
    old_energy.subtract(new_energy, target=thresh)
    cmt.exp(thresh)
    t4.fill_with_rand()
    t4.less_than(thresh)
    # update negdata and rejection rate
    t4.mult(-1)
    t4.add(1)  # now 1's detect rejections
    t4.sum(axis=1, target=t5)
    t5.copy_to_host()
    rej = t5.numpy_array[0, 0] / batch_size
    data.mult_by_row(t4, target=t6)
    negdata.mult_by_row(t4, target=t7)
    negdata.subtract(t7)
    negdata.add(t6)
    hmc_ave_rej = 0.9 * hmc_ave_rej + 0.1 * rej
    if hmc_ave_rej < hmc_target_ave_rej:
        hmc_step = min(hmc_step * 1.01, 0.25)
    else:
        hmc_step = max(hmc_step * 0.99, .001)
    return hmc_step, hmc_ave_rej
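# A minimal NumPy sketch (illustrative only) of the per-sample Metropolis
# acceptance rule used by both HMCSample and draw_HMC_samples above: a proposal
# is accepted when rand() < exp(old_energy - new_energy).
import numpy as np

def metropolis_accept(old_energy, new_energy):
    thresh = np.exp(old_energy - new_energy)
    return np.random.rand(*old_energy.shape) < thresh  # True = accept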
def costAndGrad(self, data, labels=None):
    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))
    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1, T):
                self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                cm.mvdot_col_slice(wt, self.hActs[i], t - 1,
                                   self.hActs[i], t, beta=1.0)
            self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)
    if skip:
        return cost, self.grad, skip
    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        deltasIn.sum(axis=1, target=grad[i][1])
        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)
        # backprop through time
        if i == self.temporalLayer:
            self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T - 1, 0, -1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T, self.deltaTemp, t, deltasOut, t, beta=1.0)
                # Push through activation fn
                deltasOut.mult_slice(t, self.tmpGrad, t)
                self.deltaTemp.set_single_col(t - 1, deltasOut, t)
            # Accumulate temporal gradient
            cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)
            cm.mvdot_col_slice(wt.T, self.deltaTemp, 0, deltasOut, 0, beta=1.0)
            deltasOut.mult_slice(0, self.tmpGrad, 0)
        if i > 0 and i != self.temporalLayer:
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)
        if i == self.numLayers:
            deltasIn = self.deltasIn
        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
def ff(x0_cpu):
    data_size = x0_cpu.shape[1]
    x_l0 = cm.empty((num_input, data_size))
    x_l0.assign(cm.CUDAMatrix(x0_cpu))

    x_l1 = cm.empty((num_hid, data_size))
    cm.dot(w1.T, x_l0, target=x_l1)
    x_l1.add_col_vec(b1)
    x_l1.apply_sigmoid()

    x_l2 = cm.empty((num_hid, data_size))
    del x_l0
    cm.dot(w2.T, x_l1, target=x_l2)
    x_l2.add_col_vec(b2)
    x_l2.apply_sigmoid()

    x_l3 = cm.empty((num_hid, data_size))
    del x_l1
    cm.dot(w3.T, x_l2, target=x_l3)
    x_l3.add_col_vec(b3)
    x_l3.apply_sigmoid()

    x_l4 = cm.empty((num_hid, data_size))
    del x_l2
    cm.dot(w4.T, x_l3, target=x_l4)
    x_l4.add_col_vec(b4)
    x_l4.apply_sigmoid()

    x_l5 = cm.empty((num_hid, data_size))
    del x_l3
    cm.dot(w5.T, x_l4, target=x_l5)
    x_l5.add_col_vec(b5)
    x_l5.apply_sigmoid()

    x_output = cm.empty((num_output, data_size))
    del x_l4
    tmp_x_output = cm.empty((num_output, data_size))
    tmp_x_output_sums = cm.empty((1, data_size))
    cm.dot(wo.T, x_l5, target=tmp_x_output)
    tmp_x_output.add_col_vec(bo)
    # softmax over the output layer
    cm.exp(tmp_x_output)
    tmp_x_output.sum(axis=0, target=tmp_x_output_sums)
    tmp_x_output_sums.reciprocal()
    tmp_x_output.mult_by_row(tmp_x_output_sums)
    x_output.assign(tmp_x_output)
    # divide by the state priors and convert to log10 scale
    x_output.mult_by_col(state_prior_gpu_rec)
    cm.log(x_output)
    x_output.mult(1. / np.log(10))

    xo = x_output.asarray()
    return xo
def sinkhorn(a, b, M_GPU, reg, numItermax=1000, stopThr=1e-9, verbose=False,
             log=False, returnAsGPU=False):
    r"""
    Solve the entropic regularization optimal transport problem on GPU.

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    where:

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega` is the entropic regularization term
      :math:`\Omega(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the Sinkhorn-Knopp matrix
    scaling algorithm as proposed in [2]_.

    Parameters
    ----------
    a : np.ndarray (ns,)
        samples weights in the source domain
    b : np.ndarray (nt,)
        samples in the target domain
    M_GPU : cudamat.CUDAMatrix (ns,nt)
        loss matrix
    reg : float
        Regularization term >0
    numItermax : int, optional
        Max number of iterations
    stopThr : float, optional
        Stop threshold on error (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True
    returnAsGPU : bool, optional
        return the OT matrix as a cudamat.CUDAMatrix

    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters
    log : dict
        log dictionary returned only if log==True in parameters

    Examples
    --------
    >>> import ot
    >>> a=[.5,.5]
    >>> b=[.5,.5]
    >>> M=[[0.,1.],[1.,0.]]
    >>> ot.sinkhorn(a,b,M,1)
    array([[ 0.36552929,  0.13447071],
           [ 0.13447071,  0.36552929]])

    References
    ----------
    .. [2] M. Cuturi, Sinkhorn Distances : Lightspeed Computation of Optimal
       Transport, Advances in Neural Information Processing Systems (NIPS) 26,
       2013

    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.optim.cg : General regularized OT
    """
    # init data
    Nini = len(a)
    Nfin = len(b)

    if log:
        log = {'err': []}

    # we assume that no distances are null except those of the diagonal of
    # distances
    u = (np.ones(Nini) / Nini).reshape((Nini, 1))
    u_GPU = cudamat.CUDAMatrix(u)
    a_GPU = cudamat.CUDAMatrix(a.reshape((Nini, 1)))
    ones_GPU = cudamat.empty(u_GPU.shape).assign(1)
    v = (np.ones(Nfin) / Nfin).reshape((Nfin, 1))
    v_GPU = cudamat.CUDAMatrix(v)
    b_GPU = cudamat.CUDAMatrix(b.reshape((Nfin, 1)))

    M_GPU.divide(-reg)
    K_GPU = cudamat.exp(M_GPU)

    ones_GPU.divide(a_GPU, target=a_GPU)
    Kp_GPU = cudamat.empty(K_GPU.shape)
    K_GPU.mult_by_col(a_GPU, target=Kp_GPU)

    tmp_GPU = cudamat.empty(K_GPU.shape)

    cpt = 0
    err = 1
    while (err > stopThr and cpt < numItermax):
        uprev_GPU = u_GPU.copy()
        vprev_GPU = v_GPU.copy()

        KtransposeU_GPU = K_GPU.transpose().dot(u_GPU)
        b_GPU.divide(KtransposeU_GPU, target=v_GPU)
        ones_GPU.divide(Kp_GPU.dot(v_GPU), target=u_GPU)

        if (np.any(KtransposeU_GPU.asarray() == 0) or
                not u_GPU.allfinite() or not v_GPU.allfinite()):
            # we have reached the machine precision
            # come back to previous solution and quit loop
            print('Warning: numerical errors at iteration', cpt)
            u_GPU = uprev_GPU.copy()
            v_GPU = vprev_GPU.copy()
            break

        if cpt % 10 == 0:
            # we can speed up the process by checking for the error only all
            # the 10th iterations
            K_GPU.mult_by_col(u_GPU, target=tmp_GPU)
            tmp_GPU.mult_by_row(v_GPU.transpose(), target=tmp_GPU)

            bcopy_GPU = b_GPU.copy().transpose()
            bcopy_GPU.add_sums(tmp_GPU, axis=0, beta=-1)
            err = bcopy_GPU.euclid_norm()**2

            if log:
                log['err'].append(err)

            if verbose:
                if cpt % 200 == 0:
                    print('{:5s}|{:12s}'.format('It.', 'Err') + '\n' + '-' * 19)
                print('{:5d}|{:8e}|'.format(cpt, err))
        cpt += 1

    if log:
        log['u'] = u_GPU.asarray()
        log['v'] = v_GPU.asarray()

    K_GPU.mult_by_col(u_GPU, target=K_GPU)
    K_GPU.mult_by_row(v_GPU.transpose(), target=K_GPU)

    if returnAsGPU:
        res = K_GPU
    else:
        res = K_GPU.asarray()

    if log:
        return res, log
    else:
        return res
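# Example call (a sketch; assumes an initialized cudamat context, e.g. after
# cudamat.cublas_init(); note that sinkhorn modifies M_GPU in place). Mirrors
# the 2x2 example from the docstring above.
import numpy as np
import cudamat

a = np.array([.5, .5])
b = np.array([.5, .5])
M_GPU = cudamat.CUDAMatrix(np.array([[0., 1.], [1., 0.]]))
G = sinkhorn(a, b, M_GPU, reg=1.)
print(G)  # approximately [[0.3655, 0.1345], [0.1345, 0.3655]]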
def score_samples(self, X, temp_gpu_mem=None):
    '''Return the per-sample likelihood of the data under the model.

    Compute the log probability of X under the model and return the
    posterior probability of each mixture component for each element of X.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_dimensions)
        Array of n_samples data points. Each row corresponds to a single
        data point.

    Returns
    -------
    logprob_Nx1 : array_like, shape (n_samples,)
        Log probabilities of each data point in X.

    posteriors : array_like, shape (n_samples, n_components)
        Posterior probability of each mixture component for each sample.
    '''
    if None in (self.weights, self.means, self.covars):
        raise ValueError('GMM parameters have not been initialized')

    if X.shape[1] != self.n_dimensions:
        raise ValueError('input data matrix X is of shape %s, should be %s'
                         % (X.shape, (X.shape[0], self.n_dimensions)))

    N = X.shape[0]
    if temp_gpu_mem is None:
        temp_gpu_mem = TempGPUMem()
    temp_gpu_mem.alloc(N, self.n_components, self.n_dimensions)

    # lpr = log_multivariate_normal_density() + np.log(self.weights)[None, :]
    # -----------------------------------------------------------------------
    posteriors_NxK = log_multivariate_normal_density(
        X, self.means, self.covars, self.covariance_type, temp_gpu_mem)

    # lpr += np.log(self.weights)
    temp_Kx1 = temp_gpu_mem['temp_Kx1']
    cm.log(self.weights, target=temp_Kx1)
    temp_Kx1.reshape((1, self.n_components))  # transpose
    posteriors_NxK.add_row_vec(temp_Kx1)
    temp_Kx1.reshape((self.n_components, 1))  # original shape
    # in use: lpr -> 'NxK'

    # logprob_Nx1 = np.log(np.sum(np.exp(lpr - vmax), axis=1)) + vmax
    # -----------------------------------------------------------------------
    vmax_Nx1 = temp_gpu_mem['vmax_Nx1']
    logprob_Nx1 = temp_gpu_mem['logprob_Nx1']

    # vmax_Nx1 = np.max(lpr, axis=1)
    posteriors_NxK.max(axis=1, target=vmax_Nx1)
    # lpr -= vmax_Nx1[:, None]
    posteriors_NxK.add_col_mult(vmax_Nx1, -1.0)
    # posteriors_NxK = np.exp(posteriors_NxK)
    cm.exp(posteriors_NxK)
    # logprob_Nx1 = np.sum(posteriors_NxK, axis=1)
    posteriors_NxK.sum(axis=1, target=logprob_Nx1)
    # posteriors_NxK /= logprob_Nx1[:, None]
    posteriors_NxK.div_by_col(logprob_Nx1)
    # logprob_Nx1 = np.log(logprob_Nx1)
    cm.log(logprob_Nx1, target=logprob_Nx1)
    # logprob_Nx1 += vmax_Nx1
    logprob_Nx1.add(vmax_Nx1)

    return logprob_Nx1, posteriors_NxK
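# NumPy sketch (illustrative) of the log-sum-exp pattern score_samples
# implements on the GPU: turn per-component log densities lpr into per-sample
# log-likelihoods and normalized posteriors.
import numpy as np

def logsumexp_posteriors_np(lpr_NxK):
    vmax = lpr_NxK.max(axis=1, keepdims=True)
    p = np.exp(lpr_NxK - vmax)
    norm = p.sum(axis=1, keepdims=True)
    logprob_Nx1 = np.log(norm) + vmax
    return logprob_Nx1.ravel(), p / norm  # posterior rows each sum to 1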
def rbm_update(self, gpu_data, gpu_target_for_data=None, n_first_update=None):
    is_labeled = gpu_target_for_data is not None
    if n_first_update is not None:
        gpu_data.slice(n_first_update, self.minibatch_size).assign(0)

    self.dW.mult(self.momentum)
    self.dU.mult(self.momentum)
    self.dc.mult(self.momentum)
    self.db.mult(self.momentum)
    self.dd.mult(self.momentum)

    # Computes p(y|x). This method fills in
    # self.gpu_p_y_given_x and self.gpu_p_y_given_x_trans with the result.
    # It also computes self.gpu_act_from_x.
    self.compute_output(gpu_data)

    if gpu_target_for_data is not None:
        # Compute discriminative gradient
        self.gpu_p_y_given_x_trans.subtract(gpu_target_for_data, self.gpu_doutput)
        if n_first_update is not None:
            # Making sure gradient is non-zero only for the n_first_update first examples
            self.gpu_doutput.slice(n_first_update, self.minibatch_size).assign(0)

        self.gpu_doutput.sum(axis=1, target=self.gpu_doutput_sum)
        self.dd.add_dot(self.gpu_target_vectors, self.gpu_doutput_sum)

        self.gpu_dhidact.assign(0)
        self.gpu_doutput.transpose(self.gpu_doutput_trans)
        for c in range(self.n_classes):
            cm.dot(self.U, self.gpu_target_vectors.slice(c, c + 1),
                   self.gpu_act_from_y)
            # to avoid memory creation, using gpu_h
            # and gpu_h_sample for these computations
            self.gpu_act_from_x.add_col_vec(self.gpu_act_from_y, target=self.gpu_h)
            self.gpu_h.apply_sigmoid()
            self.gpu_doutput_trans.slice(c, c + 1).transpose(
                target=self.gpu_doutput_row)
            self.gpu_h.mult_by_row(self.gpu_doutput_row)
            self.gpu_dhidact.add(self.gpu_h)
            self.dc.add_sums(self.gpu_h, axis=1)
            self.gpu_h.sum(axis=1, target=self.gpu_dhidact_sum)
            self.dU.add_dot(self.gpu_dhidact_sum,
                            self.gpu_target_vectors.slice(c, c + 1).T)
        self.dW.add_dot(self.gpu_dhidact, gpu_data.T)
    else:
        # Sample a y according to p(y|x)
        # ... actually, we use the softmax probs, it's much simpler
        gpu_target_for_data = self.gpu_p_y_given_x_trans

    if (is_labeled and self.gen_learning_weight > 0) or \
       (not is_labeled and self.semisup_learning_weight > 0):
        self.cd_W.assign(0)
        self.cd_U.assign(0)
        self.cd_c.assign(0)
        self.cd_b.assign(0)
        self.cd_d.assign(0)

        # Positive phase
        cm.dot(self.W, gpu_data, self.gpu_h)
        self.gpu_h.add_col_vec(self.c)
        cm.dot(self.gpu_target_vectors, gpu_target_for_data,
               self.gpu_target_vec_pos)
        self.gpu_h.add_dot(self.U, self.gpu_target_vec_pos)
        self.gpu_h.apply_sigmoid()
        if n_first_update is not None:
            # A simple fix for having a non-zero gradient only for n_first_update examples
            self.gpu_target_vec_pos.slice(n_first_update,
                                          self.minibatch_size).assign(0)
            self.gpu_h.slice(n_first_update, self.minibatch_size).assign(0)

        self.cd_W.subtract_dot(self.gpu_h, gpu_data.T)
        self.cd_U.subtract_dot(self.gpu_h, self.gpu_target_vec_pos.T)
        self.cd_c.add_sums(self.gpu_h, axis=1, mult=-1.)
        self.cd_b.add_sums(gpu_data, axis=1, mult=-1.)
        self.cd_d.add_sums(self.gpu_target_vec_pos, axis=1, mult=-1.)

        # Negative phase
        if self.use_persistent_chain:
            cm.dot(self.W, self.gpu_x_persistent, self.gpu_h)
            self.gpu_h.add_col_vec(self.c)
            self.gpu_h.add_dot(self.U, self.gpu_y_persistent)
            self.gpu_h.apply_sigmoid()

        for it in range(self.n_gibbs_steps):
            self.gpu_h_sample.fill_with_rand()
            self.gpu_h_sample.less_than(self.gpu_h)

            # Down pass
            cm.dot(self.W.T, self.gpu_h_sample, self.gpu_x)
            self.gpu_x.add_col_vec(self.b)
            self.gpu_x.apply_sigmoid()
            self.gpu_x_sample.fill_with_rand()
            self.gpu_x_sample.less_than(self.gpu_x)
            cm.dot(self.U.T, self.gpu_h_sample, self.gpu_y)
            self.gpu_y.add_col_vec(self.d)
            cm.dot(self.gpu_target_vectors.T, self.gpu_y, self.gpu_target_vec_neg)
            # Softmax over classes, stabilized by subtracting the mean first.
            self.gpu_target_vec_neg.transpose(self.gpu_y_trans)
            self.gpu_y_trans.sum(axis=1, target=self.gpu_y_trans_mean)
            self.gpu_y_trans_mean.divide(-self.n_classes)
            self.gpu_y_trans.add_col_vec(self.gpu_y_trans_mean)
            cm.exp(self.gpu_y_trans, target=self.gpu_y_trans)
            self.gpu_y_trans.sum(axis=1, target=self.gpu_y_trans_norm)
            for c in range(self.n_classes):
                self.gpu_y_trans.slice(c, c + 1).divide(self.gpu_y_trans_norm)
            self.gpu_y_trans.transpose(self.gpu_y)

            # Up pass
            cm.dot(self.W, self.gpu_x_sample, self.gpu_h)
            self.gpu_h.add_col_vec(self.c)
            cm.dot(self.gpu_target_vectors, self.gpu_y, self.gpu_target_vec_neg)
            self.gpu_h.add_dot(self.U, self.gpu_target_vec_neg)
            self.gpu_h.apply_sigmoid()

        if self.use_persistent_chain:
            # Remember Gibbs chain's state
            self.gpu_x_persistent.assign(self.gpu_x_sample)
            self.gpu_y_persistent.assign(self.gpu_target_vec_neg)

        if n_first_update is not None:
            self.gpu_x_sample.slice(n_first_update, self.minibatch_size).assign(0)
            self.gpu_target_vec_neg.slice(n_first_update,
                                          self.minibatch_size).assign(0)
            self.gpu_h.slice(n_first_update, self.minibatch_size).assign(0)

        self.cd_W.add_dot(self.gpu_h, self.gpu_x_sample.T)
        self.cd_U.add_dot(self.gpu_h, self.gpu_target_vec_neg.T)
        self.cd_c.add_sums(self.gpu_h, axis=1)
        self.cd_b.add_sums(self.gpu_x_sample, axis=1)
        self.cd_d.add_sums(self.gpu_target_vec_neg, axis=1)

        # Update RBM
        if is_labeled:
            alpha = self.gen_learning_weight
        else:
            alpha = self.semisup_learning_weight
        self.dW.add_mult(self.cd_W, alpha=alpha)
        self.dU.add_mult(self.cd_U, alpha=alpha)
        self.dc.add_mult(self.cd_c, alpha=alpha)
        self.db.add_mult(self.cd_b, alpha=alpha)
        self.dd.add_mult(self.cd_d, alpha=alpha)

    if n_first_update is None:
        lr = self.learning_rate / self.minibatch_size
    else:
        lr = self.learning_rate / n_first_update
    self.W.add_mult(self.dW, alpha=-lr)
    self.U.add_mult(self.dU, alpha=-lr)
    self.c.add_mult(self.dc, alpha=-lr)
    self.b.add_mult(self.db, alpha=-lr)
    self.d.add_mult(self.dd, alpha=-lr)