def log(self, x):
    # Logistic sigmoid computed in place on tmp1: 1 / (1 + exp(-x)).
    self.tmp1.assign(x)
    self.tmp1.mult(-1)
    cm.exp(self.tmp1, target=self.tmp1)
    self.tmp1.add(1)
    cm.pow(self.tmp1, -1)
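# For context, the same logistic-sigmoid pattern as a standalone helper -- a
# minimal sketch, not part of the class above; it assumes an initialised
# cudamat context and that x_gpu and out are CUDAMatrix objects of equal shape.
import cudamat as cm

def sigmoid_via_pow(x_gpu, out):
    # out <- 1 / (1 + exp(-x)), built from mult/exp/add/pow as in log() above
    # (cudamat also provides a dedicated cm.sigmoid kernel).
    out.assign(x_gpu)
    out.mult(-1)
    cm.exp(out, target=out)
    out.add(1)
    cm.pow(out, -1)
    return out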
def costAndGrad(self, data, labels):
    batchSize = data.shape[1]
    self.setViews(batchSize)

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))
    i = 1
    for w, b in self.stack:
        cm.dot(w, self.hActs[i-1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        if i <= len(self.layerSizes):
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)
    self.deltasC.assign(cm.CUDAMatrix(deltas))

    if skip:
        return cost, self.grad, skip

    # back prop
    nl = len(self.layerSizes)
    i = nl
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(self.stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=self.grad[i][0])
        deltasIn.sum(axis=1, target=self.grad[i][1])

        # compute next layer deltas
        if i > 0:
            self.hActs[i].sign(target=self.tmpGrad)
            cm.dot(w.T, deltasIn, target=deltasOut)
            deltasOut.mult(self.tmpGrad)

        if i == nl:
            deltasIn = self.deltasIn

        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
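# The column-wise softmax computed in the forward pass above (subtract the
# per-column max for numerical stability, exponentiate, normalise), restated
# with plain NumPy as a reference sketch; `acts` stands in for the final-layer
# activations.
import numpy as np

acts = np.random.randn(6, 4)
probs = np.exp(acts - acts.max(axis=0, keepdims=True))
probs /= probs.sum(axis=0, keepdims=True)
assert np.allclose(probs.sum(axis=0), 1.0)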
def pairwiseEuclideanGPU(a, b, returnAsGPU=False, squared=False):
    """
    Compute the pairwise euclidean distance between matrices a and b.

    Parameters
    ----------
    a : np.ndarray (n, f)
        first matrix
    b : np.ndarray (m, f)
        second matrix
    returnAsGPU : boolean, optional (default False)
        if True, returns a cudamat matrix still on the GPU, else returns an
        np.ndarray
    squared : boolean, optional (default False)
        if True, return the squared euclidean distance matrix

    Returns
    -------
    c : (n x m) np.ndarray or cudamat.CUDAMatrix
        pairwise euclidean distance matrix
    """
    # a is shape (n, f) and b shape (m, f). Return matrix c of shape (n, m).
    # First compute in c_GPU the squared euclidean distance, then return its
    # square root. At each cell [i, j] of c, we want
    # sum_{k in range(f)} (a[i, k] - b[j, k])^2. Since
    # (a - b)^2 = a^2 - 2ab + b^2, each cell of c should hold
    # sum_{k in range(f)} (a[i, k]^2 - 2 a[i, k] b[j, k] + b[j, k]^2).
    a_GPU = cudamat.CUDAMatrix(a)
    b_GPU = cudamat.CUDAMatrix(b)

    # Multiply a by b transpose to obtain in each cell [i, j] of c the
    # value sum_{k in range(f)} a[i, k] b[j, k]
    c_GPU = cudamat.dot(a_GPU, b_GPU.transpose())
    # multiply by -2 to get sum_{k in range(f)} -2 a[i, k] b[j, k]
    c_GPU.mult(-2)

    # Compute the vectors of the sum of squared elements.
    a_GPU = cudamat.pow(a_GPU, 2).sum(axis=1)
    b_GPU = cudamat.pow(b_GPU, 2).sum(axis=1)

    # Add the vectors to each column (respectively row) of c.
    # sum_{k in range(f)} (a[i, k]^2 - 2 a[i, k] b[j, k])
    c_GPU.add_col_vec(a_GPU)
    # sum_{k in range(f)} (a[i, k]^2 - 2 a[i, k] b[j, k] + b[j, k]^2)
    c_GPU.add_row_vec(b_GPU.transpose())

    if not squared:
        c_GPU = cudamat.sqrt(c_GPU)

    if returnAsGPU:
        return c_GPU
    else:
        return c_GPU.asarray()
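# A minimal usage sketch for pairwiseEuclideanGPU, assuming cudamat has been
# initialised and that SciPy is available for the CPU reference; the shapes
# and tolerance are illustrative only.
import numpy as np
import cudamat
from scipy.spatial.distance import cdist

cudamat.cublas_init()
a = np.random.rand(5, 3).astype(np.float32)
b = np.random.rand(4, 3).astype(np.float32)
d_gpu = pairwiseEuclideanGPU(a, b)   # (5, 4) np.ndarray
assert np.allclose(d_gpu, cdist(a, b), atol=1e-4)
cudamat.shutdown()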
def project_words_gpu(projection_matrix, similarity_matrix, kernel_name, hyperparam):
    import cudamat as cm
    if kernel_name == "poly":
        k = cm.pow(cm.CUDAMatrix(similarity_matrix), hyperparam)
    elif kernel_name == 'rbf':
        k = cm.exp((cm.pow(cm.CUDAMatrix(1 - similarity_matrix), 2)).mult(-hyperparam))
    else:
        raise NotImplementedError(f'{kernel_name} not yet implemented for GPU')
    return cm.dot(k, cm.CUDAMatrix(projection_matrix)).asarray()
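# NumPy equivalents of the two kernels above, as a reference sketch; `sim` is
# an illustrative similarity matrix and `h` the hyperparameter.
import numpy as np

sim = np.random.rand(3, 3)
h = 2.0
k_poly = sim ** h                   # "poly" branch: cm.pow(sim, h)
k_rbf = np.exp(-h * (1 - sim)**2)   # "rbf" branch: exp(-h * (1 - sim)^2)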
def test_pow():
    m = 256
    n = 128
    a = np.array(np.random.randn(m, n)*20, dtype=np.float32, order='F')
    b = np.array(np.random.rand(m, n), dtype=np.float32, order='F')
    p = 2

    c = a**p

    m1 = cm.CUDAMatrix(a)
    m2 = cm.CUDAMatrix(b)

    cm.pow(m1, p, target=m2)
    cm.pow(m1, p)
    m1.copy_to_host()
    m2.copy_to_host()

    assert np.max(np.abs(c - m1.numpy_array)) < 10**-3, "Error in cudamat.pow exceeded threshold"
    assert np.max(np.abs(c - m2.numpy_array)) < 10**-3, "Error in cudamat.pow exceeded threshold"
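# The two call forms exercised by the test, shown in isolation -- a sketch that
# assumes `cm`/`np` are imported as above and cm.cublas_init() has been called.
x = cm.CUDAMatrix(np.arange(1.0, 5.0, dtype=np.float32).reshape(2, 2))
y = cm.empty(x.shape)
cm.pow(x, 3, target=y)   # y <- x**3, x left untouched
cm.pow(x, 3)             # no target: x <- x**3 in place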
def Train(self, ref):
    # ref is the vector of all desired outputs at the given time instant.
    # compute the error vector
    e = self.trainingError(ref)
    max_lambda = 0.9999
    min_lambda = 0.999
    # regularization
    mu = 1e-8
    #holder = cm.CUDAMatrix(self.P.asarray())
    for saida in range(self.n_out):
        # regularization step
        #cm.dot(self.P, self.P, target=holder)
        #holder.mult(mu)
        #self.P.subtract(holder)
        # end regularization step
        self.sigma_e = (1.0 - 1.0/(self.K_a * self.neu)) * self.sigma_e + \
            (1.0 - (1.0 - 1.0/(self.K_a * self.neu))) * e[saida]**2
        self.sigma_q = (cm.pow(cm.dot(cm.dot(self.a.T, self.P), self.a), 2)
                        .mult((1.0 - (1.0 - 1.0/(self.K_a * self.neu))))
                        .add((1.0 - 1.0/(self.K_a * self.neu)) * float(self.sigma_q))).asarray()
        self.sigma_v = (1.0 - 1.0/(self.K_b * self.neu)) * self.sigma_v + \
            (1.0 - (1.0 - 1.0/(self.K_b * self.neu))) * e[saida]**2
        self.forget_aux = (np.sqrt(self.sigma_q) * np.sqrt(self.sigma_v)) / \
            (1e-8 + abs(np.sqrt(self.sigma_e) - np.sqrt(self.sigma_v)))
        self.forget = np.atleast_2d(np.min([self.forget_aux, max_lambda]))

        # Transpose respective output view.
        Theta = self.Wro.asarray()[saida, :]
        Theta = Theta.reshape([self.neu, 1])
        Theta = cm.CUDAMatrix(Theta)

        # MQR (recursive least squares) equations
        # the P update, step by step
        A = cm.dot(self.P, self.a)
        B = cm.dot(A, self.a.T)
        C = cm.dot(B, self.P)
        D = cm.dot(cm.dot(self.a.T, self.P), self.a).add(np.asscalar(self.forget))

        self.P.subtract(C.divide(np.asscalar(D.asarray())))
        self.P.divide(np.asscalar(self.forget))

        # final update
        # error calculation
        Theta.subtract(cm.dot(self.P, self.a).mult(np.asscalar(e[saida])))
        Theta = Theta.reshape([1, self.neu])
        self.Wro.copy_to_host()
        self.Wro.numpy_array[saida, :] = Theta.asarray()
        self.Wro.copy_to_device()
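# The covariance update performed above (recursive least squares with a
# forgetting factor), restated with plain NumPy -- a sketch on illustrative
# shapes; P is the inverse correlation matrix, a the state column vector and
# lam the forgetting factor.
import numpy as np

n = 4
P = np.eye(n)
a = np.random.randn(n, 1)
lam = 0.999
P = (P - (P @ a) @ (a.T @ P) / (lam + float(a.T @ P @ a))) / lam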
def update(self, lr):
    if self.use_momentum:
        self.weights_update.mult(self.momentum)
        self.weights_update.subtract_mult(self.weights_grad, lr)
        self.weights.add(self.weights_update)
        if self.use_bias:
            self.biases_update.mult(self.momentum)
            self.biases_update.subtract_mult(self.biases_grad, lr)
            self.biases.add(self.biases_update)
    elif self.use_rmsprop:
        # cache = dr * cache + (1 - dr) * grad**2
        self.weights_rmsprop_cache.mult(self.rmsprop_dr)
        cm.pow(self.weights_grad, 2, self.weights_grad_square)
        self.weights_grad_square.mult(1.0 - self.rmsprop_dr)
        self.weights_rmsprop_cache.add(self.weights_grad_square)
        self.weights_rmsprop_cache.add(1e-8)
        cm.sqrt(self.weights_rmsprop_cache)
        # weights -= lr * grad / sqrt(cache + eps)
        self.weights_grad.mult(lr).divide(self.weights_rmsprop_cache)
        self.weights.subtract(self.weights_grad)

        self.biases_rmsprop_cache.mult(self.rmsprop_dr)
        cm.pow(self.biases_grad, 2, self.biases_grad_square)
        self.biases_grad_square.mult(1.0 - self.rmsprop_dr)
        self.biases_rmsprop_cache.add(self.biases_grad_square)
        self.biases_rmsprop_cache.add(1e-8)
        cm.sqrt(self.biases_rmsprop_cache)
        self.biases_grad.mult(lr).divide(self.biases_rmsprop_cache)
        self.biases.subtract(self.biases_grad)
    else:
        self.weights.subtract_mult(self.weights_grad, lr)
        if self.use_bias:
            self.biases.subtract_mult(self.biases_grad, lr)

    # Max-norm regularization.
    if self.use_max_norm:
        cm.pow(self.weights, 2, self.weights_square)
        self.weights_square.sum(0, self.weights_factor)
        cm.sqrt(self.weights_factor, self.weights_factor)
        # Avoid zero weight magnitudes.
        self.weights_factor.add(1e-8)
        self.weights_factor.reciprocal().mult(self.max_norm_c)
        # Keep only factors smaller than 1.0 (columns exceeding the max norm).
        self.weights_factor.less_than(1.0, self.weights_factor_mask)
        self.weights_factor.mult(self.weights_factor_mask)
        # Set the zeroed entries back to 1.0 (no rescaling for those columns).
        self.weights_factor_mask.less_than(1.0)
        self.weights_factor.add(self.weights_factor_mask)
        # Downscale oversized weight columns.
        self.weights.mult_by_row(self.weights_factor)
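# The RMSprop step implemented above, restated with plain NumPy for reference
# -- a sketch with illustrative names (w, grad, cache); the epsilon is added
# inside the square root, matching the GPU code.
import numpy as np

def rmsprop_step(w, grad, cache, lr, dr, eps=1e-8):
    cache = dr * cache + (1.0 - dr) * grad**2
    w = w - lr * grad / np.sqrt(cache + eps)
    return w, cache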
def sinkhorn_lpl1_mm(a, labels_a, b, M_GPU, reg, eta=0.1, numItermax=10,
                     numInnerItermax=200, stopInnerThr=1e-9,
                     verbose=False, log=False):
    """
    Solve the entropic regularization optimal transport problem with nonconvex
    group lasso regularization

    The function solves the following optimization problem:

    .. math::
        \gamma = arg\min_\gamma <\gamma,M>_F + reg\cdot\Omega_e(\gamma) +
        \eta \Omega_g(\gamma)

        s.t. \gamma 1 = a

             \gamma^T 1 = b

             \gamma \geq 0

    where :

    - M is the (ns,nt) metric cost matrix
    - :math:`\Omega_e` is the entropic regularization term
      :math:`\Omega_e(\gamma)=\sum_{i,j} \gamma_{i,j}\log(\gamma_{i,j})`
    - :math:`\Omega_g` is the group lasso regularization term
      :math:`\Omega_g(\gamma)=\sum_{i,c} \|\gamma_{i,\mathcal{I}_c}\|^{1/2}_1`
      where :math:`\mathcal{I}_c` are the indices of samples from class c
      in the source domain.
    - a and b are source and target weights (sum to 1)

    The algorithm used for solving the problem is the generalised conditional
    gradient as proposed in [5]_ [7]_

    Parameters
    ----------
    a : np.ndarray (ns,)
        samples weights in the source domain
    labels_a : np.ndarray (ns,)
        labels of samples in the source domain
    b : np.ndarray (nt,)
        samples weights in the target domain
    M_GPU : cudamat.CUDAMatrix (ns,nt)
        loss matrix
    reg : float
        Regularization term for entropic regularization >0
    eta : float, optional
        Regularization term for group lasso regularization >0
    numItermax : int, optional
        Max number of iterations
    numInnerItermax : int, optional
        Max number of iterations (inner sinkhorn solver)
    stopInnerThr : float, optional
        Stop threshold on error (inner sinkhorn solver) (>0)
    verbose : bool, optional
        Print information along iterations
    log : bool, optional
        record log if True

    Returns
    -------
    gamma : (ns x nt) ndarray
        Optimal transportation matrix for the given parameters
    log : dict
        log dictionary returned only if log==True in parameters

    References
    ----------
    .. [5] N. Courty; R. Flamary; D. Tuia; A. Rakotomamonjy, "Optimal
       Transport for Domain Adaptation," in IEEE Transactions on Pattern
       Analysis and Machine Intelligence, vol.PP, no.99, pp.1-1
    .. [7] Rakotomamonjy, A., Flamary, R., & Courty, N. (2015). Generalized
       conditional gradient: analysis of convergence and applications.
       arXiv preprint arXiv:1510.06567.

    See Also
    --------
    ot.lp.emd : Unregularized OT
    ot.bregman.sinkhorn : Entropic regularized OT
    ot.optim.cg : General regularized OT
    """
    p = 0.5
    epsilon = 1e-3
    Nfin = len(b)

    indices_labels = []
    classes = np.unique(labels_a)
    for c in classes:
        idxc, = np.where(labels_a == c)
        indices_labels.append(cudamat.CUDAMatrix(idxc.reshape(1, -1)))

    Mreg_GPU = cudamat.empty(M_GPU.shape)
    W_GPU = cudamat.empty(M_GPU.shape).assign(0)

    for cpt in range(numItermax):
        Mreg_GPU.assign(M_GPU)
        Mreg_GPU.add_mult(W_GPU, eta)
        transp_GPU = sinkhorn(a, b, Mreg_GPU, reg,
                              numItermax=numInnerItermax,
                              stopThr=stopInnerThr, returnAsGPU=True)
        # the transport has been computed. Check if classes are really
        # separated
        W_GPU.assign(1)
        W_GPU = W_GPU.transpose()
        for (i, c) in enumerate(classes):
            (_, nbRow) = indices_labels[i].shape
            tmpC_GPU = cudamat.empty((Nfin, nbRow)).assign(0)
            transp_GPU.transpose().select_columns(indices_labels[i], tmpC_GPU)
            majs_GPU = tmpC_GPU.sum(axis=1).add(epsilon)
            cudamat.pow(majs_GPU, (p - 1))
            majs_GPU.mult(p)

            tmpC_GPU.assign(0)
            tmpC_GPU.add_col_vec(majs_GPU)
            W_GPU.set_selected_columns(indices_labels[i], tmpC_GPU)
        W_GPU = W_GPU.transpose()

    return transp_GPU.asarray()
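# The per-class reweighting done in the inner loop above, restated with plain
# NumPy for a single class -- a sketch; transp, idx_c and W are illustrative
# stand-ins for transp_GPU, indices_labels[i] and W_GPU.
import numpy as np

ns, nt, p, epsilon = 6, 5, 0.5, 1e-3
transp = np.random.rand(ns, nt)
W = np.ones((ns, nt))
idx_c = np.array([0, 2, 3])   # source samples belonging to one class
majs = p * (transp[idx_c, :].sum(axis=0) + epsilon) ** (p - 1)
W[idx_c, :] = majs            # broadcast the per-target weights over the class rows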
def mult_with_derivative(self, target, activated_z):
    # In-place 1 - activated_z**2 (the tanh derivative when activated_z holds
    # tanh outputs), then multiply it into target.
    cm.pow(activated_z, 2, activated_z)
    activated_z.mult(-1).add(1)
    target.mult(activated_z)
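# Quick numerical check of the identity used above, tanh'(z) = 1 - tanh(z)**2
# (plain NumPy, independent of the class -- a sketch).
import numpy as np

z = np.linspace(-2.0, 2.0, 5)
a = np.tanh(z)
assert np.allclose(1 - a**2, 1.0 / np.cosh(z)**2)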
def costAndGrad(self, data, labels=None, sentence=None):
    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        stack = self.stack[:-2]
        wtf, _ = self.stack[-2]
        wtb, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-2]
            dwtf, _ = self.grad[-2]
            dwtb, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    # TODO copy to device here
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            self.hActsFor.assign(self.hActs[i])
            self.hActsBack.assign(self.hActs[i])
            self.hActsFor.minmax(0.0, self.maxAct, col=0)
            self.hActsBack.minmax(0.0, self.maxAct, col=T - 1)
            for t in xrange(1, T):
                cm.mvdot_col_slice(wtf, self.hActsFor, t - 1,
                                   self.hActsFor, t, beta=1.0)
                self.hActsFor.minmax(0.0, self.maxAct, col=t)
                cm.mvdot_col_slice(wtb, self.hActsBack, T - t,
                                   self.hActsBack, T - t - 1, beta=1.0)
                self.hActsBack.minmax(0.0, self.maxAct, col=T - t - 1)
            self.hActsFor.add(self.hActsBack, target=self.hActs[i])

        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        probs = self.probs.numpy_array
        return probs

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)

    if self.reg > 0:
        self.regcost = 0.0
        for w, b in self.stack:
            rc = (self.reg / 2.0) * (w.euclid_norm()**2)
            self.regcost += rc
            cost = cost + rc

    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        # gradient for w
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        if self.reg > 0:
            grad[i][0].add_mult(w, alpha=self.reg)
        # gradient for b
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            self.hActsFor.within(0.0, self.maxAct, target=self.tmpGradFor)
            self.hActsBack.within(0.0, self.maxAct, target=self.tmpGradBack)
            self.deltasFor.assign(deltasOut)
            self.deltasBack.assign(deltasOut)
            self.deltasFor.mult_slice(T - 1, self.tmpGradFor, T - 1)
            self.deltasBack.mult_slice(0, self.tmpGradBack, 0)
            for t in xrange(1, T):
                # Add in temporal delta
                cm.mvdot_col_slice(wtf.T, self.deltasFor, T - t,
                                   self.deltasFor, T - t - 1, beta=1.0)
                cm.mvdot_col_slice(wtb.T, self.deltasBack, t - 1,
                                   self.deltasBack, t, beta=1.0)
                # Push through activation fn
                self.deltasFor.mult_slice(T - t - 1, self.tmpGradFor, T - t - 1)
                self.deltasBack.mult_slice(t, self.tmpGradBack, t)

            # Accumulate temporal gradient
            cm.dot(self.deltasFor.get_col_slice(1, T),
                   self.hActsFor.get_col_slice(0, T - 1).T, target=dwtf)
            cm.dot(self.deltasBack.get_col_slice(0, T - 1),
                   self.hActsBack.get_col_slice(1, T).T, target=dwtb)

            # Accumulate next layer deltas
            self.deltasFor.add(self.deltasBack, target=deltasOut)

        if i > 0 and i != self.temporalLayer:
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            deltasIn = self.deltasIn

        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    if self.reg > 0:
        if self.temporalLayer > 0:
            dwtf.add_mult(wtf, alpha=self.reg)
            dwtb.add_mult(wtb, alpha=self.reg)

    return cost, self.grad, skip
def costAndGrad(self, data, labels=None):
    T = data.shape[1]
    self.setViews(T)

    if self.temporalLayer > 0:
        stack = self.stack[:-1]
        wt, _ = self.stack[-1]
        if self.train:
            grad = self.grad[:-1]
            dwt, _ = self.grad[-1]
    else:
        stack = self.stack
        if self.train:
            grad = self.grad

    # forward prop
    self.hActs[0].assign(cm.CUDAMatrix(data))

    i = 1
    for w, b in stack:
        cm.dot(w, self.hActs[i - 1], self.hActs[i])
        self.hActs[i].add_col_vec(b)
        # forward prop through time
        if i == self.temporalLayer:
            for t in xrange(1, T):
                self.hActs[i].minmax(0.0, self.maxAct, col=t - 1)
                cm.mvdot_col_slice(wt, self.hActs[i], t - 1,
                                   self.hActs[i], t, beta=1.0)
            self.hActs[i].minmax(0.0, self.maxAct, col=T - 1)
        if i <= self.numLayers and i != self.temporalLayer:
            # hard relu
            self.hActs[i].maximum(0.0)
        i += 1

    # Subtract max activation
    self.hActs[-1].max(axis=0, target=self.rowVec)
    self.hActs[-1].add_row_mult(self.rowVec, -1.0, target=self.probs)

    # Softmax
    cm.exp(self.probs)
    self.probs.sum(axis=0, target=self.rowVec)
    cm.pow(self.rowVec, -1.0, target=self.rowVec)
    self.probs.mult_by_row(self.rowVec)

    self.probs.copy_to_host()
    if not self.train:
        return ctc.decode_best_path(self.probs.numpy_array.astype(np.float64))

    cost, deltas, skip = ctc.ctc_loss(self.probs.numpy_array.astype(np.float64),
                                      labels, blank=0)
    if skip:
        return cost, self.grad, skip

    self.deltasC.assign(cm.CUDAMatrix(deltas))

    # back prop
    i = self.numLayers
    deltasIn, deltasOut = self.deltasC, self.deltasOut
    for w, b in reversed(stack):
        # compute gradient
        cm.dot(deltasIn, self.hActs[i].T, target=grad[i][0])
        deltasIn.sum(axis=1, target=grad[i][1])

        # compute next layer deltas
        if i > 0:
            cm.dot(w.T, deltasIn, target=deltasOut)

        # backprop through time
        if i == self.temporalLayer:
            self.hActs[i].within(0.0, self.maxAct, target=self.tmpGrad)
            self.deltaTemp.assign(0.0)
            for t in xrange(T - 1, 0, -1):
                # Add in temporal delta
                cm.mvdot_col_slice(wt.T, self.deltaTemp, t, deltasOut, t, beta=1.0)
                # Push through activation fn
                deltasOut.mult_slice(t, self.tmpGrad, t)
                self.deltaTemp.set_single_col(t - 1, deltasOut, t)
            # Accumulate temporal gradient
            cm.dot(self.deltaTemp, self.hActs[i].T, target=dwt)

            cm.mvdot_col_slice(wt.T, self.deltaTemp, 0, deltasOut, 0, beta=1.0)
            deltasOut.mult_slice(0, self.tmpGrad, 0)

        if i > 0 and i != self.temporalLayer:
            self.hActs[i].sign(target=self.tmpGrad)
            deltasOut.mult(self.tmpGrad)

        if i == self.numLayers:
            deltasIn = self.deltasIn

        deltasIn, deltasOut = deltasOut, deltasIn
        i -= 1

    return cost, self.grad, skip
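# The clipped-ReLU recurrence of the temporal layer above, as a NumPy sketch
# for a single activation matrix H of shape (hidden, T) and recurrent weights
# Wt -- illustrative names; clipping to [0, maxAct] mirrors minmax above.
import numpy as np

hidden, T, maxAct = 5, 7, 20.0
H = np.random.randn(hidden, T)
Wt = 0.1 * np.random.randn(hidden, hidden)
H[:, 0] = np.clip(H[:, 0], 0.0, maxAct)
for t in range(1, T):
    H[:, t] = np.clip(H[:, t] + Wt @ H[:, t - 1], 0.0, maxAct)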
def matrix_factorization_clustering(X_aux, k, l, norm=False, num_iters=100):
    cm.cublas_init()
    m, n = X_aux.shape
    U = cm.CUDAMatrix(np.random.rand(m, k))
    S = cm.CUDAMatrix(np.random.rand(k, l))
    V = cm.CUDAMatrix(np.random.rand(n, l))
    X = cm.CUDAMatrix(X_aux)

    # if norm:
    #     X = Normalizer().fit_transform(X)

    XV = cm.CUDAMatrix(np.random.rand(m, l))
    XVSt = cm.CUDAMatrix(np.random.rand(m, k))
    US = cm.CUDAMatrix(np.random.rand(m, l))
    USVt = cm.CUDAMatrix(np.random.rand(m, n))
    USVtXt = cm.CUDAMatrix(np.random.rand(m, m))
    USVtXtU = cm.CUDAMatrix(np.random.rand(m, k))
    U_aux = cm.CUDAMatrix(np.random.rand(m, k))

    XtUS = cm.CUDAMatrix(np.random.rand(m, l))
    VSt = cm.CUDAMatrix(np.random.rand(n, k))
    VStUt = cm.CUDAMatrix(np.random.rand(n, m))
    UtX = cm.CUDAMatrix(np.random.rand(k, n))
    VStUtXV = cm.CUDAMatrix(np.random.rand(n, l))
    V_aux = cm.CUDAMatrix(np.random.rand(n, l))

    UtXV = cm.CUDAMatrix(np.random.rand(k, l))
    UtUS = cm.CUDAMatrix(np.random.rand(k, l))
    UtUSVt = cm.CUDAMatrix(np.random.rand(k, n))
    UtUSVtV = cm.CUDAMatrix(np.random.rand(k, l))
    S_aux = cm.CUDAMatrix(np.random.rand(k, l))

    error_best = np.inf
    error = np.inf

    for i in range(num_iters):
        # compute U
        cm.dot(X, V, target=XV)
        cm.dot(XV, S.T, target=XVSt)
        if i == 0:
            cm.dot(U, S, target=US)
        cm.dot(US, V.T, target=USVt)
        cm.dot(USVt, X.T, target=USVtXt)
        cm.dot(USVtXt, U, target=USVtXtU)
        cm.divide(XVSt, USVtXtU, U_aux)
        cm.mult(U, U_aux, U)

        # compute V
        cm.dot(U, S, target=US)
        cm.dot(X.T, US, target=XtUS)
        cm.dot(V, S.T, target=VSt)
        cm.dot(VSt, U.T, target=VStUt)
        cm.dot(VStUt, XV, target=VStUtXV)
        cm.divide(XtUS, VStUtXV, target=V_aux)
        cm.mult(V, V_aux, V)

        # compute S
        cm.dot(U.T, X, target=UtX)
        cm.dot(UtX, V, target=UtXV)
        cm.dot(U.T, US, target=UtUS)
        cm.dot(UtUS, V.T, UtUSVt)
        cm.dot(UtUSVt, V, target=UtUSVtV)
        cm.divide(UtXV, UtUSVtV, target=S_aux)
        cm.mult(S, S_aux, target=S)

        error_ant = error

        cm.dot(U, S, target=US)
        cm.dot(US, V.T, target=USVt)
        error = cm.sum(cm.pow(cm.subtract(X, USVt), 2), axis=0)

        if error < error_best:
            U_best_cm = U
            S_best_cm = S
            V_best_cm = V
            error_best = error

        if np.abs(error - error_ant) <= 0.000001:
            break

    U_best = U_best_cm.asarray()
    S_best = S_best_cm.asarray()
    V_best = V_best_cm.asarray()

    Du = np.diag(np.ones(m).dot(U_best))
    Dv = np.diag(np.ones(n).dot(V_best))

    U_norm = U_best.dot(np.diag(S_best.dot(Dv).dot(np.ones(l))))
    V_norm = V_best.dot(np.diag(np.ones(k).dot(Du).dot(S_best)))

    rows_ind = np.argmax(U_best, axis=1)
    cols_ind = np.argmax(V_best, axis=1)

    cm.shutdown()

    return U_norm, S_best, V_norm, rows_ind, cols_ind, error_best
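# The multiplicative update for U used in the loop above, restated with NumPy
# for reference -- a sketch of a single step on small random matrices.
import numpy as np

m, n, k, l = 8, 6, 3, 2
X = np.abs(np.random.rand(m, n))
U, S, V = np.random.rand(m, k), np.random.rand(k, l), np.random.rand(n, l)
U *= (X @ V @ S.T) / (U @ S @ V.T @ X.T @ U)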