def calculate_H_gpu(X, W, P):
    WPW = la.add_diag(P, la.dot(W, W, "t", "n"))
    tmp = la.dot(W, la.inv(WPW, overwrite=True))
    H = la.dot(X, tmp, "n", "t")
    H = gpu.maximum(H, 0)
    H = to_unit_variance(H)
    return H, tmp
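# Hedged CPU reference for calculate_H_gpu above (a sketch, not part of the
# original code): `la` and `gpu` appear to be project-local wrappers around
# scikits.cuda.linalg and pycuda.gpuarray. With X of shape (n, m), W of shape
# (k, m) and P of length m, the GPU version computes
# H = max(X (W (W^T W + diag(P))^{-1})^T, 0) followed by a unit-variance
# rescaling, which this NumPy version omits.
import numpy as np

def calculate_H_cpu(X, W, P):
    WPW = W.T.dot(W) + np.diag(P)        # (m, m)
    tmp = W.dot(np.linalg.inv(WPW))      # (k, m)
    H = np.maximum(X.dot(tmp.T), 0)      # (n, k)
    return H, tmp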
def backward(self, top, propagate_down, bottom):
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        import scikits.cuda.linalg as linalg
        top_diff = top[0].diff_as_pycuda_gpuarray()
        ts = [self.t1_, self.t2_]
        for i in xrange(len(bottom)):
            if not propagate_down[i]:
                continue
            diff = bottom[i].diff_as_pycuda_gpuarray()
            data = bottom[(i + 1) % 2].data_as_pycuda_gpuarray()
            # The three conditions below are subtle: they pick the operand
            # order and transpositions needed for each input's gradient.
            swap = ts[i] ^ bool(i)
            t1 = ts[i]
            t2 = (not t1) ^ ts[(i + 1) % 2]
            for b in xrange(bottom[0].shape[0]):
                x = top_diff[b]
                y = data[b]
                t1_, t2_ = t1, t2
                if swap:
                    x, y = y, x
                    t1_, t2_ = t2_, t1_
                linalg.dot(x, y, transa=blas_trans(t1_), transb=blas_trans(t2_),
                           handle=h, out=diff[b])
def multinomial_log_likelihood(softmax_vals, Y, one_n_trans, one_c):
    # add a small amount to protect against log(0)
    small_val = 1e-9
    prod = Y * cumath.log(softmax_vals + small_val)
    prod = linalg.dot(one_n_trans, prod)
    prod = linalg.dot(prod, one_c)
    return prod.get()
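# Hedged CPU equivalent of multinomial_log_likelihood above (a sketch, not
# from the original code): the two GEMMs with the ones vectors one_n_trans
# (shape (1, n)) and one_c (shape (c, 1)) simply sum every entry of
# Y * log(softmax + eps), i.e. the total multinomial log-likelihood.
import numpy as np

def multinomial_log_likelihood_cpu(softmax_vals, Y, small_val=1e-9):
    return np.sum(Y * np.log(softmax_vals + small_val))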
def backprop(self, input_data, df_output, cache=None):
    """ Backpropagate through the hidden layer

    **Parameters:**

    input_data : ``GPUArray``
        Input data to compute activations for.

    df_output : ``GPUArray``
        Gradients with respect to the activations of this layer
        (received from the layer above).

    cache : list of ``GPUArray``
        Cache obtained from forward pass. If the cache is provided, then
        the activations are not recalculated.

    **Returns:**

    gradients : tuple of ``GPUArray``
        Gradients with respect to the weights and biases in the form
        ``(df_weights, df_biases)``.

    df_input : ``GPUArray``
        Gradients with respect to the input.
    """

    # Get cache if it wasn't provided
    if cache is None:
        cache = self.feed_forward(input_data, prediction=False)

    if len(cache) == 2:
        activations, dropout_mask = cache
    else:
        activations = cache[0]
        dropout_mask = None

    # Multiply the binary mask with the incoming gradients
    if self.dropout and dropout_mask is not None:
        apply_dropout_mask(df_output, dropout_mask)

    # Get gradient wrt activation function
    df_activations = self.df(activations)
    delta = df_activations * df_output

    # Gradient wrt weights
    df_W = linalg.dot(input_data, delta, transa='T')
    # Gradient wrt bias
    df_b = matrix_sum_out_axis(delta, 0)
    # Gradient wrt inputs
    df_input = linalg.dot(delta, self.W, transb='T')

    # L1 weight decay
    if self.l1_penalty_weight:
        df_W -= self.l1_penalty_weight * sign(self.W)

    # L2 weight decay
    if self.l2_penalty_weight:
        df_W -= self.l2_penalty_weight * self.W

    return (df_W, df_b), df_input
def updateGradient(self, bp_signal, inputs, print_timing=False,
                   include_prior=True):
    if print_timing:
        print ''
        t0 = t.time()
        t_run = t.time()
    if self.magic_numbers:
        back_prop = bp_signal * 0.6667 / 1.7159 * \
            (1.7159 - (self.outputs) * (1.7159 + self.outputs))
    else:
        back_prop = bp_signal * (1.0 - (self.outputs * self.outputs))
    if print_timing:
        t_bp = t.time() - t_run
        t_run = t.time()
    self.gW = linalg.dot(inputs, back_prop, transa='T')
    if print_timing:
        t_dot = t.time() - t_run
        t_run = t.time()
    #self.gW = linalg.dot(linalg.transpose(inputs), back_prop)
    ones = gpuarray.to_gpu(np.ones((1, self.N)).astype(self.precision))
    if print_timing:
        t_ones = t.time() - t_run
        t_run = t.time()
    self.gB = linalg.dot(ones, back_prop)
    if print_timing:
        t_biases = t.time() - t_run
        t_run = t.time()
    if include_prior:
        self.prior.updateWeightGradient(self.weights, self.gW)
        if print_timing:
            t_weights = t.time() - t_run
            t_run = t.time()
        self.prior.updateBiasGradient(self.biases, self.gB)
    if print_timing:
        t_prior = t.time() - t_run
        print 'Total time for gradient update in hidden layer ' + \
            str(self.ID) + ' ' + str(t.time() - t0)
        print 'Time to calculate backprop in hidden layer ' + \
            str(self.ID) + ' ' + str(t_bp)
        print 'Time to calculate gradient for weights in hidden layer ' + \
            str(self.ID) + ' ' + str(t_dot)
        print 'Time to allocate ones vector in hidden layer ' + \
            str(self.ID) + ' ' + str(t_ones)
        print 'Time to compute biases gradient in hidden layer ' + \
            str(self.ID) + ' ' + str(t_biases)
        print 'Time for prior update in hidden layer ' + \
            str(self.ID) + ' ' + str(t_prior)
    if self.ID > 0:
        return linalg.dot(back_prop, self.weights, transb='T')
        #return linalg.dot(back_prop, linalg.transpose(self.weights))
    else:
        return -1
def eps_r(x, A1, A2, out, handle):
    out.fill(0)

    #tmp = garr.empty((A1[0].shape[0], x.shape[1]), dtype=A1[0].dtype)
    #tmp2 = garr.empty((tmp.shape[0], A2[0].shape[0]), dtype=A1[0].dtype)

    for s in range(len(A1)):
        tmp = cla.dot(A1[s], x, handle=handle)
        tmp2 = cla.dot(tmp, A2[s], transb='C', handle=handle)
        out += tmp2

    return out
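# Hedged CPU reference for eps_r above (a sketch): the loop accumulates
# out = sum_s A1[s] @ x @ A2[s]^H, where transb='C' requests the conjugate
# transpose of A2[s].
import numpy as np

def eps_r_cpu(x, A1, A2):
    return sum(A1[s].dot(x).dot(A2[s].conj().T) for s in range(len(A1)))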
def forward(self, bottom, top):
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        import scikits.cuda.linalg as linalg
        mat1 = bottom[0].data_as_pycuda_gpuarray()
        mat2 = bottom[1].data_as_pycuda_gpuarray()
        mato = top[0].data_as_pycuda_gpuarray()
        for b in xrange(bottom[0].shape[0]):
            linalg.dot(mat1[b], mat2[b],
                       transa=blas_trans(self.t1_),
                       transb=blas_trans(self.t2_),
                       handle=h, out=mato[b])
def test_dot_vector_complex128(self):
    a = np.asarray(np.random.rand(5), np.complex128)
    b = np.asarray(np.random.rand(5), np.complex128)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c = linalg.dot(a_gpu, b_gpu)
    assert np.allclose(np.dot(a, b), c)

    a = a.astype(np.complex128, order="F", copy=True)
    b = b.astype(np.complex128, order="F", copy=True)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c = linalg.dot(a_gpu, b_gpu)
    assert np.allclose(np.dot(a, b), c)
def decompose(self):
    gcov = cla.dot(self._Y_gpu, self._Y_gpu, transa='C')
    ge_g, gh_g = np.linalg.eigh(gcov.get())
    I = np.argsort(ge_g)[::-1]
    ge_g, gh_g = np.sqrt(ge_g[I]), gh_g[:, I]

    # push the matrix back out
    gpueigs = gpuarray.to_gpu(gh_g)
    W_g = cla.dot(self._Y_gpu, gpueigs)

    # Unitize W_g - could be done on the GPU to allow async returning
    W_g = W_g.get()
    W_g = W_g / np.sqrt(np.sum(W_g**2, axis=0))[np.newaxis, :]

    return W_g, ge_g, gh_g.T  # Not sure whether the last one should be transposed
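# Hedged CPU reference for decompose above (a sketch): eigendecomposing
# Y^H Y and mapping the sorted eigenvectors back through Y recovers
# column-normalized left singular vectors of Y, with the square roots of the
# eigenvalues as singular values (np.abs added here for complex safety).
import numpy as np

def decompose_cpu(Y):
    e, V = np.linalg.eigh(Y.conj().T.dot(Y))
    I = np.argsort(e)[::-1]
    s, V = np.sqrt(e[I]), V[:, I]
    W = Y.dot(V)
    W = W / np.sqrt(np.sum(np.abs(W)**2, axis=0))[np.newaxis, :]
    return W, s, V.T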
def test_dot_matrix_h_complex128(self):
    a = np.asarray(np.random.rand(2, 4) + 1j * np.random.rand(2, 4),
                   np.complex128)
    b = np.asarray(np.random.rand(2, 2) + 1j * np.random.rand(2, 2),
                   np.complex128)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
    assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())

    a = a.astype(np.complex128, order="F", copy=True)
    b = b.astype(np.complex128, order="F", copy=True)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
    assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
def backprop(self, input_data, targets, cache=None):
    """ Backpropagate through the logistic layer.

    **Parameters:**

    input_data : ``GPUArray``
        Input data to compute activations for.

    targets : ``GPUArray``
        The target values of the units.

    cache : list of ``GPUArray``
        Cache obtained from forward pass. If the cache is provided, then
        the activations are not recalculated.

    **Returns:**

    gradients : tuple of ``GPUArray``
        Gradients with respect to the weights and biases in the form
        ``(df_weights, df_biases)``.

    df_input : ``GPUArray``
        Gradients with respect to the input.
    """

    if cache is not None:
        activations = cache
    else:
        activations = self.feed_forward(input_data, prediction=False)

    delta = activations - targets
    nan_to_zeros(delta, delta)

    # Gradient wrt weights
    df_W = linalg.dot(input_data, delta, transa='T')
    # Gradient wrt bias
    df_b = matrix_sum_out_axis(delta, 0)
    # Gradient wrt input
    df_input = linalg.dot(delta, self.W, transb='T')

    # L1 penalty
    if self.l1_penalty_weight:
        df_W -= self.l1_penalty_weight * sign(self.W)

    # L2 penalty
    if self.l2_penalty_weight:
        df_W -= self.l2_penalty_weight * self.W

    return (df_W, df_b), df_input
def backprop(self, input_data, df_output, cache=None):
    """ Backpropagate through the hidden layer

    Inputs:
    input_data
    df_output: the gradient wrt the output units
    cache (optional): cache object from the forward pass

    Output:
    df_W: gradient wrt the weights
    df_b: gradient wrt the bias
    df_input: gradient wrt the input
    """

    # Get cache if it wasn't provided
    if cache is None:
        cache = self.feed_forward(input_data, prediction=False)

    if len(cache) == 2:
        activations, dropout_mask = cache
    else:
        activations = cache[0]
        dropout_mask = None

    # Multiply the binary mask with the incoming gradients
    if self.dropout and dropout_mask is not None:
        apply_dropout_mask(df_output, dropout_mask)

    # Get gradient wrt activation function
    df_activations = self.df(activations)
    delta = df_activations * df_output

    # Gradient wrt weights
    df_W = linalg.dot(input_data, delta, transa='T')
    # Gradient wrt bias
    df_b = matrix_sum_out_axis(delta, 0)
    # Gradient wrt inputs
    df_input = linalg.dot(delta, self.W, transb='T')

    # L1 weight decay
    if self.l1_penalty_weight:
        df_W -= self.l1_penalty_weight * sign(self.W)

    # L2 weight decay
    if self.l2_penalty_weight:
        df_W -= self.l2_penalty_weight * self.W

    return (df_W, df_b), df_input
def test_dot_vector_float64(self):
    a = np.asarray(np.random.rand(5), np.float64)
    b = np.asarray(np.random.rand(5), np.float64)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c = linalg.dot(a_gpu, b_gpu)
    assert np.allclose(np.dot(a, b), c)
def test_dot_vector_complex128(self):
    a = np.asarray(np.random.rand(5), np.complex128)
    b = np.asarray(np.random.rand(5), np.complex128)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c = linalg.dot(a_gpu, b_gpu)
    assert np.allclose(np.dot(a, b), c)
def test_dot_matrix_float32(self):
    a = np.asarray(np.random.rand(4, 2), np.float32)
    b = np.asarray(np.random.rand(2, 2), np.float32)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu)
    assert np.allclose(np.dot(a, b), c_gpu.get())
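# Hedged sketch of the module-level setup the test methods above assume
# (this boilerplate is not part of the original snippets): a CUDA context
# plus scikits.cuda's linalg module initialized before any linalg.dot call.
import numpy as np
import pycuda.autoinit            # creates a CUDA context on import
import pycuda.gpuarray as gpuarray
import scikits.cuda.linalg as linalg

linalg.init()                     # must run before linalg.dot is used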
def thunk():
    x = inputs[0]
    y = inputs[1]

    # chop off the real/imag dimension
    input_shape_x = x[0].shape  # (a, b, 2)
    input_shape_y = y[0].shape  # (b, c, 2)

    output_shape = (input_shape_x[0], input_shape_y[1], 2)  # (a, c, 2)

    input_x_pycuda = to_complex_gpuarray(x[0])
    input_y_pycuda = to_complex_gpuarray(y[0])

    # multistream experiment
    # print "DEBUG: Setting stream to %d" % current_stream[0]

    # prev_stream_obj = stream_pool[(current_stream[0] - 1) % num_streams]
    # print "PREV STREAM IS DONE?"
    # print prev_stream_obj.is_done()
    # print

    stream_obj = stream_pool[current_stream[0]]
    cublas.cublasSetStream(handle[0], stream_obj.handle)
    current_stream[0] += 1
    current_stream[0] %= num_streams
    # print "DEBUG: set next stream id to %d" % current_stream[0]

    output_pycuda = linalg.dot(input_x_pycuda, input_y_pycuda,
                               handle=handle[0])

    outputs[0][0] = to_complex_cudandarray(output_pycuda)
def test_dot_matrix_t_complex128(self):
    a = np.asarray(np.random.rand(2, 4), np.complex128)
    b = np.asarray(np.random.rand(2, 2), np.complex128)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 't')
    assert np.allclose(np.dot(a.T, b), c_gpu.get())
def test_dot_matrix_h_complex128(self):
    a = np.asarray(np.random.rand(2, 4) + 1j * np.random.rand(2, 4),
                   np.complex128)
    b = np.asarray(np.random.rand(2, 2) + 1j * np.random.rand(2, 2),
                   np.complex128)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
    assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
def test_dot_matrix_t_complex64(self):
    a = np.asarray(np.random.rand(2, 4), np.complex64)
    b = np.asarray(np.random.rand(2, 2), np.complex64)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 't')
    assert np.allclose(np.dot(a.T, b), c_gpu.get())
def feed_forward(self, input_data, prediction=False):
    """ Propagate forward through the hidden layer.

    Inputs:
    input_data -- input from the previous layer
    prediction -- (bool) whether predicting or training

    Outputs:
    activations

    If self.dropout == True and prediction == False:

    Outputs:
    activations
    dropout_mask: binary mask of dropped units
    """

    activations = linalg.dot(input_data, self.W)
    activations = add_vec_to_mat(activations, self.b, inplace=True)

    self.f(activations)

    if self.dropout and prediction:
        activations *= .5

    if self.dropout and not prediction:
        dropout_mask = sample_dropout_mask(activations)
        return activations, dropout_mask

    return (activations,)
def feed_forward(self, input_data, prediction=False):
    """Propagate forward through the layer

    **Parameters:**

    input_data : ``GPUArray``
        Input data to compute activations for.

    prediction : bool, optional
        Whether to use prediction model. Only relevant when using
        dropout. If true, then weights are halved if the layer uses
        dropout.

    **Returns:**

    activations : ``GPUArray``
        The activations of the hidden units.
    """

    activations = linalg.dot(input_data, self.W)
    activations = add_vec_to_mat(activations, self.b, inplace=True)

    self.f(activations)

    if self.dropout and prediction:
        activations *= .5

    if self.dropout and not prediction:
        dropout_mask = sample_dropout_mask(activations)
        return activations, dropout_mask

    return (activations,)
def updateGradient(self, Y, inputs, print_timing=False, include_prior=True):
    if print_timing:
        t0 = t.time()
        t_run = t.time()
    diff = Y - self.outputs
    if print_timing:
        t_diff = t.time() - t_run
        t_run = t.time()
    #self.gW = linalg.dot(linalg.transpose(inputs), diff)
    self.gW = linalg.dot(inputs, diff, transa='T')
    if print_timing:
        t_dot = t.time() - t_run
        t_run = t.time()
    ones = gpuarray.to_gpu(np.ones((1, self.N)).astype(self.precision))
    if print_timing:
        t_ones = t.time() - t_run
        t_run = t.time()
    bias_diff = Y - self.outputs
    self.gB = linalg.dot(ones, bias_diff)
    if print_timing:
        t_sum_bias = t.time() - t_run
        t_run = t.time()
    if print_timing:
        t1 = t.time()
        t0_prior = t.time()
    if include_prior:
        self.prior.updateWeightGradient(self.weights, self.gW)
        if print_timing:
            t_weights = t.time() - t_run
            t_run = t.time()
        self.prior.updateBiasGradient(self.biases, self.gB)
    if print_timing:
        t1_prior = t.time()
        print 'Total time for gradient update in softmax layer ' + str(t1 - t0)
        print 'Time for prior update in softmax layer ' + str(t1_prior - t0_prior)
        print 'Time for Y-outputs ' + str(t_diff)
        print 'Time for inputs-diff dot-prod ' + str(t_dot)
        print 'Time to create ones vector ' + str(t_ones)
        print 'Time for diff-ones dot-prod ' + str(t_sum_bias)
    return linalg.dot(diff, self.weights, transb='T')
def cudasolve(self, A, b, tol=1e-4):
    """ Conjugate gradient solver for a dense system of linear equations.
        Ax = b
        Returns: x = A^(-1)b
    """
    N = len(b)
    b = b.reshape((N, 1))
    x = b.copy()
    r = b - culinalg.dot(A, x)
    p = r.copy()
    # rsold = r^T r, the squared residual norm, pulled back to the host
    rsold = culinalg.dot(r, r, transa='T')[0][0].get()

    for i in range(N):
        Ap = culinalg.dot(A, p)
        # pAp = p^T A p, the curvature along the search direction
        pAp = culinalg.dot(p, Ap, transa='T')[0][0].get()
        alpha = rsold / pAp
        x += alpha * p
        r -= alpha * Ap
        rsnew = culinalg.dot(r, r, transa='T')[0][0].get()
        if math.sqrt(rsnew) < tol:
            break
        else:
            p = r + (rsnew / rsold) * p
            rsold = rsnew

    print 'cudasolve> Iterations required on GPU:', i
    return x.reshape(N)
def calc_x_G(Kp1, C, Cm1, rp1, lm2, Am1, A, Ap1, lm1_s, lm1_si, r_s, r_si,
             Vsh, handle=None):
    D = A[0].shape[1]
    Dm1 = A[0].shape[0]
    q = len(A)

    x = garr.zeros((Dm1, q * D - Dm1), dtype=A[0].dtype)
    x_part = garr.empty_like(x)
    x_subpart = garr.empty_like(A[0])

    if not (C is None and Kp1 is None):
        assert (not C is None) and (not Kp1 is None)
        x_part.fill(0)
        for s in range(q):
            x_subpart = eps_r(rp1, C[s], Ap1, x_subpart, handle)  # ~1st line
            x_subpart += cla.dot(A[s], Kp1, handle=handle)  # ~3rd line
            x_part += cla.dot(cla.dot(x_subpart, r_si, handle=handle), Vsh[s],
                              handle=handle)
        x += cla.dot(lm1_s, x_part, handle=handle)

    if not lm2 is None:
        x_part.fill(0)
        for s in range(q):  # ~2nd line
            x_subpart = eps_l(lm2, Am1, Cm1[s], x_subpart, handle)
            x_part += cla.dot(x_subpart, cla.dot(r_s, Vsh[s], handle=handle),
                              handle=handle)
        x += cla.dot(lm1_si, x_part, handle=handle)

    return x
def test_dot_matrix_h_complex64(self):
    a = np.asarray(np.random.rand(2, 4) + 1j * np.random.rand(2, 4),
                   np.complex64)
    b = np.asarray(np.random.rand(2, 2) + 1j * np.random.rand(2, 2),
                   np.complex64)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, 'c')
    assert np.allclose(np.dot(a.conj().T, b), c_gpu.get())
def _dot_matrix_tests(self, dtype, transa, transb):
    a = np.asarray(np.random.rand(4, 2), dtype)
    if transa == 'n':
        b = np.asarray(np.random.rand(2, 2), dtype)
    else:
        b = np.asarray(np.random.rand(4, 4), dtype)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
    aa = a if transa == 'n' else a.T
    bb = b if transb == 'n' else b.T
    assert np.allclose(np.dot(aa, bb), c_gpu.get())

    a = a.astype(dtype, order="F", copy=True)
    b = b.astype(dtype, order="F", copy=True)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    c_gpu = linalg.dot(a_gpu, b_gpu, transa, transb)
    assert np.allclose(np.dot(aa, bb), c_gpu.get())
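# Hedged example of how a helper like _dot_matrix_tests might be driven (the
# test names and parameter combinations here are illustrative, not from the
# original suite): each wrapper pins down one dtype/transpose combination.
def test_dot_matrix_t_t_float32(self):
    self._dot_matrix_tests(np.float32, 't', 't')

def test_dot_matrix_t_t_float64(self):
    self._dot_matrix_tests(np.float64, 't', 't')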
def balance_matrix3_gpu(prob_nm, max_iter, row_priors, col_priors,
                        outlierfrac, r_N=None):
    if not lfd.registration._has_cuda:
        raise NotImplementedError("CUDA not installed")
    n, m = prob_nm.shape
    prob_NM = np.empty((n + 1, m + 1), 'f4')
    prob_NM[:n, :m] = prob_nm
    prob_NM[:n, m] = row_priors
    prob_NM[n, :m] = col_priors
    # this entry can be weighted: a bigger weight means fewer outliers
    prob_NM[n, m] = np.sqrt(np.sum(row_priors) * np.sum(col_priors))
    a_N = np.ones((n + 1), 'f4')
    a_N[n] = m * outlierfrac
    b_M = np.ones((m + 1), 'f4')
    b_M[m] = n * outlierfrac

    if r_N is None:
        r_N = np.ones((n + 1, 1), 'f4')

    prob_NM_gpu = gpuarray.empty((n + 1, m + 1), dtype=np.float32)
    prob_MN_gpu = gpuarray.empty((m + 1, n + 1), dtype=np.float32)
    r_N_gpu = gpuarray.empty((n + 1, 1), dtype=np.float32)
    c_M_gpu = gpuarray.empty((m + 1, 1), dtype=np.float32)
    prob_NM_gpu.set_async(prob_NM)
    prob_MN_gpu.set_async(prob_NM.T.copy())
    r_N_gpu.set_async(r_N)

    for _ in xrange(max_iter):
        culinalg.dot(prob_NM_gpu, r_N_gpu, transa='T', out=c_M_gpu)
        c_M_gpu.set_async(b_M[:, None] / c_M_gpu.get())
        culinalg.dot(prob_MN_gpu, c_M_gpu, transa='T', out=r_N_gpu)
        r_N_gpu.set_async(a_N[:, None] / r_N_gpu.get())

    r_N = r_N_gpu.get()
    c_M = c_M_gpu.get()
    prob_NM *= r_N
    prob_NM *= c_M.T

    return prob_NM[:n, :m].astype(np.float64), r_N, c_M
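# Hedged CPU reference for the Sinkhorn-style loop above (a sketch, without
# the GPU round-trips): the same fixed-point iteration on the row and column
# scaling vectors r_N and c_M.
import numpy as np

def sinkhorn_cpu(prob_NM, a_N, b_M, max_iter, r_N):
    c_M = None
    for _ in range(max_iter):
        c_M = b_M[:, None] / prob_NM.T.dot(r_N)
        r_N = a_N[:, None] / prob_NM.dot(c_M)
    return r_N, c_M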
def solve(self, wt_n, y_nd, bend_coef, f_res):
    if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
        raise RuntimeError("The dimensions of y_nd don't match the dimensions of x_nd")
    if not y_nd.flags.c_contiguous:
        raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
    self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
    geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
    gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu,
         transa='T', alpha=1, beta=1)

    drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                          self.rhs_gpu.nbytes)
    self.y_dnW_gpu.set_async(y_nd.T * wt_n)  # use the transpose so that it is f_contiguous
    gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu,
         transa='T', transb='T', alpha=1, beta=1)

    if lfd.registration._has_cula:
        culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
        z = self.rhs_gpu.get()
        culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
        theta = self.theta_gpu.get()
    else:
        # if CULA is not installed, perform the last two computations on the CPU
        z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
        theta = self.N.dot(z)

    f_res.update(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n, theta,
                 N=self.N, z=z)
def get_solver_mats(self, x_nd, rot_coef):
    n, d = x_nd.shape
    K_nn = tps.tps_kernel_matrix(x_nd)
    A = np.r_[np.zeros((d + 1, d + 1)), np.c_[np.ones((n, 1)), x_nd]].T
    n_cnts = A.shape[0]
    _u, _s, _vh = np.linalg.svd(A.T)
    N = _u[:, n_cnts:].copy()
    NR = (N[1:1 + d, :].T * rot_coef).copy()  # copy so that it is c-contiguous

    N_gpu = gpuarray.to_gpu(N[1 + d:, :])
    K_gpu = gpuarray.to_gpu(K_nn)
    KN_gpu = culinalg.dot(K_gpu, N_gpu)
    QN = np.c_[np.ones((n, 1)), x_nd].dot(N[:1 + d, :]) + KN_gpu.get()

    NKN_gpu = culinalg.dot(N_gpu, KN_gpu, transa='T')
    NKN = NKN_gpu.get()

    NRN = NR.dot(N[1:1 + d, :])
    return N, QN, NKN, NRN, NR, K_nn
def x_dot_YT(self, x, Y):
    x_size = x.shape[0]
    byte = np.float32(0).nbytes
    # Temporarily view the 1-D x as a (1, x_size) row vector so that cuBLAS
    # treats it as a matrix, then restore the original shape and strides.
    x.strides = (x_size * byte, byte)
    x.shape = (1, x_size)
    result = culinalg.dot(x, Y, transa='N', transb='T')
    x.strides = (byte,)
    x.shape = (x_size,)
    return result
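# Hedged CPU equivalent of x_dot_YT above (a sketch): viewing the 1-D vector
# x as a (1, n) row and multiplying by Y^T yields the row vector x @ Y.T.
import numpy as np

def x_dot_YT_cpu(x, Y):
    return x[None, :].dot(Y.T)    # shape (1, Y.shape[0])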
def dot(d_a, d_b, transa='N', transb='N', out=None):
    if out is None:
        if transa == 'T':
            out_x = d_a.shape[1]
        else:
            out_x = d_a.shape[0]
        if transb == 'T':
            out_y = d_b.shape[0]
        else:
            out_y = d_b.shape[1]
        out = gpuarray.empty((out_x, out_y), numpy.float32)
    return linalg.dot(d_a, d_b, transa=transa, transb=transb,
                      handle=handle, out=out)
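# Hedged usage sketch for the dot wrapper above (assumes the module-level
# cuBLAS `handle` used by the wrapper was created elsewhere, e.g. via
# scikits.cuda.misc.init(); the arrays here are illustrative):
import numpy
import pycuda.gpuarray as gpuarray

a_gpu = gpuarray.to_gpu(numpy.random.rand(4, 2).astype(numpy.float32))
b_gpu = gpuarray.to_gpu(numpy.random.rand(4, 3).astype(numpy.float32))
c_gpu = dot(a_gpu, b_gpu, transa='T')      # allocates the (2, 3) output
dot(a_gpu, b_gpu, transa='T', out=c_gpu)   # later calls can reuse the buffer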
def solve(self, wt_n, y_nd, bend_coef, f_res):
    if y_nd.shape[0] != self.n or y_nd.shape[1] != self.d:
        raise RuntimeError("The dimensions of y_nd don't match the dimensions of x_nd")
    if not y_nd.flags.c_contiguous:
        raise RuntimeError("Expected y_nd to be c-contiguous but it isn't")
    self.sqrtWQN_gpu.set_async(np.sqrt(wt_n)[:, None] * self.QN)
    geam(self.NKN_gpu, self.NRN_gpu, self.lhs_gpu, alpha=bend_coef, beta=1)
    gemm(self.sqrtWQN_gpu, self.sqrtWQN_gpu, self.lhs_gpu,
         transa='T', alpha=1, beta=1)

    drv.memcpy_dtod_async(self.rhs_gpu.gpudata, self.NR_gpu.gpudata,
                          self.rhs_gpu.nbytes)
    self.y_dnW_gpu.set_async(y_nd.T * wt_n)  # use the transpose so that it is f_contiguous
    gemm(self.QN_gpu, self.y_dnW_gpu, self.rhs_gpu,
         transa='T', transb='T', alpha=1, beta=1)

    if lfd.registration._has_cula:
        culinalg.cho_solve(self.lhs_gpu, self.rhs_gpu)
        culinalg.dot(self.N_gpu, self.rhs_gpu, out=self.theta_gpu)
        theta = self.theta_gpu.get()
    else:
        # if CULA is not installed, perform the last two computations on the CPU
        z = np.linalg.solve(self.lhs_gpu.get(), self.rhs_gpu.get())
        theta = self.N.dot(z)

    f_res.set_ThinPlateSpline(self.x_nd, y_nd, bend_coef, self.rot_coef, wt_n,
                              theta=theta)
def forward(self, bottom, top):
    """ """
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        batch_size = bottom[0].shape[0]
        dim = bottom[0].count / bottom[0].shape[0]
        pred = bottom[0].data_as_pycuda_gpuarray()
        label = bottom[1].data_as_pycuda_gpuarray()
        mask = bottom[2].data_as_pycuda_gpuarray()
        # Use bottom[0,1].diff as temporary buffers
        diff = bottom[0].diff_as_pycuda_gpuarray()
        diff2 = bottom[1].diff_as_pycuda_gpuarray()
        # Compute diff
        self.k_masked_diff_(diff, pred, label, mask)
        self.k_squared_(diff, diff2)
        import scikits.cuda.linalg as linalg
        # This needs scikits.cuda 0.5.0a3 or later:
        # (sudo) pip install "scikits.cuda>=0.5.0a3"
        linalg.dot(diff.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.diff_sum_)
        linalg.dot(diff2.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.diff2_sum_)
        linalg.dot(mask.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.mask_sum_)
        self.k_ensure_mask_sum_(self.mask_sum_)
        term1 = self.k_div_sum_(self.diff2_sum_, self.mask_sum_)
        term2 = self.k_div_squared_sum_(self.diff_sum_, self.mask_sum_)
        top[0].data[...] = (term1.get() - self.lambda_ * term2.get()) \
            / batch_size
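# Hedged CPU reference for the loss computed above (a sketch; the exact
# semantics of the k_div_sum_/k_div_squared_sum_ kernels are assumptions).
# Per sample, with masked difference d = (pred - label) * mask and
# s = sum(mask), this reads as mean_b[ sum(d^2)/s - lambda * (sum(d)/s)^2 ],
# a scale-invariant squared error.
import numpy as np

def scale_invariant_loss_cpu(pred, label, mask, lam):
    d = (pred - label) * mask                # (B, D)
    s = np.maximum(mask.sum(axis=1), 1)      # guard against empty masks
    term1 = (d**2).sum(axis=1) / s
    term2 = (d.sum(axis=1) / s)**2
    return np.mean(term1 - lam * term2)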
def backprop(self, input_data, targets, cache=None):
    """ Backpropagate through the logistic layer

    Inputs:
    input_data
    targets
    cache (optional): cache object from the forward pass
    """

    if cache is not None:
        activations = cache
    else:
        activations = self.feed_forward(input_data, prediction=False)

    delta = activations - targets
    nan_to_zeros(delta, delta)

    # Gradient wrt weights
    df_W = linalg.dot(input_data, delta, transa='T')
    # Gradient wrt bias
    df_b = matrix_sum_out_axis(delta, 0)
    # Gradient wrt input
    df_input = linalg.dot(delta, self.W, transb='T')

    # L1 penalty
    if self.l1_penalty_weight:
        df_W -= self.l1_penalty_weight * sign(self.W)

    # L2 penalty
    if self.l2_penalty_weight:
        df_W -= self.l2_penalty_weight * self.W

    return (df_W, df_b), df_input
def left_dot_col_major_gpu(m1, m2):
    out_gpu = gpuarray.GPUArray((m1.shape[0], m2.shape[1]), np.float64,
                                allocator=cuda.mem_alloc, order='F')
    gpu_mem_avail = GPU_MAX_MEM - out_gpu.nbytes - m1.nbytes
    print("GPU MEM: " + str(GPU_MAX_MEM))
    print("GPU MEM AVAIL: " + str(gpu_mem_avail))
    print("M1 SIZE: " + str(m1.nbytes))
    print("OUT SIZE: " + str(out_gpu.nbytes))
    # Split m2 into enough column blocks that each fits in the remaining memory.
    frags = 1
    while (m2.nbytes / frags) > gpu_mem_avail:
        frags += 1
    m1_gpu = gpuarray.to_gpu(m1)
    subm2_gpu = None
    shift = 0
    for subm2 in np.array_split(m2, frags, axis=1):
        if subm2_gpu is not None:
            del subm2_gpu
        subm2_gpu = gpuarray.to_gpu(subm2)
        linalg.dot(m1_gpu, subm2_gpu,
                   out=out_gpu[:, shift:shift + subm2.shape[1]])
        shift += subm2.shape[1]
    out = out_gpu.get()
    del m1_gpu
    del out_gpu
    return out
def f(mat, axis=0, cache_one_vector=True):
    assert mat.flags.c_contiguous
    N, M = mat.shape

    if axis == 0:
        vec_shape = (N,)
        try:
            ones = one_vector_cache[vec_shape]
        except KeyError:
            ones = gpuarray.empty(vec_shape, dtype=mat.dtype).fill(1.)
            if cache_one_vector:
                one_vector_cache[vec_shape] = ones
        target = linalg.dot(ones, mat).ravel()
    elif axis == 1:
        vec_shape = (M, 1)
        try:
            ones = one_vector_cache[vec_shape]
        except KeyError:
            ones = gpuarray.empty((M, 1), dtype=mat.dtype).fill(1.)
            if cache_one_vector:
                one_vector_cache[vec_shape] = ones
        target = linalg.dot(mat, ones).ravel()
    else:
        raise ValueError('axis must be 0 or 1')

    return target
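# Hedged CPU equivalent of f above (a sketch): multiplying by a ones vector
# on the left (axis=0) or right (axis=1) is just a sum along that axis, so a
# cuBLAS matrix-vector product stands in for a dedicated reduction kernel.
import numpy as np

def f_cpu(mat, axis=0):
    if axis not in (0, 1):
        raise ValueError('axis must be 0 or 1')
    return mat.sum(axis=axis)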
def thunk():
    x = inputs[0]
    y = inputs[1]

    # chop off the real/imag dimension
    input_shape_x = x[0].shape  # (a, b, 2)
    input_shape_y = y[0].shape  # (b, c, 2)

    output_shape = (input_shape_x[0], input_shape_y[1], 2)  # (a, c, 2)

    input_x_pycuda = to_complex_gpuarray(x[0])
    input_y_pycuda = to_complex_gpuarray(y[0])

    output_pycuda = linalg.dot(input_x_pycuda, input_y_pycuda)

    outputs[0][0] = to_complex_cudandarray(output_pycuda)
def train_rfn_gpu(X, n_hidden, n_iter, learnrateW, learnratePsi,
                  dropout_rate, input_dropout_rate, minPsi=0.1, seed=32):
    k = n_hidden
    n, m = X.shape
    W = np.random.normal(scale=0.01, size=(k, m)).astype(np.float32)
    P = np.array([0.1] * m, dtype=np.float32)
    XXdiag = np.diag(np.dot(X.T, X) / n).copy()  # explicit copy to avoid numpy 1.8 warning
    W = gpu.to_gpu(W, allocator=_mempool.allocate)
    P = gpu.to_gpu(P, allocator=_mempool.allocate)
    X = gpu.to_gpu(X, allocator=_mempool.allocate)
    XXdiag = gpu.to_gpu(XXdiag, allocator=_mempool.allocate)
    I = la.eye(k, dtype=np.float32)

    init_rng(seed)
    t0 = time.time()
    for cur_iter in range(n_iter):
        H, tmp = calculate_H_gpu(X, W, P)
        if dropout_rate > 0:
            dropout(H, dropout_rate)
        Xtmp = X
        if input_dropout_rate > 0:
            Xtmp = X.copy()
            saltpepper_noise(Xtmp, input_dropout_rate)
        U = la.dot(Xtmp, H, "t", "n") / n
        S = la.dot(H, H, "t", "n") / n
        S += I
        S -= la.dot(tmp, W, "n", "t")
        Cii = la.dot(la.dot(W, S, "t") - 2 * U, W)
        Sinv = la.inv(S, overwrite=True)
        dW = la.dot(Sinv, U, "n", "t") - W
        dP = XXdiag + la.diag(Cii) - P
        W += learnrateW * dW
        P += learnratePsi * dP
        P = gpu.maximum(P, minPsi)
        if cur_iter % 25 == 0:
            print "iter %3d (elapsed time: %5.2fs)" % (cur_iter, time.time() - t0)
    return W.get(), P.get()
def Test():
    A = np.float32(np.random.randn(*(2000, 2000)))
    A = np.complex64(np.ones((2000, 2000)) + 1j * np.ones((2000, 2000)))
    AT = A.T.copy()
    A_32 = A    #np.float32(A)
    AT_32 = AT  #np.float32(AT)
    T = ClassTimeIt.ClassTimeIt()

    # create two matrices and copy them to the GPU
    g_A0 = cm.CUDAMatrix(A)
    g_AT0 = cm.CUDAMatrix(AT)

    # perform calculations on the GPU
    P0 = cm.dot(g_AT0, g_A0).asarray()
    #d = cm.sum(axis=0)
    T.timeit("GPU0")
    del (g_AT0, g_A0)
    #T.reinit()

    # copy the result back to the host (CPU) and compare
    g_A1 = gpuarray.to_gpu(A)
    g_AT1 = gpuarray.to_gpu(AT)
    #time.sleep(5)
    #T.timeit("tranf0")
    g_P1 = culinalg.dot(g_AT1, g_A1)
    P1 = g_P1.get()
    #T.timeit("tranf1")
    T.timeit("GPU1")

    np_P = np.dot(AT, A)
    T.timeit("np")
    #print g_P - np_P
    print(np.max(np_P - P0))
    print(np.max(np_P - P1))
def feed_forward(self, input_data, prediction=False):
    """Propagate forward through the layer.

    **Parameters:**

    input_data : ``GPUArray``
        Input data to compute activations for.

    prediction : bool, optional
        Whether to use prediction model. Only relevant when using
        dropout. If true, then weights are halved if the layer uses
        dropout.

    **Returns:**

    activations : ``GPUArray``
        The activations of the output units.
    """

    activations = linalg.dot(input_data, self.W)
    activations = add_vec_to_mat(activations, self.b, inplace=True)

    activations = softmax(activations)

    return activations
def forward(self, bottom, top):
    """ """
    with pu.caffe_cuda_context():
        h = caffe.cublas_handle()
        batch_size = bottom[0].shape[0]
        dim = bottom[0].count / bottom[0].shape[0]
        pred = bottom[0].data_as_pycuda_gpuarray()
        label = bottom[1].data_as_pycuda_gpuarray()
        # Use bottom[0,1].diff as temporary buffers; mask aliases
        # bottom[0].diff and is only filled after diff has been consumed.
        diff = bottom[0].diff_as_pycuda_gpuarray()
        diff2 = bottom[1].diff_as_pycuda_gpuarray()
        mask = bottom[0].diff_as_pycuda_gpuarray()
        # Compute diff
        self.k_masked_diff_(diff, pred, label)
        self.k_squared_(diff, diff2)
        import scikits.cuda.linalg as linalg
        # This needs scikits.cuda 0.5.0a3 or later:
        # (sudo) pip install "scikits.cuda>=0.5.0a3"
        linalg.dot(diff.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.diff_sum_)
        linalg.dot(diff2.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.diff2_sum_)
        mask.fill(dtype(1.0))
        linalg.dot(mask.reshape(batch_size, dim), self.multipier_sum_,
                   handle=h, out=self.mask_sum_)
        self.k_ensure_mask_sum_(self.mask_sum_)
        term1 = self.k_div_sum_(self.diff2_sum_, self.mask_sum_)
        term2 = self.k_div_squared_sum_(self.diff_sum_, self.mask_sum_)
        top[0].data[...] = (term1.get() - self.lambda_ * term2.get()) \
            / batch_size