Code example #1
File: fastnet.py Project: iskandr/striate
  def prepare_for_train(self, data, label):
    assert len(data.shape) == 4
    if data.shape[3] != self.batchSize:
      self.batchSize = data.shape[3]
      for l in self.layers:
        l.change_batch_size(self.batchSize)
      self.inputShapes = None
      self.imgShapes = None
      self.outputs = []
      self.grads = []
      self.local_outputs = []
      self.local_grads = []


      self.imgShapes = [(self.numColor, self.imgSize / 2, self.imgSize / 2, self.batchSize)]
      self.inputShapes = [(self.numColor * (self.imgSize ** 2) / 4, self.batchSize)]

      fc = False
      for layer in self.layers:
        outputShape = layer.get_output_shape()

        row = outputShape[0] * outputShape[1] * outputShape[2]
        col = outputShape[3]

        if layer.type == 'softmax':
          row *= comm.Get_size()
          outputShape = (outputShape[0] * comm.Get_size(), 1, 1, outputShape[3])

        self.inputShapes.append((row, col))
        self.imgShapes.append(outputShape)

        area = make_area(outputShape)
        self.outputs.append(virtual_array(rank, area = area))
        self.local_outputs.append(gpuarray.zeros((row, col), dtype =np.float32))

        inputShape = self.inputShapes[-2]
        #if layer.type == 'fc':
        #  inputShape = (inputShape[0] * comm.Get_size(), inputShape[1])
        #  self.local_grads.append(gpuarray.zeros(inputShape, dtype = np.float32))
        #  area = make_plain_area(inputShape)
        #else:
        #  self.local_grads.append(gpuarray.zeros(inputShape, dtype= np.float32))
        #  area = make_area(self.imgShapes[-2])
        #self.grads.append(virtual_array(rank, area = area))

      area = make_area((self.numColor, self.imgSize / 2, self.imgSize / 2, self.batchSize))
      self.data = virtual_array(rank, local = gpuarray.to_gpu(data[area.to_slice()]),
          area = area)

      if not isinstance(label, GPUArray):
        self.label = gpuarray.to_gpu(label).astype(np.float32)
      else:
        self.label = label

      self.label = self.label.reshape((label.size, 1))
      self.numCase += data.shape[1]
      outputShape = self.inputShapes[-1]

      if self.output is None or self.output.shape != outputShape:
        self.output = gpuarray.zeros(outputShape, dtype = np.float32)
Code example #2
def riemanntheta_high_dim(X, Yinv, T, z, g, rad, max_points = 10000000):
    parRiemann = RiemannThetaCuda(1,512)
    #initialize parRiemann
    parRiemann.compile(g)
    parRiemann.cache_omega_real(X)
    parRiemann.cache_omega_imag(Yinv,T)
    #compile the box_points program
    point_finder = func1()
    R = get_rad(T, rad)
    print R
    num_int_points = (2*R + 1)**g
    num_partitions = num_int_points//max_points
    num_final_partition = num_int_points - num_partitions*max_points
    osc_part = 0 + 0*1.j
    if (num_partitions > 0):
        S = gpuarray.zeros(np.int(max_points * g), dtype=np.double)
    print "Required number of iterations"
    print num_partitions
    print 
    for p in range(num_partitions):
        print p
        print
        S = box_points(point_finder, max_points*p, max_points*(p+1),g,R, S)
        parRiemann.cache_intpoints(S, gpu_already=True)
        osc_part += parRiemann.compute_v_without_derivs(np.array([z]))
    S = gpuarray.zeros(np.int((num_int_points - num_partitions*max_points)*g), dtype = np.double)
    print num_partitions*max_points,num_int_points
    S = box_points(point_finder, num_partitions*max_points, num_int_points, g, R,S)
    parRiemann.cache_intpoints(S,gpu_already = True)
    osc_part += parRiemann.compute_v_without_derivs(np.array([z]))
    print osc_part
    return osc_part
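
The partitioning pattern above sums over more integer points than fit in GPU memory at once by processing max_points at a time, then a final partial chunk. A minimal CPU sketch of the same pattern, with an illustrative summand that is not from the source:

import numpy as np

def chunked_sum(num_int_points, max_points=10000000):
    total = 0.0
    num_partitions = num_int_points // max_points
    for p in range(num_partitions):
        pts = np.arange(p * max_points, (p + 1) * max_points, dtype=np.float64)
        total += np.exp(-1e-9 * pts).sum()  # stand-in for the oscillatory term
    # final, possibly partial chunk
    pts = np.arange(num_partitions * max_points, num_int_points, dtype=np.float64)
    total += np.exp(-1e-9 * pts).sum()
    return total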
Code example #3
 def compute_v_without_derivs(self, Xs, Yinvs, Ts):
     #Turn the parts of omega into gpuarrays
     Xs = np.require(Xs, dtype = np.double, requirements=['A', 'W', 'O', 'C'])
     Yinvs = np.require(Yinvs, dtype = np.double, requirements=['A', 'W', 'O', 'C'])
     Ts = np.require(Ts, dtype = np.double, requirements=['A', 'W', 'O', 'C'])
     Xs_d = gpuarray.to_gpu(Xs)
     Yinvs_d = gpuarray.to_gpu(Yinvs)
     Ts_d = gpuarray.to_gpu(Ts)
     #Determine N = the number of integer points to sum over
     #          K = the number of different omegas to compute the function at
     N = self.Sd.size/self.g
     K = Xs.size/(self.g**2)
     #Create room on the gpu for the real and imaginary finite sum calculations
     fsum_reald = gpuarray.zeros(N*K, dtype=np.double)
     fsum_imagd = gpuarray.zeros(N*K, dtype=np.double)
     #Turn all scalars into numpy data types
     Nd = np.int32(N)
     Kd = np.int32(K)
     gd = np.int32(self.g)
     blocksize = (self.tilewidth, self.tileheight, 1)
     gridsize = (N//self.tilewidth + 1, K//self.tileheight + 1, 1)
     self.finite_sum_without_derivs(fsum_reald, fsum_imagd, Xs_d, Yinvs_d, Ts_d,
                                    self.Sd, gd, Nd, Kd,
                                    block = blocksize,
                                    grid = gridsize)
     cuda.Context.synchronize()
     fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd)
     fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd)
     return fsums_real + 1.0j*fsums_imag
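
A note on the launch configuration: gridsize uses N//self.tilewidth + 1 blocks, which rounds up but launches one extra (idle) block whenever N is an exact multiple of the tile width. The usual ceiling-division idiom avoids that; a sketch, not from the source:

def grid_1d(n, tile):
    # ceil(n / tile) without floating point
    return (n + tile - 1) // tile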
Code example #4
File: data.py Project: tesatory/fastnet
  def get_next_batch(self, batch_size):
    if self._reader is None:
      self._start_read()

    if self._gpu_batch is None:
      self._fill_reserved_data()

    height, width = self._gpu_batch.data.shape
    gpu_data = self._gpu_batch.data
    gpu_labels = self._gpu_batch.labels

    if self.index + batch_size >=  width:
      width = width - self.index
      labels = gpu_labels[self.index:self.index + batch_size]

      #data = gpu_data[:, self.index:self.index + batch_size]
      data = gpuarray.zeros((height, width), dtype = np.float32)
      gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + width)

      self.index = 0
      self._fill_reserved_data()
    else:
      labels = gpu_labels[self.index:self.index + batch_size]
      #data = gpu_data[:, self.index:self.index + batch_size]
      data = gpuarray.zeros((height, batch_size), dtype = np.float32)
      gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + batch_size)
      #labels = gpu_labels[self.index:self.index + batch_size]
      self.index += batch_size
    return BatchData(data, labels, self._gpu_batch.epoch)
Code example #5
File: LPU.py Project: LuisMoralesAlonso/neurokernel
    def _initialize_gpu_ds(self):
        """
        Setup GPU arrays.
        """

        self.synapse_state = garray.zeros(int(self.total_synapses) + \
                                    len(self.input_neuron_list), np.float64)
        if self.my_num_gpot_neurons>0:
            self.V = garray.zeros(int(self.my_num_gpot_neurons), np.float64)
        else:
            self.V = None

        if self.my_num_spike_neurons>0:
            self.spike_state = garray.zeros(int(self.my_num_spike_neurons), np.int32)

        if len(self.public_gpot_list)>0:
            self.public_gpot_list_g = garray.to_gpu(self.public_gpot_list)
            self.projection_gpot = garray.zeros(len(self.public_gpot_list), np.double)
            self._extract_gpot = self._extract_projection_gpot_func()

        if len(self.public_spike_list)>0:
            self.public_spike_list_g = garray.to_gpu( \
                (self.public_spike_list-self.spike_shift).astype(np.int32))
            self.projection_spike = garray.zeros(len(self.public_spike_list), np.int32)
            self._extract_spike = self._extract_projection_spike_func()
Code example #6
File: batchtps.py Project: rll/lfd
    def update_ptrs(self):
        self.tps_param_ptrs = get_gpu_ptrs(self.tps_params)
        self.trans_d_ptrs = get_gpu_ptrs(self.trans_d)
        self.lin_dd_ptrs = get_gpu_ptrs(self.lin_dd)
        self.w_nd_ptrs = get_gpu_ptrs(self.w_nd)

        for b in self.bend_coefs:
            self.proj_mat_ptrs[b] = get_gpu_ptrs(self.proj_mats[b])
            self.offset_mat_ptrs[b] = get_gpu_ptrs(self.offset_mats[b])

        self.pt_ptrs = get_gpu_ptrs(self.pts)
        self.kernel_ptrs = get_gpu_ptrs(self.kernels)
        self.pt_w_ptrs = get_gpu_ptrs(self.pts_w)
        self.pt_t_ptrs = get_gpu_ptrs(self.pts_t)
        self.corr_cm_ptrs = get_gpu_ptrs(self.corr_cm)
        self.corr_rm_ptrs = get_gpu_ptrs(self.corr_rm)
        self.r_coef_ptrs = get_gpu_ptrs(self.r_coefs)
        self.c_coef_rn_ptrs = get_gpu_ptrs(self.c_coefs_rn)
        self.c_coef_cn_ptrs = get_gpu_ptrs(self.c_coefs_cn)
        # temporary space for warping cost computations
        self.warp_err = gpuarray.zeros((self.N, MAX_CLD_SIZE), np.float32)
        self.bend_res_mat = gpuarray.zeros((DATA_DIM * self.N, DATA_DIM), np.float32)
        self.bend_res = [self.bend_res_mat[i * DATA_DIM : (i + 1) * DATA_DIM] for i in range(self.N)]
        self.bend_res_ptrs = get_gpu_ptrs(self.bend_res)

        self.dims_gpu = gpuarray.to_gpu(np.array(self.dims, dtype=np.int32))
        self.ptrs_valid = True
Code example #7
 def compute_v_without_derivs(self, Z):
     #Turn the numpy set Z into gpuarrays
     x = Z.real
     y = Z.imag
     x = np.require(x, dtype = np.double, requirements=['A','W','O','C'])
     y = np.require(y, dtype = np.double, requirements=['A','W','O','C'])
     xd = gpuarray.to_gpu(x)
     yd = gpuarray.to_gpu(y)
     self.yd = yd
     #Determine N = the number of integer points to sum over and
     #          K = the number of values to compute the function at
     N = self.Sd.size/self.g
     K = Z.size/self.g
     #Create room on the gpu for the real and imaginary finite sum calculations
     fsum_reald = gpuarray.zeros(N*K, dtype=np.double)
     fsum_imagd = gpuarray.zeros(N*K, dtype=np.double)
     #Make all scalars into numpy data types
     Nd = np.int32(N)
     Kd = np.int32(K)
     gd = np.int32(self.g)
     blocksize = (self.tilewidth, self.tileheight, 1)
     gridsize = (N//self.tilewidth + 1, K//self.tileheight + 1, 1)
     self.finite_sum_without_derivs(fsum_reald, fsum_imagd, xd, yd, 
                  self.Sd, gd, Nd, Kd,
                  block = blocksize,
                  grid = gridsize)
     cuda.Context.synchronize()
     fsums_real = self.sum_reduction(fsum_reald, N, K, Kd, Nd)
     fsums_imag = self.sum_reduction(fsum_imagd, N, K, Kd, Nd)
     return fsums_real + 1.0j*fsums_imag
Code example #8
File: layer.py Project: phecy/striate
 def logreg_cost(self, label, output):
   if self.cost.shape[0] !=  self.batchSize:
     self.cost = gpuarray.zeros((self.batchSize, 1), dtype=np.float32)
   maxid = gpuarray.zeros((self.batchSize, 1), dtype=np.float32)
   find_col_max_id(maxid, output)
   self.batchCorrect = same_reduce(label , maxid)
   logreg_cost_col_reduce(output, label, self.cost)
Code example #9
File: LPU.py Project: yiyin/neurokernel
    def _initialize_gpu_ds(self):
        """
        Setup GPU arrays.
        """

        self.synapse_state = garray.zeros(max(int(self.total_synapses) + len(self.input_neuron_list), 1), np.float64)

        if self.total_num_gpot_neurons > 0:
            self.V = garray.zeros(int(self.total_num_gpot_neurons), np.float64)
        else:
            self.V = None

        if self.total_num_spike_neurons > 0:
            self.spike_state = garray.zeros(int(self.total_num_spike_neurons), np.int32)

        self.block_extract = (256, 1, 1)
        if len(self.out_ports_ids_gpot) > 0:
            self.out_ports_ids_gpot_g = garray.to_gpu(self.out_ports_ids_gpot)
            self.sel_out_gpot_ids_g = garray.to_gpu(self.sel_out_gpot_ids)

            self._extract_gpot = self._extract_projection_gpot_func()

        if len(self.out_ports_ids_spk) > 0:
            self.out_ports_ids_spk_g = garray.to_gpu((self.out_ports_ids_spk - self.spike_shift).astype(np.int32))
            self.sel_out_spk_ids_g = garray.to_gpu(self.sel_out_spk_ids)

            self._extract_spike = self._extract_projection_spike_func()

        if self.ports_in_gpot_mem_ind is not None:
            inds = self.sel_in_gpot_ids
            self.inds_gpot = garray.to_gpu(inds)

        if self.ports_in_spk_mem_ind is not None:
            inds = self.sel_in_spk_ids
            self.inds_spike = garray.to_gpu(inds)
Code example #10
File: AlphaSynapsePre.py Project: yiyin/neurokernel
    def __init__( self, s_dict, synapse_state, dt, debug=False):
        self.debug = debug
        self.dt = dt
        self.num = len( s_dict['id'] )

        self.pre  = garray.to_gpu( np.asarray( s_dict['pre'], dtype=np.int32 ))
        self.ar   = garray.to_gpu( np.asarray( s_dict['ar'], dtype=np.float64 ))
        self.ad   = garray.to_gpu( np.asarray( s_dict['ad'], dtype=np.float64 ))
        self.gmax = garray.to_gpu( np.asarray( s_dict['gmax'], dtype=np.float64 ))
        self.a0   = garray.zeros( (self.num,), dtype=np.float64 )
        self.a1   = garray.zeros( (self.num,), dtype=np.float64 )
        self.a2   = garray.zeros( (self.num,), dtype=np.float64 )
        self.cond = synapse_state

        _num_dendrite_cond = np.asarray(
            [s_dict['num_dendrites_cond'][i] for i in s_dict['id']],\
            dtype=np.int32).flatten()
        _num_dendrite = np.asarray(
            [s_dict['num_dendrites_I'][i] for i in s_dict['id']],\
            dtype=np.int32).flatten()

        self._cum_num_dendrite = garray.to_gpu(_0_cumsum(_num_dendrite))
        self._cum_num_dendrite_cond = garray.to_gpu(_0_cumsum(_num_dendrite_cond))
        self._num_dendrite = garray.to_gpu(_num_dendrite)
        self._num_dendrite_cond = garray.to_gpu(_num_dendrite_cond)
        self._pre = garray.to_gpu(np.asarray(s_dict['I_pre'], dtype=np.int32))
        self._cond_pre = garray.to_gpu(np.asarray(s_dict['cond_pre'], dtype=np.int32))
        self._V_rev = garray.to_gpu(np.asarray(s_dict['reverse'],dtype=np.double))
        self.I = garray.zeros(self.num, np.double)
        #self._update_I_cond = self._get_update_I_cond_func()
        self._update_I_non_cond = self._get_update_I_non_cond_func()
        self.update = self._get_gpu_kernel()
Code example #11
File: pdf.py Project: BenLand100/chroma
    def setup_pdf_eval(self, event_hit, event_time, event_charge, min_twidth,
                       trange, min_qwidth, qrange, min_bin_content=10,
                       time_only=True):
        """Setup GPU arrays to compute PDF values for the given event.
        The pdf_eval calculation allows the PDF to be evaluated at a
        single point for each channel as the Monte Carlo is run.  The
        effective bin size will be as small as (`min_twidth`,
        `min_qwidth`) around the point of interest, but will be large
        enough to ensure that `min_bin_content` Monte Carlo events
        fall into the bin.

            event_hit: ndarray
              Hit or not-hit status for each channel in the detector.
            event_time: ndarray
              Hit time for each channel in the detector.  If channel 
              not hit, the time will be ignored.
            event_charge: ndarray
              Integrated charge for each channel in the detector.
              If channel not hit, the charge will be ignored.

            min_twidth: float
              Minimum bin size in the time dimension
            trange: (float, float)
              Range of time dimension in PDF
            min_qwidth: float
              Minimum bin size in charge dimension
            qrange: (float, float)
              Range of charge dimension in PDF
            min_bin_content: int
              The bin will be expanded to include at least this many events
            time_only: bool
              If True, only the time observable will be used in the PDF.
        """
        self.event_nhit = count_nonzero(event_hit)
        
        # Define a mapping from an array of len(event_hit) to an array of length event_nhit
        self.map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
        self.map_hit_offset_to_channel_id_gpu = ga.to_gpu(self.map_hit_offset_to_channel_id)
        self.map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
        self.map_channel_id_to_hit_offset_gpu = ga.to_gpu(self.map_channel_id_to_hit_offset)

        self.event_hit_gpu = ga.to_gpu(event_hit.astype(np.uint32))
        self.event_time_gpu = ga.to_gpu(event_time.astype(np.float32))
        self.event_charge_gpu = ga.to_gpu(event_charge.astype(np.float32))

        self.eval_hitcount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.eval_bincount_gpu = ga.zeros(len(event_hit), dtype=np.uint32)
        self.nearest_mc_gpu = ga.empty(shape=self.event_nhit * min_bin_content, 
                                             dtype=np.float32)
        self.nearest_mc_gpu.fill(1e9)
        
        self.min_twidth = min_twidth
        self.trange = trange
        self.min_qwidth = min_qwidth
        self.qrange = qrange
        self.min_bin_content = min_bin_content

        assert time_only # Only support time right now
        self.time_only = time_only
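
The two mapping arrays built at the top of setup_pdf_eval are worth unpacking. A small numpy illustration with a toy event_hit (the data is illustrative, not from the project):

import numpy as np

event_hit = np.array([0, 1, 0, 1, 1, 0])
map_hit_offset_to_channel_id = np.where(event_hit)[0].astype(np.uint32)
# -> [1, 3, 4]: the channel id of each hit, indexed by hit offset
map_channel_id_to_hit_offset = np.maximum(0, event_hit.cumsum() - 1).astype(np.uint32)
# -> [0, 0, 0, 1, 2, 2]: the hit offset for each hit channel; entries for
# unhit channels are clamped placeholders and are never read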
Code example #12
File: layer.py Project: smessing/striate
 def fprop(self, input, output):
   max = gpuarray.zeros((1, self.batchSize), dtype = np.float32)
   col_max_reduce(max, input)
   add_vec_to_cols(input, max, output, alpha = -1)
   gpu_copy_to(cumath.exp(output), output)
   sum = gpuarray.zeros(max.shape, dtype = np.float32)
   add_col_sum_to_vec(sum, output, alpha = 0)
   div_vec_to_cols(output, sum)
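
The kernel sequence above (and its twin in example #15) is a column-wise, numerically stable softmax. A numpy sketch of the same computation:

import numpy as np

def softmax_cols(x):
    m = x.max(axis=0, keepdims=True)         # col_max_reduce
    e = np.exp(x - m)                        # add_vec_to_cols(alpha=-1) + exp
    return e / e.sum(axis=0, keepdims=True)  # add_col_sum_to_vec + div_vec_to_cols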
Code example #13
    def createHashTable(kd, vd, capacity):
        table_capacity_gpu, _ = mod.get_global('table_capacity')
        cuda.memcpy_htod(table_capacity_gpu, np.uint([capacity]))

        # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_capacity,
        #           &capacity,
        #           sizeof(unsigned int)));

        table_vals_gpu, table_vals_size = mod.get_global('table_values') # pointer-2-pointer
        values_gpu = gpuarray.zeros((capacity*vd,1), dtype=np.float32)
        # values_gpu = gpuarray.zeros((capacity*vd,1), dtype=np.float32)
        # cuda.memset_d32(values_gpu.gpudata, 0, values_gpu.size)
        cuda.memcpy_dtod(table_vals_gpu, values_gpu.gpudata, table_vals_size)

        # float *values;
        # allocateCudaMemory((void**)&values, capacity*vd*sizeof(float));
        # CUDA_SAFE_CALL(cudaMemset((void *)values, 0, capacity*vd*sizeof(float)));
        # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_values,
        #                   &values,
        #                   sizeof(float *)));

        table_entries, table_entries_size = mod.get_global('table_entries')
        entries_gpu = gpuarray.empty((capacity*2,1), dtype=np.int)
        entries_gpu.fill(-1)
        # cuda.memset_d32(entries_gpu.gpudata, 1, entries_gpu.size)
        cuda.memcpy_dtod(table_entries, entries_gpu.gpudata, table_entries_size)

        # int *entries;
        # allocateCudaMemory((void **)&entries, capacity*2*sizeof(int));
        # CUDA_SAFE_CALL(cudaMemset((void *)entries, -1, capacity*2*sizeof(int)));
        # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_entries,
        #                   &entries,
        #                   sizeof(unsigned int *)));

        ########################################
        # Assuming LINEAR_D_MEMORY not defined #
        ########################################

        #  #ifdef LINEAR_D_MEMORY
        # char *ranks;
        # allocateCudaMemory((void**)&ranks, capacity*sizeof(char));
        # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_rank,
        #                   &ranks,
        #                   sizeof(char *)));
        #
        # signed short *zeros;
        # allocateCudaMemory((void**)&zeros, capacity*sizeof(signed short));
        # CUDA_SAFE_CALL(cudaMemcpyToSymbol(table_zeros,
        #                   &zeros,
        #                   sizeof(char *)));
        #
        # #else

        table_keys_gpu, table_keys_size = mod.get_global('table_keys')
        keys_gpu = gpuarray.zeros((capacity*kd,1), dtype=np.short)
        # keys_gpu = gpuarray.empty((capacity*kd,1), dtype=np.short)
        # cuda.memset_d32(keys_gpu.gpudata, 0, keys_gpu.size)
        cuda.memcpy_dtod(table_keys_gpu, keys_gpu.gpudata, table_keys_size)
Code example #14
File: layer.py Project: alemagnani/fastnet
 def logreg_cost_multiview(self, label, output, num_view):
   unit = self.batch_size / num_view
   if self.cost.shape[0] != unit:
     self.cost = gpuarray.zeros((unit, 1), dtype = np.float32)
   maxid = gpuarray.zeros((self.batch_size, 1), dtype = np.float32)
   find_col_max_id(maxid, output)
   self.batchCorrect = same_reduce_multiview(label, maxid, num_view)
   tmp = gpuarray.zeros((output.shape[0], unit), dtype = np.float32)
   gpu_partial_copy_to(output, tmp, 0, output.shape[0], 0, unit)
   logreg_cost_col_reduce(tmp, label, self.cost)
Code example #15
File: layer.py Project: phecy/striate
 def fprop(self, input, output, train=TRAIN):
   max = gpuarray.zeros((1, self.batchSize), dtype=np.float32)
   col_max_reduce(max, input)
   add_vec_to_cols(input, max, output, alpha= -1)
   eltwise_exp(output)
   sum = gpuarray.zeros(max.shape, dtype=np.float32)
   add_col_sum_to_vec(sum, output, alpha=0)
   div_vec_to_cols(output, sum)
   if PFout:
     print_matrix(output, self.name)
Code example #16
File: cuda_alternatives.py Project: amilsted/evoMPS
    def __init__(self, A1, A2, left, use_batch=False):
        """Creates a new LinearOperator interface to the superoperator E.
        
        This is a wrapper to be used with SciPy's sparse linear algebra routines.
        
        Parameters
        ----------
        A1 : ndarray
            Ket parameter tensor. 
        A2 : ndarray
            Bra parameter tensor.
        left : bool
            Whether to multiply with a vector to the left (or to the right).
        """
        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))
        
        self.use_batch = use_batch
        self.left = left
        
        self.D = A1[0].shape[1]        
        self.shape = (self.D**2, self.D**2)        
        self.dtype = sp.dtype(A1[0][0].dtype)
        
        self.calls = 0        
        
        self.out = garr.empty((self.D, self.D), dtype=self.dtype)        
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)

        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None

            self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            
            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())
        
        self.hdl = cb.cublasCreate()
Code example #17
File: data.py Project: rjpower/fastnet
    def get_next_batch(self, batch_size):
      if self._reader is None:
        self._start_read()

      if self._gpu_batch is None:
        self._fill_reserved_data()

      if not self.multiview:
        height, width = self._gpu_batch.data.shape
        gpu_data = self._gpu_batch.data
        gpu_labels = self._gpu_batch.labels
        epoch = self._gpu_batch.epoch

        if self.index + batch_size >=  width:
          width = width - self.index
          labels = gpu_labels[self.index:self.index + batch_size]

          data = gpuarray.zeros((height, width), dtype = np.float32)
          gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + width)
          self.index = 0
          self._fill_reserved_data()
        else:
          labels = gpu_labels[self.index:self.index + batch_size]
          data = gpuarray.zeros((height, batch_size), dtype = np.float32)
          gpu_partial_copy_to(gpu_data, data, 0, height, self.index, self.index + batch_size)
          self.index += batch_size
      else:
        # multiview provider
        # number of views should be 10
        # when using multiview, do not pre-move data and labels to gpu
        height, width = self._cpu_batch.data.shape
        cpu_data = self._cpu_batch.data
        cpu_labels = self._cpu_batch.labels
        epoch = self._cpu_batch.epoch

        width /= self.num_view

        if self.index + batch_size >=  width:
          batch_size = width - self.index

        labels = cpu_labels[self.index:self.index + batch_size]
        data = np.zeros((height, batch_size * self.num_view), dtype = np.float32)
        for i in range(self.num_view):
          data[:, i* batch_size: (i+ 1) * batch_size] = cpu_data[:, self.index + width * i : self.index + width * i + batch_size]

        data = copy_to_gpu(np.require(data, requirements = 'C'))
        labels = copy_to_gpu(np.require(labels, requirements = 'C'))


        self.index = (self.index + batch_size) / width
      
      #util.log_info('Batch: %s %s %s', data.shape, gpu_labels.shape, labels.shape)
      return BatchData(data, labels, epoch)
Code example #18
File: gICLEAN.py Project: shaoguangleo/autoFits
def cuda_hogbom(gpu_dirty,gpu_dpsf,gpu_cpsf,thresh=0.2,damp=1,gain=0.1,prefix='test'):
  """
  Use CUDA to implement the Hogbom CLEAN algorithm

  A nice description of the algorithm is given by the NRAO, here:
  http://www.cv.nrao.edu/~abridle/deconvol/node8.html

  Parameters:
  * dirty: The dirty image (2D numpy array)
  * dpsf: The dirty beam psf  (2D numpy array)
  * thresh: User-defined threshold to stop iteration, as a fraction of the max pixel intensity (float)
  * damp: The damping factor to scale the dirty beam by
  * gain: The loop gain, i.e. the fraction of the peak intensity subtracted in each iteration (float)
  * prefix: prefix for output image file names
  """
  height,width=np.shape(gpu_dirty)
  ## Grid parameters - #improvable#
  tsize=8
  blocksize = (int(tsize),int(tsize),1)     	     # The number of threads per block (x,y,z)
  gridsize  = (int(width/tsize),int(height/tsize))   # The number of thread blocks     (x,y)
  ## Setup clean image and point source model
  gpu_pmodel = gpu.zeros([height,width],dtype=np.float32)
  gpu_clean = gpu.zeros([height,width],dtype=np.float32)
  ## Setup GPU constants
  gpu_max_id = gpu.to_gpu(np.int32(0))
  imax=gpu_getmax(gpu_dirty)
  thresh_val=np.float32(thresh*imax)
  ## Steps 1-3 - Iterate until threshold has been reached
  t_start=time.time()
  i=0
  while abs(imax)>(thresh_val):
    if (np.mod(i,100)==0):
      print "Hogbom iteration",i
    ## Step 1 - Find max
    find_max_kernel(gpu_dirty,gpu_max_id,imax,np.int32(width),np.int32(height),gpu_pmodel,\
			block=blocksize, grid=gridsize)
    ## Step 2 - Subtract the beam (assume that it is normalized to have max 1)
    ##          This kernel simultaneously reconstructs the CLEANed image.
    if PLOTME: print "Subtracting dirty beam "+str(i)+", maxval=%0.8f"%imax+' at x='+str(gpu_max_id.get()%width)+\
			', y='+str(gpu_max_id.get()/width)
    sub_beam_kernel(gpu_dirty,gpu_dpsf,gpu_max_id,gpu_clean,gpu_cpsf,np.float32(gain*imax),np.int32(width),\
			np.int32(height), block=blocksize, grid=gridsize)
    i+=1
    ## Step 3 - Find maximum value using gpuarray
    imax=gpu_getmax(gpu_dirty)
  t_end=time.time()
  t_full=t_end-t_start
  print "Hogbom execution time %0.5f"%t_full+' s'
  print "\t%0.5f"%(t_full/i)+' s per iteration'
  ## Step 4 - Add the residuals back in
  add_noise_kernel(gpu_dirty,gpu_clean,np.float32(width+height))
  return gpu_dirty,gpu_pmodel,gpu_clean
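
For reference, steps 1-3 of the loop above have a compact CPU analogue. A minimal sketch, assuming a psf that is the same size as the image, centered, and normalized to peak 1 (names are illustrative, not from the source):

import numpy as np

def hogbom_cpu(dirty, psf, thresh=0.2, gain=0.1):
    dirty = dirty.copy()
    pmodel = np.zeros_like(dirty)
    height, width = dirty.shape
    stop = thresh * np.abs(dirty).max()
    while np.abs(dirty).max() > stop:
        # Step 1 - find the peak
        y, x = np.unravel_index(np.argmax(np.abs(dirty)), dirty.shape)
        flux = gain * dirty[y, x]
        pmodel[y, x] += flux
        # Step 2 - subtract the beam, shifted onto the peak and scaled
        shifted = np.roll(np.roll(psf, y - height // 2, axis=0),
                          x - width // 2, axis=1)
        dirty -= flux * shifted
        # Step 3 - the new maximum is re-read at the top of the loop
    return dirty, pmodel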
Code example #19
    def __init__(self,**params):

        '''
        Hack-ish way to avoid initialisation until the weights are transferred:
        '''
        should_apply = self.apply_output_fns_init
        params['apply_output_fns_init'] = False

        super(GPUSparseCFProjection,self).__init__(**params)
        # Transfering the weights:
        self.pycuda_stream = cuda.Stream()
        self.weights_gpu = cusparse.CSR.to_CSR(self.weights.toSparseArray().transpose())
        # Getting the row and columns indices for the *transposed* matrix. Used for Hebbian learning and normalisation:
        nzcols, nzrows = self.weights.nonzero()
        tups = sorted(zip(nzrows, nzcols))
        nzrows = [x[0] for x in tups]
        nzcols = [x[1] for x in tups]

        '''
        Allocating a page-locked piece of memory for the activity so that GPU could transfer data to the
        main memory without the involvement of the CPU:
        '''
        self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
        self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

        self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
        self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1], ), dtype=np.float32)

        self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

        # Getting them on the GPU:
        self.nzcount = self.weights.getnnz()
        self.nzrows_gpu = gpuarray.to_gpu(np.array(nzrows, np.int32))
        self.nzcols_gpu = gpuarray.to_gpu(np.array(nzcols, np.int32))
        # Helper array for normalization:
        self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
        # Kernel that applies the normalisation:
        self.normalize_kernel = ElementwiseKernel(
                        "int *nzrows, float *norm_total, float *weights",
                        "weights[i] *= norm_total[nzrows[i]]",
                        "divisive_normalize")
        # Kernel that calculates the learning:
        self.hebbian_kernel = ElementwiseKernel(
                        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
                        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
                        "hebbian_learning")

        params['apply_output_fns_init'] = should_apply
        self.apply_output_fns_init = should_apply
        if self.apply_output_fns_init:
            self.apply_learn_output_fns()
Code example #20
File: cuda_alternatives.py Project: amilsted/evoMPS
    def __init__(self, p, A1, A2, l=None, r=None, left=False, pseudo=True, use_batch=False):
        assert not (pseudo and (l is None or r is None)), 'For pseudo-inverse l and r must be set!'
        
        self.use_batch = use_batch
        self.p = p
        self.left = left
        self.pseudo = pseudo
        self.D = A1[0].shape[1]
        self.shape = (self.D**2, self.D**2)
        self.dtype = A1[0].dtype
        
        self.A1G = [list(map(garr.to_gpu, A1k)) for A1k in A1]
        self.A2G = [list(map(garr.to_gpu, A2k)) for A2k in A2]
        self.tmp = list(map(garr.empty_like, self.A1G[0]))
        self.tmp2 = list(map(garr.empty_like, self.A1G[0]))

        self.l = l
        self.r = r
        self.lG = garr.to_gpu(sp.asarray(l))
        self.rG = garr.to_gpu(sp.asarray(r))
        
        self.out = garr.empty((self.D, self.D), dtype=self.dtype)
        self.out2 = garr.empty((self.D, self.D), dtype=self.dtype)
        self.xG = garr.empty((self.D, self.D), dtype=self.dtype)
        
        if use_batch:
            self.A1G_p = list(map(get_batch_ptrs, self.A1G))
            self.A2G_p = list(map(get_batch_ptrs, self.A2G))
            self.tmp_p = get_batch_ptrs(self.tmp)
            self.tmp2_p = get_batch_ptrs(self.tmp2)
            self.xG_p = get_batch_ptrs([self.xG] * len(A1[0]))
            self.out_p = get_batch_ptrs([self.out] * len(A1[0]))
            self.out2_p = get_batch_ptrs([self.out2] * len(A1[0]))
        else:
            self.A1G_p = None
            self.A2G_p = None
            self.tmp_p = None
            self.tmp2_p = None
            self.xG_p = None
            self.out_p = None
            self.out2_p = None

            self.ones = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            self.ones = [one.fill(1) for one in self.ones]
            self.zeros = [garr.zeros((1), dtype=sp.complex128) for s in range(len(A1[0]))]
            
            self.streams = []
            for s in range(A1[0].shape[0]):
                self.streams.append(cd.Stream())
        
        self.hdl = cb.cublasCreate()
Code example #21
File: fastnet.py Project: iskandr/striate
  def append_layer(self, layer):
    self.layers.append(layer)
    if layer.type == 'conv':
      self.numConv += 1

    outputShape = layer.get_output_shape()
    row = outputShape[0] * outputShape[1] * outputShape[2]
    col = outputShape[3]
    self.inputShapes.append((row, col))
    self.imgShapes.append(outputShape)

    self.outputs.append(gpuarray.zeros((row, col), dtype=np.float32))
    self.grads.append(gpuarray.zeros(self.inputShapes[-2], dtype=np.float32))
    print >> sys.stderr,  '%s[%s]:%s' % (layer.name, layer.type, outputShape)
Code example #22
File: Layer.py Project: beamandrew/BNN
 def __init__(self,n_units,n_incoming,N,init_sd=1.0,precision=np.float32,magic_numbers=False):
     
     self.n_units = n_units
     self.n_incoming = n_incoming
     self.N = N
     w = np.random.normal(0,init_sd,(self.n_incoming,self.n_units))
     b = np.random.normal(0,init_sd,(1,n_units))
     
     self.weights = gpuarray.to_gpu(w.copy().astype(precision))
     self.gW = gpuarray.empty_like(self.weights)
     
     # Prior and ID must be set after creation
     self.prior = -1
     self.ID = -1
             
     self.biases = gpuarray.to_gpu(b.copy().astype(precision))
     self.gB = gpuarray.empty_like(self.biases)
         
     #Set up momentum variables for HMC sampler
     self.pW = gpuarray.to_gpu(np.random.normal(0,1,self.gW.shape))
     self.pB = gpuarray.to_gpu(np.random.normal(0,1,self.gB.shape))
     
     self.epsW = gpuarray.zeros(self.weights.shape,precision) + 1.0
     self.epsB = gpuarray.zeros(self.biases.shape,precision) + 1.0        
     
     self.precision = precision
     self.outputs = gpuarray.zeros((self.N,self.n_units),precision)   
     
     self.magic_numbers = magic_numbers
     #Define tan_h function on GPU   
     if magic_numbers:
         self.tanh = ElementwiseKernel(
             "float *x",
             "x[i] = 1.7159 * tanh(2/3*x[i]);",
             "tan_h",preamble="#include <math.h>")
     else:
         self.tanh = ElementwiseKernel(
         "float *x",
         "x[i] = tanh(min(max(-10.0,x[i]),10.0));",
         "tan_h",preamble="#include <math.h>")
     #Compile kernels 
     kernels = SourceModule(open(path+'/kernels.cu', "r").read())        
     self.add_bias_kernel = kernels.get_function("add_bias")
     
     self.rng = curandom.XORWOWRandomNumberGenerator()
     
     ##Initialize posterior weights
     self.posterior_weights = list()
     self.posterior_biases = list()
Code example #23
File: projection.py Project: wenqi2015/topographica
    def __init__(self,**params):
        #Hack-ish way to avoid initialisation until the weights are transferred:
        should_apply = self.apply_output_fns_init
        params['apply_output_fns_init'] = False
        super(GPUSparseCFProjection,self).__init__(**params)
        # The sparse matrix is stored in COO format, used for Hebbian learning and normalisation:
        nzcols, nzrows, values = self.weights.getTriplets()
        tups = sorted(zip(nzrows, nzcols, values))
        nzrows = np.array([x[0] for x in tups], np.int32)
        nzcols = np.array([x[1] for x in tups], np.int32)
        values = np.array([x[2] for x in tups], np.float32)
        # Getting them on the GPU:
        self.nzcount = self.weights.getnnz()
        self.nzrows_gpu = gpuarray.to_gpu(nzrows)
        self.nzcols_gpu = gpuarray.to_gpu(nzcols)
        # Setting the projection weights in CSR format for dot product calculation:
        rowPtr = cusparse.coo2csr(self.nzrows_gpu, self.weights.shape[1])
        descrA = cusparse.cusparseCreateMatDescr()
        cusparse.cusparseSetMatType(descrA, cusparse.CUSPARSE_MATRIX_TYPE_GENERAL)
        cusparse.cusparseSetMatIndexBase(descrA, cusparse.CUSPARSE_INDEX_BASE_ZERO)

        self.weights_gpu = cusparse.CSR(descrA, values, rowPtr, self.nzcols_gpu, (self.weights.shape[1], self.weights.shape[0]))
        # Allocating a page-locked piece of memory for the activity so that GPU could transfer data to the
        # main memory without the involvement of the CPU:
        self.activity = cuda.pagelocked_empty(self.activity.shape, np.float32)
        self.activity_gpu_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)

        self.input_buffer_pagelocked = cuda.pagelocked_empty(shape=(self.weights_gpu.shape[1],), dtype=np.float32, mem_flags=cuda.host_alloc_flags.WRITECOMBINED)
        self.input_buffer = gpuarray.zeros(shape=(self.weights_gpu.shape[1], ), dtype=np.float32)

        self.norm_total_gpu = gpuarray.zeros(shape=(self.weights_gpu.shape[0],), dtype=np.float32)
        # Helper array for normalization:
        self.norm_ones_gpu = gpuarray.to_gpu(np.array([1.0] * self.weights_gpu.shape[1], np.float32))
        # Kernel that applies the normalisation:
        self.normalize_kernel = ElementwiseKernel(
                        "int *nzrows, float *norm_total, float *weights",
                        "weights[i] *= norm_total[nzrows[i]]",
                        "divisive_normalize")
        # Kernel that calculates the learning:
        self.hebbian_kernel = ElementwiseKernel(
                        "float single_conn_lr, int *row, int *col, float *src_activity, float *dest_activity, float *result",
                        "result[i] += single_conn_lr * src_activity[col[i]] * dest_activity[row[i]]",
                        "hebbian_learning")
        self.pycuda_stream = cuda.Stream()
        # Finishing the initialisation that might have been delayed:
        params['apply_output_fns_init'] = should_apply
        self.apply_output_fns_init = should_apply
        if self.apply_output_fns_init:
            self.apply_learn_output_fns()
Code example #24
    def reshape(self, bottom, top):
        with pu.caffe_cuda_context():

            batch_size = bottom[0].shape[0]
            if self.batch_size_ != batch_size:
                self.batch_size_ = batch_size
                self.diff_sum_ = gpuarray.zeros((batch_size, 1), dtype)
                self.diff2_sum_ = gpuarray.zeros((batch_size, 1), dtype)
                self.mask_sum_ = gpuarray.zeros((batch_size, 1), dtype)
            dim = int(np.prod(bottom[0].shape[1:]))
            if self.dim_ != dim:
                self.dim_ = dim
                self.multipier_sum_ = gpuarray.zeros((dim, 1), dtype)
                self.multipier_sum_.fill(dtype(1.0))
        top[0].reshape()
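
The shape test before re-allocating is a recurring idiom in these examples (see also examples #1, #8, and #14): GPU buffers are created only when the batch size actually changes, since device allocations are comparatively expensive. A minimal sketch of the idiom, with hypothetical names:

import numpy as np
import pycuda.autoinit  # noqa: F401 (creates a CUDA context)
import pycuda.gpuarray as gpuarray

class LazyBuffer(object):
    """Reuse a GPU buffer across calls that request the same shape."""
    def __init__(self):
        self.buf = None

    def get(self, shape):
        if self.buf is None or self.buf.shape != shape:
            self.buf = gpuarray.zeros(shape, dtype=np.float32)
        return self.buf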
Code example #25
File: fastnet.py Project: phecy/striate
  def append_layer(self, layer):
    self.layers.append(layer)
    if layer.type == 'conv':
      self.numConv += 1

    outputShape = layer.get_output_shape()
    row = outputShape[1] * outputShape[2] * outputShape[3]
    col = outputShape[0]
    self.inputShapes.append((row, col))
    self.imgShapes.append(outputShape)

    self.outputs.append(gpuarray.zeros((row, col), dtype=np.float32))
    self.grads.append(gpuarray.zeros(self.inputShapes[-2], dtype=np.float32))
    print >> sys.stderr,  'append a', layer.type, 'layer', layer.name, 'to network'
    print >> sys.stderr,  'the output of the layer is', outputShape
Code example #26
    def __init__( self, s_dict, synapse_state, dt, debug=False):
        self.debug = debug
        self.dt = dt
        self.num = len( s_dict['id'] )

        self.pre  = garray.to_gpu( np.asarray( s_dict['pre'], dtype=np.int32 ))
        self.ar   = garray.to_gpu( np.asarray( s_dict['ar'], dtype=np.float64 ))
        self.ad   = garray.to_gpu( np.asarray( s_dict['ad'], dtype=np.float64 ))
        self.gmax = garray.to_gpu( np.asarray( s_dict['gmax'], dtype=np.float64 ))
        self.a0   = garray.zeros( (self.num,), dtype=np.float64 )
        self.a1   = garray.zeros( (self.num,), dtype=np.float64 )
        self.a2   = garray.zeros( (self.num,), dtype=np.float64 )
        self.cond = synapse_state

        self.update = self.get_gpu_kernel()
Code example #27
File: random_forest.py Project: pjankiewicz/CudaTree
  def _allocate_arrays(self):
    #allocate gpu arrays and numpy arrays.
    if self.max_features < 4:
      imp_size = 4
    else:
      imp_size = self.max_features
    
    #allocate gpu arrays
    self.impurity_left = gpuarray.empty(imp_size, dtype = np.float32)
    self.impurity_right = gpuarray.empty(self.max_features, dtype = np.float32)
    self.min_split = gpuarray.empty(self.max_features, dtype = self.dtype_counts)
    self.label_total = gpuarray.empty(self.n_labels, self.dtype_indices)  
    self.label_total_2d = gpuarray.zeros(self.max_features * (self.MAX_BLOCK_PER_FEATURE + 1) * self.n_labels, 
        self.dtype_indices)
    self.impurity_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE * 2, np.float32)
    self.min_split_2d = gpuarray.empty(self.max_features * self.MAX_BLOCK_PER_FEATURE, self.dtype_counts)
    self.features_array_gpu = gpuarray.empty(self.n_features, np.uint16)
    self.mark_table = gpuarray.empty(self.stride, np.uint8) 

    #allocate numpy arrays
    self.idx_array = np.zeros(2 * self.n_samples, dtype = np.uint32)
    self.si_idx_array = np.zeros(self.n_samples, dtype = np.uint8)
    self.nid_array = np.zeros(self.n_samples, dtype = np.uint32)
    self.values_idx_array = np.zeros(2 * self.n_samples, dtype = self.dtype_indices)
    self.values_si_idx_array = np.zeros(2 * self.n_samples, dtype = np.uint8)
    self.threshold_value_idx = np.zeros(2, self.dtype_indices)
    self.min_imp_info = driver.pagelocked_zeros(4, dtype = np.float32)  
    self.features_array = driver.pagelocked_zeros(self.n_features, dtype = np.uint16)
    self.features_array[:] = np.arange(self.n_features, dtype = np.uint16)
Code example #28
File: layer.py Project: phecy/striate
 def __init__(self, name, input_shape):
   Layer.__init__(self, name, "softmax")
   self.inputShape = input_shape
   self.inputSize, self.batchSize = input_shape
   self.outputSize = self.inputSize
   self.cost = gpuarray.zeros((self.batchSize, 1), dtype=np.float32)
   self.batchCorrect = 0
Code example #29
File: softmax_layer.py Project: hani1986ye/hebel
    def __init__(self, n_in, n_out,
                 parameters=None,
                 weights_scale=None,
                 l1_penalty_weight=0., l2_penalty_weight=0.,
                 lr_multiplier=None,
                 test_error_fct='class_error'):

        # Initialize weight using Bengio's rule
        self.weights_scale = 4 * sqrt(6. / (n_in + n_out)) \
                             if weights_scale is None \
                                else weights_scale

        if parameters is not None:
            self.W, self.b = parameters
        else:
            self.W = gpuarray.empty((n_in, n_out), dtype=np.float32,
                                    allocator=memory_pool.allocate)
            sampler.fill_uniform(self.W)
            self.W = self.weights_scale * (self.W - .5)

            self.b = gpuarray.zeros((n_out,), dtype=np.float32)

        self.n_in = n_in
        self.n_out = n_out

        self.test_error_fct = test_error_fct

        self.l1_penalty_weight = l1_penalty_weight
        self.l2_penalty_weight = l2_penalty_weight

        self.lr_multiplier = 2 * [1. / np.sqrt(n_in, dtype=np.float32)] \
          if lr_multiplier is None else lr_multiplier
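
The "Bengio rule" used here draws weights uniformly from [-scale/2, scale/2) with scale = 4*sqrt(6/(n_in + n_out)). A numpy sketch of the same initialization, with illustrative layer sizes:

import numpy as np

n_in, n_out = 784, 10
scale = 4 * np.sqrt(6.0 / (n_in + n_out))
W = scale * (np.random.rand(n_in, n_out).astype(np.float32) - 0.5)
b = np.zeros(n_out, dtype=np.float32)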
Code example #30
    def __init__(self, n_in, n_out,
                 parameters=None,
                 weights_scale=None,
                 l1_penalty_weight=0.,
                 l2_penalty_weight=0.,
                 lr_multiplier=None):

        # Initialize weight using Bengio's rule
        self.weights_scale = 4 * sqrt(6. / (n_in + n_out)) \
                             if weights_scale is None \
                                else weights_scale

        if parameters is not None:
            self.W, self.b = parameters
        else:
            self.W = self.weights_scale * \
                     sampler.gen_uniform((n_in, n_out), dtype=np.float32) \
                     - .5 * self.weights_scale

            self.b = gpuarray.zeros((n_out,), dtype=np.float32)

        self.n_in = n_in
        self.n_out = n_out

        self.l1_penalty_weight = l1_penalty_weight
        self.l2_penalty_weight = l2_penalty_weight

        self.lr_multiplier = 2 * [1. / np.sqrt(n_in, dtype=np.float32)] \
          if lr_multiplier is None else lr_multiplier
Code example #31
File: tools.py Project: jamie-roberts/ARTBOX
def arr_pad(x, dims):
    """Basically zeropadding an array to ``dims`` dimensions.
    Implemented as follows:
    Write a smaller array into a bigger one. The bigger array will be created
    according to ``dims``. The place of the smaller matrix will be in the upper
    left corner of the bigger array.

    Args:
        x (gpuarray): Input array.
        dims (tuple): Dimensions of the bigger array.

    Returns:
        gpuarray: Output array of size `dims` with `x` in the upper left
            corner.
    """
    out = gpuarray.zeros(dims, x.dtype)
    arr_pad_func(x, out, np.int32(x.shape[0]), np.int32(dims[0]))
    return out
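
A numpy analogue of the padding described in the docstring (arr_pad_func is assumed to perform the same copy on the GPU):

import numpy as np

def arr_pad_cpu(x, dims):
    out = np.zeros(dims, x.dtype)
    out[:x.shape[0], :x.shape[1]] = x  # smaller array in the upper left corner
    return out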
Code example #32
    def _forward(self, m, v_k):
        """Forward Operator ``E^H E``

        Args:
            m (gpuarray): Input array.
            v_k (gpuarray): Output array.
        """
        tmp = gpuarray.zeros(self._dest_shape,
                             dtype=self._op.precision_complex)

        self._op.apply(m, tmp)
        self._op.adjoint(tmp, v_k)
        # v_k = v_k + bla.*m
        if self._double:
            add_scaled_vector_vector_double(v_k, v_k, self._weights, m)
        else:
            add_scaled_vector_vector(v_k, v_k, self._weights, m)
        tmp.gpudata.free()
Code example #33
    def test_take_put(self):
        for n in [5, 17, 333]:
            one_field_size = 8
            buf_gpu = gpuarray.zeros(n * one_field_size, dtype=np.float32)
            dest_indices = gpuarray.to_gpu(
                np.array([0, 1, 2, 3, 32, 33, 34, 35], dtype=np.uint32))
            read_map = gpuarray.to_gpu(
                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))

            gpuarray.multi_take_put(
                arrays=[buf_gpu for i in range(n)],
                dest_indices=dest_indices,
                src_indices=read_map,
                src_offsets=[i * one_field_size for i in range(n)],
                dest_shape=(96, ),
            )

            drv.Context.synchronize()
Code example #34
File: NMFGPU.py Project: ctralie/CoverSongSynthesis
def TileWDenom(WDenomIn, M):
    """
    :param WDenomIn: A T x 1 x K array that needs to be tiled
    :param M: Dimension of tile axis
    :returns: WDenomOut: A T x M x K tiled array
    """
    blockdim = 32
    T = WDenomIn.shape[0]
    K = WDenomIn.shape[2]
    GridDimT = int(np.ceil(1.0 * T / blockdim))
    GridDimK = int(np.ceil(1.0 * K / blockdim))
    T = np.array(T, dtype=np.int32)
    M = np.array(M, dtype=np.int32)
    K = np.array(K, dtype=np.int32)
    WDenomOut = gpuarray.zeros((T, M, K), np.float32)
    TileWDenom_(WDenomIn, WDenomOut, T, M, K, block=(blockdim, blockdim, 1), \
        grid=(GridDimT, GridDimK))
    return WDenomOut
Code example #35
File: NMFGPU.py Project: ctralie/CoverSongSynthesis
def TileHDenom(HDenomIn, N):
    """
    :param HDenomIn: A F x K x 1 array that needs to be tiled
    :param N: Dimension of tile axis
    :returns: HDenomOut: A F x K x N tiled array
    """
    blockdim = 32
    F = HDenomIn.shape[0]
    K = HDenomIn.shape[1]
    GridDimF = int(np.ceil(1.0 * F / blockdim))
    GridDimK = int(np.ceil(1.0 * K / blockdim))
    F = np.array(F, dtype=np.int32)
    K = np.array(K, dtype=np.int32)
    N = np.array(N, dtype=np.int32)
    HDenomOut = gpuarray.zeros((F, K, N), np.float32)
    TileHDenom_(HDenomIn, HDenomOut, F, K, N, block=(blockdim, blockdim, 1), \
        grid=(GridDimF, GridDimK))
    return HDenomOut
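
Both tiling kernels (examples #34 and #35) have one-line numpy analogues; a sketch with illustrative sizes:

import numpy as np

WDenomIn = np.random.rand(4, 1, 3).astype(np.float32)  # T x 1 x K
WDenomOut = np.tile(WDenomIn, (1, 5, 1))               # T x M x K, M = 5

HDenomIn = np.random.rand(6, 3, 1).astype(np.float32)  # F x K x 1
HDenomOut = np.tile(HDenomIn, (1, 1, 7))               # F x K x N, N = 7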
Code example #36
    def test_cublasDgetrfBatched(self):
        from scipy.linalg import lu_factor
        l, m = 11, 7
        A = np.random.rand(l, m, m).astype(np.float64)
        A = np.array([np.matrix(a) * np.matrix(a).T for a in A])

        a_gpu = gpuarray.to_gpu(A)
        a_arr = bptrs(a_gpu)
        p_gpu = gpuarray.empty((l, m), np.int32)
        i_gpu = gpuarray.zeros(1, np.int32)
        X = np.array([lu_factor(a)[0] for a in A])

        cublas.cublasDgetrfBatched(self.cublas_handle, m, a_arr.gpudata, m,
                                   p_gpu.gpudata, i_gpu.gpudata, l)

        X_ = np.array([a.T for a in a_gpu.get()])

        assert np.allclose(X, X_)
Code example #37
    def preclean(self):

        nx = np.int32(2 * self.imsize)

        # create fft plan nx*nx
        self.plan = fft.Plan((np.int(nx), np.int(nx)), np.complex64,
                             np.complex64)
        d_dirty = gpu.zeros((np.int(self.imsize), np.int(self.imsize)),
                            np.float32)
        gpu_im = self.cuda_gridvis(self.plan, 0, 0)

        dirty = gpu_im.get()

        if self.Debug:
            logger.debug("Plotting dirty image")

        if self.plot_me:
            pathPrefix = self.outdir
            prefix = self.uvfile
            prefix, ext = os.path.splitext(os.path.basename(prefix))
            if pathPrefix is None:
                filename = prefix + '_dirty_%dp.png' % self.chan
                fitsfile = prefix + '_dirty_%dp.fit' % self.chan
            else:
                if pathPrefix[-1:] == '/':
                    pathPrefix = pathPrefix[:-1]
                filename = pathPrefix + '/' + prefix + '_dirty_%dp.png' % self.chan
                fitsfile = pathPrefix + '/' + prefix + '_dirty_%dp.fit' % self.chan

            self.muser_draw.draw_one(filename,
                                     self.title,
                                     self.fov,
                                     dirty,
                                     self.ra - 0.5,
                                     self.ra + 0.5,
                                     self.dec - 0.5,
                                     self.dec + 0.5,
                                     16.1,
                                     axis=False,
                                     axistype=0)

        if self.writefits:
            self.write_fits(dirty, fitsfile, 'DIRTY_IMAGE')
        return filename
Code example #38
    def __init__(self,
                 n_in,
                 parameters=None,
                 weights_scale=None,
                 l1_penalty_weight=0.,
                 l2_penalty_weight=0.,
                 lr_multiplier=None,
                 test_error_fct='class_error'):

        # Initialize weight using Bengio's rule
        self.weights_scale = 4 * sqrt(6. / (n_in + 1)) \
                             if weights_scale is None \
                                else weights_scale

        if parameters is not None:
            self.W, self.b = parameters
        else:
            self.W = self.weights_scale * \
                     sampler.gen_uniform((n_in, 1), dtype=np.float32) \
                     - .5 * self.weights_scale

            self.b = gpuarray.zeros((1, ), dtype=np.float32)

        self.n_in = n_in

        self.test_error_fct = test_error_fct

        self.l1_penalty_weight = l1_penalty_weight
        self.l2_penalty_weight = l2_penalty_weight

        self.lr_multiplier = 2 * [1. / np.sqrt(n_in, dtype=np.float32)] \
          if lr_multiplier is None else lr_multiplier

        self.persistent_temp_objects_config = (
            ('activations', ('batch_size', 1), np.float32),
            ('df_W', self.W.shape, np.float32),
            ('df_b', self.b.shape, np.float32),
            ('df_input', ('batch_size', self.n_in), np.float32),
            ('delta', ('batch_size', 1), np.float32))
Code example #39
    def test_adjoint(self, iters=5):
        """Test the adjoint operator.

        Args:
            iters (int): number of iterations
        """
        src_shape = (self.data.nX1, self.data.nX2, 1)
        dest_shape = (self.data.nT, self.data.nC)
        u = gpuarray.zeros(src_shape, self.precision_complex, order='F')
        ut = gpuarray.zeros(src_shape, self.precision_real, order='F')
        Ku = gpuarray.zeros(dest_shape, self.precision_complex, order='F')
        v = gpuarray.zeros(dest_shape, self.precision_complex, order='F')
        vt = gpuarray.zeros(dest_shape, self.precision_real, order='F')
        Kadv = gpuarray.zeros(src_shape, self.precision_complex, order='F')

        generator = curandom.XORWOWRandomNumberGenerator()
        errors = []

        try:
            i = 0
            for i in range(iters):
                # randomness
                generator.fill_uniform(ut)
                generator.fill_uniform(vt)
                v = gpuarray_copy(vt.astype(self.precision_complex))
                u = gpuarray_copy(ut.astype(self.precision_complex))

                # apply operators
                self.apply(u, Ku)
                self.adjoint(v, Kadv)

                scp1 = dotc_gpu(Ku, v)
                scp2 = dotc_gpu(u, Kadv)
                n_Ku = dotc_gpu(Ku)
                n_Kadv = dotc_gpu(Kadv)
                n_u = dotc_gpu(u)
                n_v = dotc_gpu(v)

                errors.append(np.abs(scp1-scp2))

            print("Test " + str(i) + ": <Ku,v>=" + str(scp1) + ", <u,Kadv>=" +
                  str(scp2) + ", Error=" + str(np.abs(scp1-scp2)) +
                  ", Relative Error=" +
                  str((scp1-scp2)/(n_Ku*n_v + n_Kadv*n_u)))
        except KeyboardInterrupt:
            if len(errors) == 0:
                errors = -1
        finally:
            print("Mean Error: " + repr(np.mean(errors)))
            print("Standarddeviation: " + repr(np.std(errors)))
        return i
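
The test is built around the adjoint identity <Ku, v> = <u, K^H v>. A dense numpy analogue of the same check (K here is an arbitrary random matrix standing in for the operator):

import numpy as np

rng = np.random.default_rng(0)
K = rng.standard_normal((8, 5)) + 1j * rng.standard_normal((8, 5))
u = rng.standard_normal(5) + 1j * rng.standard_normal(5)
v = rng.standard_normal(8) + 1j * rng.standard_normal(8)

scp1 = np.vdot(K @ u, v)           # <Ku, v>
scp2 = np.vdot(u, K.conj().T @ v)  # <u, K^H v>
assert abs(scp1 - scp2) < 1e-10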
Code example #40
File: test_gpuarray.py Project: Toonggg/pycuda
    def test_complex_bits(self):
        from pycuda.curandom import rand as curand

        if has_double_support():
            dtypes = [np.complex64, np.complex128]
        else:
            dtypes = [np.complex64]

        n = 20
        for tp in dtypes:
            dtype = np.dtype(tp)
            from pytools import match_precision

            real_dtype = match_precision(np.dtype(np.float64), dtype)

            z = curand((n,), real_dtype).astype(dtype) + 1j * curand(
                (n,), real_dtype
            ).astype(dtype)

            assert la.norm(z.get().real - z.real.get()) == 0
            assert la.norm(z.get().imag - z.imag.get()) == 0
            assert la.norm(z.get().conj() - z.conj().get()) == 0
            # verify conj with out parameter
            z_out = z.astype(np.complex64)
            assert z_out is z.conj(out=z_out)
            assert la.norm(z.get().conj() - z_out.get()) < 1e-7

            # verify contiguity is preserved
            for order in ["C", "F"]:
                # test both zero and non-zero value code paths
                z_real = gpuarray.zeros(z.shape, dtype=real_dtype, order=order)
                z2 = z.reshape(z.shape, order=order)
                for zdata in [z_real, z2]:
                    if order == "C":
                        assert zdata.flags.c_contiguous
                        assert zdata.real.flags.c_contiguous
                        assert zdata.imag.flags.c_contiguous
                        assert zdata.conj().flags.c_contiguous
                    elif order == "F":
                        assert zdata.flags.f_contiguous
                        assert zdata.real.flags.f_contiguous
                        assert zdata.imag.flags.f_contiguous
                        assert zdata.conj().flags.f_contiguous
Code example #41
    def __init__(self,
                 params_dict,
                 access_buffers,
                 dt,
                 debug=False,
                 LPU_id=None,
                 cuda_verbose=True):
        if cuda_verbose:
            self.compile_options = ['--ptxas-options=-v']
        else:
            self.compile_options = []

        self.num_comps = params_dict['dummy'].size
        self.params_dict = params_dict
        self.access_buffers = access_buffers

        self.debug = debug
        self.LPU_id = LPU_id
        self.dtype = params_dict['dummy'].dtype

        self.dt = np.double(dt)
        self.ddt = np.double(1e-6)
        self.steps = np.int32(max(int(self.dt / self.ddt), 1))

        self.internal_states = {
            c: garray.zeros(self.num_comps, dtype = self.dtype)+self.internals[c] \
            for c in self.internals}

        self.inputs = {
            k: garray.empty(self.num_comps, dtype = self.access_buffers[k].dtype)\
            for k in self.accesses}

        dtypes = {'dt': self.dtype}
        dtypes.update({k: self.inputs[k].dtype for k in self.accesses})
        dtypes.update({k: self.params_dict[k].dtype for k in self.params})
        dtypes.update(
            {k: self.internal_states[k].dtype
             for k in self.internals})
        dtypes.update({
            k: self.dtype if not k == 'spike_state' else np.int32
            for k in self.updates
        })
        self.update_func = self.get_update_func(dtypes)
Code example #42
File: packeted.py Project: BuraaqAlrawi/SearchEngine
    def __call__(self, x, y=None):
        if y is None:
            y = gpuarray.zeros(self.shape[0], dtype=self.dtype,
                    allocator=x.allocator)

        self.get_kernel().prepared_call(
                (self.block_count, 1),
                (self.threads_per_packet, 1, 1),
                self.packet_base_rows.gpudata,
                self.thread_starts.gpudata,
                self.thread_ends.gpudata,
                self.index_array.gpudata,
                self.data_array.gpudata,
                x.gpudata,
                y.gpudata)

        self.remaining_coo_gpu(x, y)

        return y
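Taken together, the packeted kernel and the `remaining_coo_gpu` fall-back compute an ordinary sparse matrix-vector product y = A x; the matrix is merely split into a block-structured part and leftover COO entries. A CPU reference of the end result with scipy.sparse (illustrative sizes):

import numpy as np
from scipy import sparse

A = sparse.random(1000, 1000, density=0.01, format='csr', random_state=0)
x = np.random.default_rng(0).standard_normal(1000)
y = A @ x   # what the two GPU kernels produce together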
Code example #43
File: test_driver.py Project: zhenv5/pycuda
    def test_large_smem(self):
        n = 4000
        mod = SourceModule("""
        #include <stdio.h>

        __global__ void kernel(int *d_data)
        {
        __shared__ int sdata[%d];
        sdata[threadIdx.x] = threadIdx.x;
        d_data[threadIdx.x] = sdata[threadIdx.x];
        }
        """ % n)

        kernel = mod.get_function("kernel")

        import pycuda.gpuarray as gpuarray
        # Smoke test: just check that a kernel declaring a large __shared__
        # array compiles and launches.
        arg = gpuarray.zeros((n,), dtype=np.float32)

        kernel(arg, block=(1, 1, 1))
Code example #44
 def _pre_run(self):
     assert(self.LPU_obj)
     assert(all([var in self.memory_manager.variables
                 for var in self.variables.keys()]))
     for var, d in self.variables.items():
         v_dict =  self.memory_manager.variables[var]
         uids = []
         inds = []
         for uid in d['uids']:
             cd = self.LPU_obj.conn_dict[uid]
             assert(var in cd)
             pre = cd[var]['pre'][0]
             inds.append(v_dict['uids'][pre])
         self.dest_inds[var] = garray.to_gpu(np.array(inds,np.int32))
         self.dtypes[var] = v_dict['buffer'].dtype
         self._d_input[var] = garray.zeros(len(d['uids']),self.dtypes[var])
         self.variables[var]['input'] = np.zeros(len(d['uids']),
                                                 self.dtypes[var])
     self.pre_run()
Code example #45
File: FFT_solver.py Project: kparasch/PyPIC
    def __init__(self, mesh, context=None):
        '''
        Args:
            mesh: The mesh on which the solver will operate. The dimensionality
                 is deduced from mesh.dimension.
        '''
        # create the mesh grid and compute the greens function on it
        self.mesh = mesh
        self._context = context
        mesh_shape2 = [2*n for n in mesh.shape] # 2*nz, 2*ny, (2*nx)
        self.tmpspace = gpuarray.zeros(mesh_shape2, dtype=np.complex128)
        self.fgreentr = gpuarray.empty_like(self.tmpspace)
        sizeof_complex = np.dtype(np.complex128).itemsize

        # dimensionality function dispatch
        dim = mesh.dimension
        self._fgreen = getattr(self, '_fgreen' + str(dim) + 'd')
        self._mirror = getattr(self, '_mirror' + str(dim) + 'd')
        copy_fn = {'3d' : get_Memcpy3D_d2d, '2d': get_Memcpy2D_d2d}
        memcpy_nd = copy_fn[str(dim) + 'd']
        dim_args = mesh.shape
        self._cpyrho2tmp = memcpy_nd(
            src=None, dst=self.tmpspace, # None because src(rho) not yet known
            src_pitch=mesh.nx*sizeof_complex,
            dst_pitch=2*mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=mesh.ny,
            dst_height=2*mesh.ny)
        self._cpytmp2rho = memcpy_nd(
            src=self.tmpspace, dst=None, # None because dst(rho) not yet known
            src_pitch=2*mesh.nx*sizeof_complex,
            dst_pitch=mesh.nx*sizeof_complex,
            dim_args=dim_args,
            itemsize=np.dtype(np.complex128).itemsize,
            src_height=2*mesh.ny,
            dst_height=mesh.ny)
        self.plan_forward = cu_fft.Plan(
            self.tmpspace.shape, in_dtype=np.complex128,
            out_dtype=np.complex128)
        self.plan_backward = self.plan_forward

        self.setup_mesh(mesh)
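The doubled grid (`mesh_shape2`), the rho-to-tmpspace copies, and the forward/backward plans implement the classic zero-padded FFT convolution with a free-space Green's function (Hockney's method). A 2D NumPy sketch of that scheme, with an illustrative logarithmic Green's function:

import numpy as np

ny, nx = 64, 64
rho = np.zeros((ny, nx)); rho[ny // 2, nx // 2] = 1.0   # point charge

# Green's function on the doubled grid; min(k, 2n - k) mirrors the distances.
y = np.arange(2 * ny); y = np.minimum(y, 2 * ny - y)
x = np.arange(2 * nx); x = np.minimum(x, 2 * nx - x)
Y, X = np.meshgrid(y, x, indexing='ij')
r = np.maximum(np.hypot(X, Y), 0.5)   # regularize the r = 0 singularity
fgreen = -np.log(r)                   # 2D free-space Green's function, up to constants

tmp = np.zeros((2 * ny, 2 * nx))
tmp[:ny, :nx] = rho                   # the cpyrho2tmp step
phi = np.fft.ifft2(np.fft.fft2(tmp) * np.fft.fft2(fgreen)).real[:ny, :nx]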
Code example #46
    def __init__( self, params_dict, access_buffers, dt,
                  LPU_id=None, debug=False, cuda_verbose=False):
        if cuda_verbose:
            self.compile_options = ['--ptxas-options=-v']
        else:
            self.compile_options = []

        self.debug = debug
        self.dt = dt
        self.num = params_dict['g_max'].size
        self.LPU_id = LPU_id

        self.params_dict = params_dict
        self.access_buffers = access_buffers
        self.inputs = {}
        self.inputs['V'] = garray.zeros( (self.num), dtype=np.float64 )
        print(self.accesses)

        self.update = self.get_gpu_kernel(params_dict['g_max'].dtype)
Code example #47
def multiply(a, b, out=None, increment=False, stream=None):
    """Element-wise product of `a` and `b`."""

    dtype = a.dtype

    if out is None:
        out = gpuarray.zeros(a.shape, dtype=dtype)

    assert a.size == b.size
    assert a.dtype == b.dtype == out.dtype

    block = (min(a._block[0], a.size), 1, 1)
    grid = (a.size // block[0] + (a.size % block[0] != 0), 1, 1)

    hf.gpu.multiply_kernel[dtype == np.float32].prepared_async_call(
        grid, block, stream, a.gpudata, b.gpudata, out.gpudata,
        np.int32(a.size), np.int32(increment))

    return out
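A usage sketch for the helper above, assuming a CUDA context and the module's `hf.gpu.multiply_kernel` table (both external to this snippet) are in place:

import numpy as np
import pycuda.autoinit  # noqa: F401 -- creates a context
import pycuda.gpuarray as gpuarray

a = gpuarray.to_gpu(np.random.rand(1024).astype(np.float32))
b = gpuarray.to_gpu(np.random.rand(1024).astype(np.float32))
c = multiply(a, b)                     # fresh output array
multiply(a, b, out=c, increment=True)  # c += a * b, in place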
Code example #48
File: cuda.py Project: POETSII/PyOP2
    def _lmadata(self):
        if not hasattr(self, '__lmadata'):
            nentries = 0
            # dense block of rmap.arity x cmap.arity for each rmap/cmap pair
            for rmap, cmap in self.sparsity.maps:
                nentries += rmap.arity * cmap.arity

            entry_size = 0
            # all pairs of maps in the sparsity must have the same
            # iterset, there are sum(iterset.size) * nentries total
            # entries in the LMA data
            for rmap, cmap in self.sparsity.maps:
                entry_size += rmap.iterset.size
            # each entry in the block is size dims[0] x dims[1]
            entry_size *= np.asscalar(np.prod(self.dims))
            nentries *= entry_size
            setattr(self, '__lmadata',
                    gpuarray.zeros(shape=nentries, dtype=self.dtype))
        return getattr(self, '__lmadata')
Code example #49
File: tools.py Project: jamie-roberts/ARTBOX
def slice_coil(inp, outp=None, coil=0):
    """Returns a slice of a 3D-Array (image stack or coil sensitivity) since
    slicing is not implemented in PyCUDA.

    Args:
        inp (gpuarray): Input array.
        outp (gpuarray): Output slice (optional, if not provided, it will be
            created).
        coil (int): Coil index.

    Returns:
        gpuarray: Output array.
    """
    dim = inp.shape[0]
    n_coils = inp.shape[1]
    if outp is None:
        outp = gpuarray.zeros(dim, inp.dtype)
    slice_coil_func(outp, inp, np.int32(coil), np.int32(n_coils))
    return outp
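A usage sketch, assuming the module-level `slice_coil_func` kernel has been compiled and the sensitivities are stored as a (voxels, coils) array:

import numpy as np
import pycuda.gpuarray as gpuarray

sens = gpuarray.to_gpu(np.random.rand(4096, 8).astype(np.complex64))
coil0 = slice_coil(sens, coil=0)   # gpuarray holding coil 0, shape (4096,)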
Code example #50
File: hidden_layer.py Project: freedomljc2/hebel
    def __init__(self, n_in, n_units,
                 activation_function='sigmoid',
                 dropout=False,
                 parameters=None,
                 weights_scale=None,
                 l1_penalty_weight=0.,
                 l2_penalty_weight=0.,
                 lr_multiplier=None):

        self._set_activation_fct(activation_function)

        if weights_scale is None:
            self._set_weights_scale(activation_function, n_in, n_units)
        else:
            self.weights_scale = weights_scale

        if parameters is not None:
            if isinstance(parameters, basestring):
                self.parameters = cPickle.load(open(parameters, 'rb'))
            else:
                self.W, self.b = parameters
        else:
            self.W = self.weights_scale * \
                     sampler.gen_uniform((n_in, n_units),
                                         dtype=np.float32) \
              - .5 * self.weights_scale

            self.b = gpuarray.zeros((n_units,), dtype=np.float32)

        assert self.W.shape == (n_in, n_units)
        assert self.b.shape == (n_units,)

        self.n_in = n_in
        self.n_units = n_units

        self.lr_multiplier = lr_multiplier if lr_multiplier is not None else \
            2 * [1. / np.sqrt(self.n_in, dtype=np.float32)]

        self.l1_penalty_weight = l1_penalty_weight
        self.l2_penalty_weight = l2_penalty_weight

        self.dropout = dropout
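For reference, the initialization above draws W uniformly from [-weights_scale/2, +weights_scale/2); a NumPy equivalent (the concrete scale below is an illustrative Glorot-style choice, not necessarily the library's default):

import numpy as np

n_in, n_units = 784, 100
weights_scale = np.sqrt(6. / (n_in + n_units))     # hypothetical scale
W = weights_scale * (np.random.rand(n_in, n_units).astype(np.float32) - .5)
b = np.zeros(n_units, dtype=np.float32)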
Code example #51
File: test_driver.py Project: zhenv5/pycuda
    def test_3d_fp_textures(self):
        orden = "C"
        npoints = 32

        for prec in [np.int16,np.float32,np.float64,np.complex64,np.complex128]:
            prec_str = dtype_to_ctype(prec)
            if prec == np.complex64: fpName_str = 'fp_tex_cfloat'
            elif prec == np.complex128: fpName_str = 'fp_tex_cdouble'
            elif prec == np.float64: fpName_str = 'fp_tex_double'
            else: fpName_str = prec_str
            A_cpu = np.zeros([npoints,npoints,npoints],order=orden,dtype=prec)
            A_cpu[:] = np.random.rand(npoints,npoints,npoints)[:]
            A_gpu = gpuarray.zeros(A_cpu.shape,dtype=prec,order=orden)

            myKern = '''
            #include <pycuda-helpers.hpp>
            texture<fpName, 3, cudaReadModeElementType> mtx_tex;

            __global__ void copy_texture(cuPres *dest)
            {
              int row   = blockIdx.x*blockDim.x + threadIdx.x;
              int col   = blockIdx.y*blockDim.y + threadIdx.y;
              int slice = blockIdx.z*blockDim.z + threadIdx.z;
              dest[row + col*blockDim.x*gridDim.x + slice*blockDim.x*gridDim.x*blockDim.y*gridDim.y] = fp_tex3D(mtx_tex, slice, col, row);
            }
            '''
            myKern = myKern.replace('fpName',fpName_str)
            myKern = myKern.replace('cuPres',prec_str)
            mod = SourceModule(myKern)

            copy_texture = mod.get_function("copy_texture")
            mtx_tex = mod.get_texref("mtx_tex")
            cuBlock = (8,8,8)
            if cuBlock[0]>npoints:
                cuBlock = (npoints,npoints,npoints)
            cuGrid = (npoints // cuBlock[0] + (npoints % cuBlock[0] != 0),
                      npoints // cuBlock[1] + (npoints % cuBlock[1] != 0),
                      npoints // cuBlock[2] + (npoints % cuBlock[2] != 0))
            copy_texture.prepare('P',texrefs=[mtx_tex])
            cudaArray = drv.np_to_array(A_cpu,orden,allowSurfaceBind=False)
            mtx_tex.set_array(cudaArray)
            copy_texture.prepared_call(cuGrid,cuBlock,A_gpu.gpudata)
            assert np.sum(np.abs(A_gpu.get()-np.transpose(A_cpu))) == np.array(0,dtype=prec)
            A_gpu.gpudata.free()
Code example #52
def crop_stack_GPU(x, sz, offset=(0,0), dtype='real'):
    
    if x.__class__ == np.ndarray:
        x = np.array(x).astype(np.float32)
        x_gpu = cua.to_gpu(x)
    elif x.__class__ == cua.GPUArray:
        x_gpu = x

    sx = x_gpu.shape
    block_size = (16,16,1)   
    grid_size = (int(np.ceil(np.float32(sx[0]*sz[0])/block_size[0])),
                 int(np.ceil(np.float32(sz[1])/block_size[1])))

    sx_before = np.array([sx[1], sx[2]])
    sx_after  = np.array(sz)
    if any(sx_before - sx_after < offset):
        raise IOError('Size mismatch: size before - size after < offset')
    

    if dtype == 'real':

        if x_gpu.dtype != np.float32:
            x_gpu = x_gpu.real

        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_Kernel")

        xc_gpu = cua.zeros(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.float32)

    elif dtype == 'complex':

        mod = cu.module_from_buffer(cubin)
        crop_stack_Kernel = mod.get_function("crop_stack_ComplexKernel")
        xc_gpu = cua.empty(tuple((int(sx[0]), int(sz[0]), int(sz[1]))), np.complex64)

    else:
        raise ValueError("dtype must be 'real' or 'complex'")
        
    crop_stack_Kernel(xc_gpu.gpudata, np.int32(sx[0]),
                                      np.int32(sz[0]),     np.int32(sz[1]),
                       x_gpu.gpudata, np.int32(sx[1]),     np.int32(sx[2]),
                                      np.int32(offset[0]), np.int32(offset[1]),
                                      block=block_size, grid=grid_size)
        
    return xc_gpu
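The kernel is doing an ordinary per-slice crop; a NumPy equivalent for the real case:

import numpy as np

x = np.random.rand(10, 256, 256).astype(np.float32)   # stack of 10 images
sz, offset = (128, 128), (32, 32)
xc = x[:, offset[0]:offset[0] + sz[0], offset[1]:offset[1] + sz[1]]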
Code example #53
def varianza_cov(R_s, G_s, B_s):
    kernel_code = kernel_var_cov % {'BLOCK_SIZE': BLOCK_SIZE}
    mod = compiler.SourceModule(kernel_code)
    covariance_kernel = mod.get_function("CovarianceKernel")
    salida_gpu = gpuarray.zeros((3, 3), np.float32)
    Rs_gpu = gpuarray.to_gpu(R_s)
    Gs_gpu = gpuarray.to_gpu(G_s)
    Bs_gpu = gpuarray.to_gpu(B_s)
    for i in range(len(R_s)):
        covariance_kernel(
            # inputs
            Rs_gpu[i],
            Gs_gpu[i],
            Bs_gpu[i],
            # output
            salida_gpu,
            # block of multiple threads
            block=(32, 32, 1),
        )
    return salida_gpu.get()
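The kernel accumulates a 3x3 channel covariance over all samples. A NumPy cross-check of the intended result (whether the GPU kernel centers the channels, and how it normalizes, is not visible from this snippet; this is illustrative only):

import numpy as np

R_s = np.random.rand(100, 1024).astype(np.float32)
G_s = np.random.rand(100, 1024).astype(np.float32)
B_s = np.random.rand(100, 1024).astype(np.float32)

samples = np.stack([R_s.ravel(), G_s.ravel(), B_s.ravel()])
cov = np.cov(samples, bias=True)   # 3x3 channel covariance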
Code example #54
File: test_gpuarray.py Project: spatel81/pycuda
    def test_ranged_elwise_kernel(self):
        from pycuda.elementwise import ElementwiseKernel
        set_to_seven = ElementwiseKernel("float *z", "z[i] = 7",
                                         "set_to_seven")

        for i, slc in enumerate([
                slice(5, 20000),
                slice(5, 20000, 17),
                slice(3000, 5, -1),
                slice(1000, -1),
        ]):

            a_gpu = gpuarray.zeros((50000, ), dtype=np.float32)
            a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)

            a_cpu[slc] = 7
            set_to_seven(a_gpu, slice=slc)
            drv.Context.synchronize()

            assert la.norm(a_cpu - a_gpu.get()) == 0, i
Code example #55
File: recorder.py Project: TK-21st/neural
    def __init__(self, obj, attrs, steps, **kwargs):

        super(CUDARecorder, self).__init__(obj, attrs, steps, **kwargs)

        gpu_buffer = kwargs.pop('gpu_buffer', False)
        if gpu_buffer:
            self.buffer_length = self._get_buffer_length(gpu_buffer)
            self.gpu_dct = {}
            for key in attrs:
                src = getattr(self.obj, key)
                shape = (self.buffer_length, src.size)
                self.gpu_dct[key] = garray.zeros(shape, dtype=src.dtype)
            self._update = self._copy_memory_dtod
        else:
            self._update = self._copy_memory_dtoh

        if PY2:
            self.get_buffer = self._py2_get_buffer
        if PY3:
            self.get_buffer = self._py3_get_buffer
Code example #56
File: em.py Project: xuf12/aspire_refactored
    def __calc_A_shift_gpu(self, shift_x, shift_y):

        # TODO: need to assert that this indeed returns a gpuarray
        psis_gpu = self.converter.get_prolates_as_images()
        n_psis = len(psis_gpu)

        if shift_x == 0 and shift_y == 0:
            return np.eye(n_psis)

        A_shift = gpuarray.zeros((n_psis, n_psis), 'complex64')
        non_neg_freqs = self.converter.get_non_neg_freq_inds()

        psis_gpu_non_neg_freqs = psis_gpu[non_neg_freqs]
        psis_non_neg_shifted = circ_shift_kernel.circ_shift(
            psis_gpu_non_neg_freqs, shift_x, shift_y)

        psis_non_neg_shifted = self.converter.mask_points_inside_the_circle(
            psis_non_neg_shifted)

        psis_non_neg_shifted = psis_non_neg_shifted.reshape(
            len(psis_non_neg_shifted), -1)
        psis_gpu = psis_gpu.reshape(n_psis, -1)
        A_shift[non_neg_freqs] = linalg.dot(psis_non_neg_shifted,
                                            psis_gpu,
                                            transb='C')

        zero_freq_inds = self.converter.get_zero_freq_inds()
        pos_freq_inds = self.converter.get_pos_freq_inds()
        neg_freq_inds = self.converter.get_neg_freq_inds()

        A_shift[neg_freq_inds, zero_freq_inds] = A_shift[pos_freq_inds,
                                                         zero_freq_inds]
        A_shift[neg_freq_inds, pos_freq_inds] = A_shift[pos_freq_inds,
                                                        neg_freq_inds]
        A_shift[neg_freq_inds, neg_freq_inds] = A_shift[pos_freq_inds,
                                                        pos_freq_inds]

        A_shift[neg_freq_inds] = linalg.conj(A_shift[neg_freq_inds])
        # TODO: get rid of the transpose
        # return np.transpose(A_shift).copy()
        return np.transpose(A_shift).get().copy()
Code example #57
File: cudaPatches.py Project: fsantini/DeepFat
def getPatches(complexDataset, patchSize = 5, echoes = 6, patchSpacing = 1):
    
    PATCHSIZE=patchSize
    ECHOES=echoes
    
    extract_patches = getCudaFunction(patchSize, echoes, patchSpacing)
    
    blockSize = complexDataset.shape[1]
    gridSize = complexDataset.shape[0]
    
    realDataset = complexDataset.real.astype(np.float32).flatten()
    imDataset = complexDataset.imag.astype(np.float32).flatten()
    
#    free, total = drv.mem_get_info()
#    print '%.1f %% of device memory is free before alloc.' % ((free/float(total))*100)
    
    #patchArray = np.zeros([blockSize*gridSize*2*(2*PATCHSIZE+1)*(2*PATCHSIZE+1)*ECHOES], dtype=np.float32)
    
    real_gpu = ga.to_gpu(realDataset)
    im_gpu = ga.to_gpu(imDataset)
    
    out_gpu = ga.zeros([blockSize*gridSize*2*(2*PATCHSIZE+1)*(2*PATCHSIZE+1)*ECHOES], np.float32)
    
    #extract_patches(drv.Out(patchArray), drv.In(realDataset), drv.In(imDataset), block=(blockSize,1,1), grid=(gridSize,1))
    extract_patches(out_gpu, real_gpu, im_gpu, block=(blockSize,1,1), grid=(gridSize,1))
    
#    free, total = drv.mem_get_info()
#    print '%.1f %% of device memory is free after processing.' % ((free/float(total))*100)
    
    patchArray = out_gpu.get()
    
    real_gpu.gpudata.free()
    im_gpu.gpudata.free()
    out_gpu.gpudata.free()
    
#    free, total = drv.mem_get_info()
#    print '%.1f %% of device memory is free after dealloc.' % ((free/float(total))*100)
    
    patchArray = patchArray.reshape([blockSize*gridSize, (2*PATCHSIZE+1), (2*PATCHSIZE+1), ECHOES, 2])
    
    return patchArray
Code example #58
File: dummysynapse.py Project: neurokernel/lamina
    def __init__( self, s_dict, synapse_state, dt, debug=False,
                 cuda_verbose = False):
        
        if cuda_verbose:
            self.compile_options = ['--ptxas-options=-v']
        else:
            self.compile_options = []
        
        self.debug = debug
        #self.dt = dt
        self.num = len( s_dict['id'] )

        if 'delay' in s_dict:
            self.delay = garray.to_gpu(np.round(np.asarray( s_dict['delay'])*1e-3/dt ).astype(np.int32) )
        else:
            self.delay = garray.zeros( self.num, dtype=np.int32 )

        self.pre   = garray.to_gpu( np.asarray( s_dict['pre'], dtype=np.int32 ))
        self.state = synapse_state

        self.update = self.get_gpu_kernel()
Code example #59
File: layer.py Project: colinshane/stdp-conv-speech
    def __init__(self, layer_pre, map_num, threshold, a_plus, a_minus,
                 learning_rounds):
        super().__init__(layer_pre, (layer_pre.width, layer_pre.height), 1,
                         map_num, threshold)
        self.a_plus = np.float32(a_plus)
        self.a_minus = np.float32(a_minus)
        self.learning_rounds = learning_rounds

        self.plastic = gpuarray.zeros(shape=(1, ), dtype=np.bool)
        self.weights = gpuarray.to_gpu(
            np.random.normal(
                0.8, 0.01,
                (self.layer_size * self.layer_pre.layer_size, )).astype(
                    np.float32))
        self.g = gpuarray.to_gpu(
            np.arange(self.layer_size * self.layer_pre.layer_size).reshape(
                (self.layer_size,
                 self.layer_pre.layer_size)).transpose().astype(np.int32))
        self.label = gpuarray.empty(shape=(1, ), dtype=np.int32)

        self.reset()
Code example #60
    def norm_est(self, u, iters=10):
        """Estimates norm of the operator with a power iteration.

        Args:
            u (gpuarray): input array
            iters (int): number of iterations
        """
        if self._verbose:
            print("Estimating Norm...")

        u_temp = gpuarray_copy(u)
        result = gpuarray.zeros([self.data.nC, self.data.nT],
                                self.precision_complex, order='F')

        for _ in range(0, iters):
            dot_tmp = dotc_gpu(u_temp)
            u_temp /= np.sqrt(np.abs(dot_tmp))
            self.apply(u_temp, result)
            self.adjoint(result, u_temp)
            normsqr = dotc_gpu(u_temp)
        return np.sqrt(np.abs(normsqr)/self._norm_div)
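The loop is a power iteration on K*K: repeatedly applying the operator pair and normalizing converges to the largest eigenvalue of K*K, i.e. the squared operator norm ||K||^2 (`self._norm_div` is an application-specific scaling). The same estimate on a plain matrix, for reference:

import numpy as np

def norm_est(A, iters=10):
    """Estimate ||A||_2 by power iteration on A^H A."""
    u = np.random.default_rng(0).standard_normal(A.shape[1])
    for _ in range(iters):
        u /= np.linalg.norm(u)
        u = A.conj().T @ (A @ u)
    # After the loop, ||u|| ~ ||A||^2, so take a square root.
    return np.sqrt(np.linalg.norm(u))

A = np.random.default_rng(1).standard_normal((50, 30))
print(norm_est(A), np.linalg.svd(A, compute_uv=False)[0])  # should agree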