Example 1
    def compile_(self):
        self._generate_h()
        self._generate_cu()

        # TODO nvcc path
        if CUDAGenerator.compile_(
                src='{}/code_gen/cudamemop.cu'.format(pkg_dir),
                output='{}/so_gen/cudamemop.so'.format(pkg_dir)):
            self._so = cdll.LoadLibrary(
                '{}/so_gen/cudamemop.so'.format(pkg_dir))
            for i in self.types:
                getattr(self._so, 'gpu_malloc_{}'.format(i)).restype = POINTER(
                    getattr(ctypes, 'c_%s' % i))
                getattr(self._so, 'to_gpu_{}'.format(i)).restype = POINTER(
                    getattr(ctypes, 'c_%s' % i))
                getattr(self._so, 'from_gpu_{}'.format(i)).restype = POINTER(
                    getattr(ctypes, 'c_%s' % i))
        else:
            self._so = None
            raise EnvironmentError('Compile file cudamemop.so failed')
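
Once compile_() succeeds, the loaded cudamemop.so exposes one allocation/transfer helper per element type (the helpers themselves are generated in Example 6). A minimal usage sketch, assuming 'float' is among self.types and gen is an instance of this generator class:

    import ctypes

    # Round-trip a small host array through device memory via the generated helpers.
    host = (ctypes.c_float * 4)(1.0, 2.0, 3.0, 4.0)
    dev_ptr = gen._so.to_gpu_float(host, 4)    # device pointer (restype is POINTER(c_float))
    back = gen._so.from_gpu_float(dev_ptr, 4)  # copy the 4 floats back from the device
    print([back[i] for i in range(4)])
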
Example 2
    def compile_(self):
        if self._cls_so is None:
            self._generate_h()
            self._generate_data_cu()

            if CUDAGenerator.compile_(
                    src='{}/code_gen/connection.data.cu'.format(pkg_dir),
                    output='{}/so_gen/connection.data.so'.format(pkg_dir)):
                self._cls_so = cdll.LoadLibrary(
                    '{}/so_gen/connection.data.so'.format(pkg_dir))
                self._cls_so.to_gpu_connection.restype = POINTER(
                    self._cls_ctype)
                self._cls_so.from_gpu_connection.restype = POINTER(
                    self._cls_ctype)
            else:
                self._cls_so = None
                raise EnvironmentError(
                    'Compile file connection.data.so failed')

        self._so = self._cls_so
Example 3
    def compile_(self):
        self._generate_runtime()

        src = '{}/c_code/runtime.cu {}/code_gen/runtime.cu'.format(
            pkg_dir, pkg_dir)

        for model in self.neuron_models:
            src += ' {}/code_gen/{}.compute.cu '.format(
                pkg_dir, model.name.lower())

        for model in self.synapse_models:
            src += ' {}/code_gen/{}.compute.cu '.format(
                pkg_dir, model.name.lower())

        if CUDAGenerator.compile_(
                src=src, output='{}/so_gen/runtime.so'.format(pkg_dir)):
            self._so = cdll.LoadLibrary('{}/so_gen/runtime.so'.format(pkg_dir))
        else:
            self._so = None
            raise EnvironmentError('Compile file runtime.so failed')

        return
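
CUDAGenerator.compile_ itself is not shown in these examples; the '# TODO nvcc path' note in Example 1 suggests it shells out to nvcc. A plausible sketch under that assumption, accepting the space-separated list of .cu sources built above and returning True on success:

    import subprocess

    class CUDAGenerator:  # only the compile_ sketch is shown here; other methods omitted
        @staticmethod
        def compile_(src, output):
            # Hypothetical: build a position-independent shared object from the sources.
            cmd = 'nvcc -Xcompiler -fPIC -shared {} -o {}'.format(src, output)
            return subprocess.call(cmd, shell=True) == 0
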
Example 4
    def compile_(self):
        self._generate_h()
        self._generate_data_cu()
        self.model.generate_compute_cu()

        if CUDAGenerator.compile_(
                # TODO: compute.cu
                src='{}/code_gen/{}.data.cu'.format(pkg_dir,
                                                    self.model.name.lower()),
                # src='{}/code_gen/{}.data.cu {}/code_gen/{}.compute.cu'
                #        .format(pkg_dir, self.model.name.lower(), pkg_dir, self.model.name.lower()),
                output='{}/so_gen/{}.so'.format(pkg_dir,
                                                self.model.name.lower())):
            self._so = cdll.LoadLibrary('{}/so_gen/{}.so'.format(
                pkg_dir, self.model.name.lower()))
            getattr(self._so, "to_gpu_{}".format(
                self.model.name.lower())).restype = POINTER(self.c_type)
            getattr(self._so, "from_gpu_{}".format(
                self.model.name.lower())).restype = POINTER(self.c_type)
        else:
            self._so = None
            raise EnvironmentError(
                'Compile file {}/so_gen/{}.so failed'.format(
                    pkg_dir, self.model.name.lower()))
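
With the per-model library loaded, the same restype pattern lets Python push a populated data struct to the GPU and read it back. A hedged usage sketch, where 'izhikevich' stands in for a hypothetical model name, gen is an instance of this class, and n is the element count:

    import ctypes

    cpu_data = gen.c_type()  # fill the p_* array fields before transferring
    gpu_ptr = gen._so.to_gpu_izhikevich(ctypes.byref(cpu_data), n)
    cpu_copy = gen._so.from_gpu_izhikevich(gpu_ptr, n)
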
Example 5
    def _generate_data_cu(self):
        cu_gen = CUDAGenerator(
            '{}/code_gen/connection.data.cu'.format(pkg_dir))

        cu_gen.blank_line(2)
        if self.debug:
            cu_gen.include_std('stdio.h')

        cu_gen.include_std('stdlib.h')
        cu_gen.blank_line()
        cu_gen.include('../c_code/helper_cuda.h')
        cu_gen.include('connection.h')
        cu_gen.blank_line(2)

        cu_gen.line_no_end("CConnection * to_gpu_connection(CConnection *cpu)",
                           0)
        cu_gen.open_brace()
        cu_gen.line(
            'CConnection * gpu = (CConnection*)malloc(sizeof(CConnection))')
        cu_gen.line('gpu->n_len = cpu->n_len')
        cu_gen.line('gpu->r_n_len = cpu->r_n_len')
        cu_gen.line('gpu->s_len = cpu->s_len')

        cu_gen.to_gpu(ret='gpu->delay_start',
                      cpu='cpu->delay_start',
                      type_='int',
                      num='cpu->n_len')
        cu_gen.to_gpu(ret='gpu->delay_num',
                      cpu='cpu->delay_num',
                      type_='int',
                      num='cpu->n_len')
        cu_gen.to_gpu(ret='gpu->rev_delay_start',
                      cpu='cpu->rev_delay_start',
                      type_='int',
                      num='cpu->r_n_len')
        cu_gen.to_gpu(ret='gpu->rev_delay_num',
                      cpu='cpu->rev_delay_num',
                      type_='int',
                      num='cpu->r_n_len')
        cu_gen.to_gpu(ret='gpu->rev_map2sid',
                      cpu='cpu->rev_map2sid',
                      type_='int',
                      num='cpu->s_len')

        cu_gen.line('CConnection * ret = NULL')
        cu_gen.to_gpu(ret='ret', cpu='gpu', type_='CConnection')

        if self.debug:
            cu_gen.line(line=r'printf("GPU CConnection Pointer: %p\n", ret)')
            cu_gen.line(
                line=
                r'printf("GPU n_len: %d r_n_len: %d s_len: %d\n", gpu->n_len, gpu->r_n_len, gpu->s_len)'
            )

        cu_gen.line('return ret')
        cu_gen.close_brace()
        cu_gen.blank_line()

        cu_gen.line_no_end(
            "CConnection * from_gpu_connection(CConnection *gpu)", 0)
        cu_gen.open_brace()
        cu_gen.from_gpu(gpu='gpu', ret='ret', type_='CConnection')

        if self.debug:
            cu_gen.line(line=r'printf("CPU CConnection Pointer: %p\n", ret)')
            cu_gen.line(
                line=
                r'printf("CPU n_len: %d r_n_len: %d s_len: %d\n", ret->n_len, ret->r_n_len, ret->s_len)'
            )

        cu_gen.line('return ret')
        cu_gen.close_brace()
        cu_gen.blank_line()

        cu_gen.close()
        return
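
The generator above touches a fixed set of CConnection members; the authoritative struct comes from _generate_h (not shown here), but a ctypes mirror of those members would look roughly like the sketch below (field order is an assumption):

    import ctypes

    class CConnection(ctypes.Structure):
        _fields_ = [('n_len', ctypes.c_int),
                    ('r_n_len', ctypes.c_int),
                    ('s_len', ctypes.c_int),
                    ('delay_start', ctypes.POINTER(ctypes.c_int)),
                    ('delay_num', ctypes.POINTER(ctypes.c_int)),
                    ('rev_delay_start', ctypes.POINTER(ctypes.c_int)),
                    ('rev_delay_num', ctypes.POINTER(ctypes.c_int)),
                    ('rev_map2sid', ctypes.POINTER(ctypes.c_int))]
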
Example 6
    def _generate_cu(self):
        cu_gen = CUDAGenerator('{}/code_gen/cudamemop.cu'.format(pkg_dir))

        cu_gen.blank_line(2)
        cu_gen.include("../c_code/helper_cuda.h")
        cu_gen.include("cudamemop.h")
        cu_gen.blank_line(2)

        for i in self.types:
            cu_gen.line_no_end("{}* gpu_malloc_{}(int size)".format(i, i), 0)
            cu_gen.open_brace(0)
            cu_gen.line('{} * gpu = NULL'.format(i))
            cu_gen.malloc_gpu(ret='gpu', type_=str(i), num='size')
            cu_gen.line(line='return gpu')
            cu_gen.close_brace(0)
            cu_gen.blank_line()

            cu_gen.line_no_end(
                "void cpu2gpu_{}({} *cpu, {} *gpu, int size)".format(i, i, i),
                0)
            cu_gen.open_brace(0)
            cu_gen.cpu_to_gpu(gpu='gpu', cpu='cpu', type_=str(i), num='size')
            cu_gen.close_brace(0)
            cu_gen.blank_line()

            cu_gen.line_no_end(
                "void gpu2cpu_{}({} *gpu, {} *cpu, int size)".format(i, i, i),
                0)
            cu_gen.open_brace(0)
            cu_gen.gpu_to_cpu(gpu='gpu', cpu='cpu', type_=str(i), num='size')
            cu_gen.close_brace(0)
            cu_gen.blank_line()

            cu_gen.line_no_end(
                "{}* to_gpu_{}({} *cpu, int size)".format(i, i, i), 0)
            cu_gen.open_brace(0)
            cu_gen.line(line='{} * gpu = NULL'.format(i))
            cu_gen.to_gpu(ret='gpu', cpu='cpu', type_=str(i), num='size')
            cu_gen.line(line='return gpu', tab=1)
            cu_gen.close_brace(0)
            cu_gen.blank_line()

            cu_gen.line_no_end(
                "{}* from_gpu_{}({} *gpu, int size)".format(i, i, i), 0)
            cu_gen.open_brace(0)
            cu_gen.from_gpu(gpu='gpu', ret='cpu', type_=str(i), num='size')
            cu_gen.line(line='return cpu', tab=1)
            cu_gen.close_brace(0)
            cu_gen.blank_line()

        cu_gen.blank_line()
        cu_gen.close()

        return
Example 7
    def _generate_runtime(self):
        h_gen = CGenerator('{}/code_gen/runtime.h'.format(pkg_dir))
        h_gen.if_define('runtime.h')
        h_gen.blank_line(2)
        h_gen.include("connection.h")
        h_gen.blank_line(2)

        h_gen.block('const int MAX_DELAY = {};'.format(self.max_delay))
        h_gen.block('const int MIN_DELAY = {};'.format(self.min_delay))
        h_gen.block('const {} G_MAX = {};'.format(real, self.g_max))
        h_gen.block('const {} G_MIN = {};'.format(real, self.g_min))

        for i in range(len(self.neuron_models)):
            block_size = 32
            h_gen.block("const int {}_BLOCK_SIZE = {};".format(
                self.neuron_models[i].name.upper(), int(block_size)))
            h_gen.block("const int {}_GRID_SIZE = {};".format(
                self.neuron_models[i].name.upper(),
                math.ceil(self.neuron_nums[i + 1] / block_size)))

        for i in range(len(self.synapse_models)):
            block_size = 128
            h_gen.block("const int {}_BLOCK_SIZE = {};".format(
                self.synapse_models[i].name.upper(), int(block_size)))
            h_gen.block("const int {}_GRID_SIZE = {};".format(
                self.synapse_models[i].name.upper(),
                math.ceil(self.synapse_nums[i + 1] / block_size)))
        h_gen.blank_line()

        h_gen.block("const int MAX_BLOCK_SIZE = {};".format(
            self.max_block_size))
        h_gen.block("const int FIRED_TABLE_SIZE = {};".format(self.neuron_num))

        h_gen.block('extern __device__ int * g_fired_table;')
        h_gen.block('extern __device__ int * g_fired_table_sizes;')

        for model in self.neuron_models:
            h_gen.block('extern __device__ int * g_active_{}_table;'.format(
                model.name.lower()))
            h_gen.block('extern __device__ int g_active_{}_table_size;'.format(
                model.name.lower()))

        for model in self.synapse_models:
            h_gen.block(
                'extern __device__ CConnection * g_connection_{};'.format(
                    model.name.lower()))

        external = set()
        for model in self.neuron_models:
            external |= set(model.parameters['external'])
        for model in self.synapse_models:
            external |= set(model.parameters['external'])
        external -= set('t')

        for i in external:
            h_gen.block('extern __device__ {} * {};'.format(real, i))
        h_gen.blank_line()

        h_gen.block('extern "C" {')
        h_gen.block('\tvoid **init_runtime(CConnection **connections);')
        h_gen.block('}')
        h_gen.blank_line()

        h_gen.block(
            '__device__ int commit2globalTable(int *shared_buf, volatile unsigned int size, '
            'int *global_buf, int * global_size, int offset);')

        h_gen.end_if_define('runtime.h')
        h_gen.close()

        cu_gen = CUDAGenerator('{}/code_gen/runtime.cu'.format(pkg_dir))

        cu_gen.include_std("stdio.h")

        cu_gen.include('../c_code/helper_cuda.h')
        cu_gen.include("runtime.h")
        cu_gen.blank_line(2)

        cu_gen.block('__device__ int * g_fired_table;')
        cu_gen.block('__device__ int * g_fired_table_sizes;')

        for model in self.neuron_models:
            cu_gen.block('__device__ int * g_active_{}_table;'.format(
                model.name.lower()))
            cu_gen.block('__device__ int g_active_{}_table_size;'.format(
                model.name.lower()))

        for model in self.synapse_models:
            cu_gen.block('__device__ CConnection * g_connection_{};'.format(
                model.name.lower()))

        external = set()
        for model in self.neuron_models:
            external |= set(model.parameters['external'])
        for model in self.synapse_models:
            external |= set(model.parameters['external'])
        external -= set('t')

        for i in external:
            cu_gen.block('__device__ {} * {};'.format(real, i))
        cu_gen.blank_line(2)

        cu_gen.block('void **init_runtime(CConnection ** connections)')
        cu_gen.block('{')
        cu_gen.block('\tint zero = 0;')
        cu_gen.block('\tint *p_int = NULL;')
        cu_gen.block('\t{} *p_{} = NULL;'.format(real, real))
        cu_gen.blank_line()

        cu_gen.block(
            '\tvoid **ret = static_cast<void**>(malloc(sizeof(void*) * {}));'.
            format(2))
        cu_gen.blank_line()

        cu_gen.malloc_symbol(symbol='g_fired_table_sizes',
                             gpu='p_int',
                             type_='int',
                             num='{}'.format(self.max_delay + 1))
        cu_gen.block('\tret[0] = static_cast<void*>(p_int);')
        cu_gen.malloc_symbol(symbol='g_fired_table',
                             gpu='p_int',
                             type_='int',
                             num='{}'.format(self.neuron_num *
                                             (self.max_delay + 1)))
        cu_gen.block('\tret[1] = static_cast<void*>(p_int);')
        # cu_gen.block('\tprintf("\\n%p, %p, %p\\n", ret, ret[0], ret[1]);')
        cu_gen.blank_line()

        for model in self.neuron_models:
            cu_gen.cu_line(
                'cudaMemcpyToSymbol(g_active_{}_table_size, &zero, sizeof(int))'
                .format(model.name.lower()))
            cu_gen.malloc_symbol(symbol='g_active_{}_table'.format(
                model.name.lower()),
                                 gpu='p_int',
                                 type_='int',
                                 num='{}'.format(self.neuron_num))
        cu_gen.block('\n')

        for i in external:
            cu_gen.malloc_symbol(symbol='{}'.format(i),
                                 gpu='p_{}'.format(real),
                                 type_=real,
                                 num='{}'.format(self.neuron_num))

        for i, model in enumerate(self.synapse_models):
            cu_gen.cu_line(
                'cudaMemcpyToSymbol(g_connection_{}, &(connections[{}]), sizeof(CConnection*))'
                .format(model.name.lower(), i))

        cu_gen.block('\treturn ret;')
        cu_gen.block('}')

        cu_gen.close()
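
The generated init_runtime returns a void** whose first two slots hold the device pointers created by malloc_symbol for g_fired_table_sizes and g_fired_table. A hedged sketch of invoking it through ctypes once runtime.so is loaded; runtime_so, gpu_connections (device CConnection pointers as produced in Example 5) and the CConnection mirror are assumptions:

    import ctypes

    runtime_so.init_runtime.restype = ctypes.POINTER(ctypes.c_void_p)
    conns = (ctypes.POINTER(CConnection) * len(gpu_connections))(*gpu_connections)
    ret = runtime_so.init_runtime(conns)
    fired_table_sizes_dev = ret[0]  # int buffer of length MAX_DELAY + 1
    fired_table_dev = ret[1]        # int buffer of length FIRED_TABLE_SIZE * (MAX_DELAY + 1)
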
Example 8
    def generate_data_cu(self, debug=False):
        cu_gen = CUDAGenerator('{}/code_gen/{}.data.cu'.format(
            pkg_dir, self.name.lower()))

        cu_gen.blank_line(2)
        cu_gen.include_std('stdlib.h')
        cu_gen.blank_line()
        cu_gen.include("../c_code/helper_cuda.h")
        cu_gen.include("{}.h".format(self.name.lower()))
        cu_gen.blank_line(2)

        cu_gen.line_no_end(
            "{} * to_gpu_{}({} *cpu, int num)".format(self.name.capitalize(),
                                                      self.name.lower(),
                                                      self.name.capitalize()),
            0)
        cu_gen.open_brace()
        cu_gen.malloc(ret='gpu', type_=self.name.capitalize())

        for i in self.parameters['special']:
            if i != 'delay':
                cu_gen.to_gpu(ret='gpu->p_{}'.format(i),
                              cpu='cpu->p_{}'.format(i),
                              num='num',
                              type_='int')

        for i in self.parameters['constant']:
            cu_gen.to_gpu(ret='gpu->p_{}'.format(i),
                          cpu='cpu->p_{}'.format(i),
                          num='num',
                          type_=real)

        for i in self.parameters['variable']:
            cu_gen.to_gpu(ret='gpu->p_{}'.format(i),
                          cpu='cpu->p_{}'.format(i),
                          num='num',
                          type_=real)

        cu_gen.line('{} * ret = NULL'.format(self.name.capitalize()))
        cu_gen.to_gpu(ret='ret',
                      cpu='gpu',
                      num='1',
                      type_=self.name.capitalize())

        cu_gen.line('return ret')
        cu_gen.close_brace()
        cu_gen.blank_line()

        cu_gen.line_no_end(
            "{} * from_gpu_{}({} *gpu, int num)".format(
                self.name.capitalize(), self.name.lower(),
                self.name.capitalize()), 0)
        cu_gen.open_brace()
        cu_gen.from_gpu(ret='cpu', gpu='gpu', type_=self.name.capitalize())

        cu_gen.line('return cpu')
        cu_gen.close_brace()
        cu_gen.blank_line()

        cu_gen.close()
        return
Example 9
    def generate_compute_cu(self, debug=False):
        cu_gen = CUDAGenerator('{}/code_gen/{}.compute.cu'.format(
            pkg_dir, self.name.lower()))

        cu_gen.include("runtime.h")
        cu_gen.include("{}.h".format(self.name.lower()))
        cu_gen.blank_line(2)

        # cu_gen.line("__global__ void find_{}_gpu({} *data, int num, int start_id);"
        #            .format(self.name.lower(), self.name.capitalize()), 0)
        # cu_gen.line("__global__ void update_{}_gpu({} *data, int num, int start_id, int t);"
        #            .format(self.name.lower(), self.name.capitalize()), 0)
        # cu_gen.blank_line(1)

        cu_gen.block("__device__ {} _clip({} a, {} min, {} max)".format(real, real, real, real))
        cu_gen.block("{")
        cu_gen.block("\tif (a < min) {")
        cu_gen.block("\t\treturn min;")
        cu_gen.block("\t} else if (a > max) {")
        cu_gen.block("\t\treturn max;")
        cu_gen.block("\t} else {")
        cu_gen.block("\t\treturn a;")
        cu_gen.block("\t}")
        cu_gen.block("}")

        cu_gen.block("__global__ void update_{}_gpu({} *data, int num, int start_id, int t)"
                     .format(self.name.lower(), self.name.capitalize()))
        cu_gen.block("{")
        cu_gen.block("\tfor (int delta_t={}; delta_t<={}; delta_t++) {{".format("MIN_DELAY", "MAX_DELAY"))
        cu_gen.block("\t\tint block_idx = blockIdx.x;")
        cu_gen.block("\t\tint delay_idx = (t + {} + 1 - delta_t) % ( {} + 1);".format("MAX_DELAY", "MAX_DELAY"))
        # cu_gen.block("\t\tint delay_idx = (t + {} - delta_t) % ( {} + 1);".format("MAX_DELAY", "MAX_DELAY"))
        cu_gen.block("\t\tint fired_size = g_fired_table_sizes[delay_idx];")
        cu_gen.block("\t\tint num_per_block = (fired_size - 1) / gridDim.x + 1;")
        cu_gen.block("\t\tint block_nums_minus_1 = (fired_size - 1) / num_per_block;")
        cu_gen.block("\t\tint fired_size_block = 0;")
        cu_gen.block("\t\tif (block_idx == block_nums_minus_1) {")
        cu_gen.block("\t\tfired_size_block = fired_size - block_idx * num_per_block;")
        cu_gen.block("\t\t} else if (block_idx < block_nums_minus_1) {")
        cu_gen.block("\t\tfired_size_block = num_per_block;")
        cu_gen.block("\t\t} else {")
        cu_gen.block("\t\tfired_size_block = 0;")
        cu_gen.block("\t\t}")
        cu_gen.block("\t\tfor (int idx = 0; idx < fired_size_block; idx++) {")
        cu_gen.block("\t\t\tint nid = g_fired_table[delay_idx * FIRED_TABLE_SIZE + (block_idx) * num_per_block + idx];")
        cu_gen.block("\t\t\tint start_loc = g_connection_{}->delay_start[delta_t - MIN_DELAY + nid * (MAX_DELAY - MIN_DELAY + 1)];"
                     .format(self.name.lower()))
        cu_gen.block("\t\t\tint synapse_num = g_connection_{}->delay_num[delta_t - MIN_DELAY + nid * (MAX_DELAY - MIN_DELAY + 1)];"
                     .format(self.name.lower()))
        # cu_gen.block("\t\t\tif (threadIdx.x == 0) {")
        # cu_gen.block("\t\t\t\tgLayerInput[nid]++;")
        # cu_gen.block("\t\t\t}")
        cu_gen.block("\t\t\tfor (int j=threadIdx.x; j < synapse_num; j += blockDim.x) {")
        cu_gen.block("\t\t\t\tint sid = j+start_loc;")
        cu_gen.block("\t\t\t\t{} weight = data->p_weight[sid];".format(real))
        cu_gen.block("\t\t\t\tif (weight >= 0) {")
        cu_gen.block("\t\t\t\t\tatomicAdd(&(g_i_exec[data->p_dst[sid]]), weight);")
        cu_gen.block("\t\t\t\t} else {")
        cu_gen.block("\t\t\t\t\tatomicAdd(&(g_i_inh[data->p_dst[sid]]), weight);")
        cu_gen.block("\t\t\t\t}")
        cu_gen.blank_line()

        if self.pre_learn:
            cu_gen.block("\t\t\t\tdata->p_apre[sid] *= exp((data->p_last_update[sid] - t) / (data->p_tau_pre[sid]));")
            cu_gen.block("\t\t\t\tdata->p_apost[sid] *= exp((data->p_last_update[sid] - t) / (data->p_tau_post[sid]));")
            cu_gen.block("")
            cu_gen.block("\t\t\t\tdata->p_apre[sid] += data->p_d_apre[sid];")
            cu_gen.block("\t\t\t\tdata->p_weight[sid] = _clip(weight + data->p_apost[sid], {}, {});"
                         .format("G_MIN", "G_MAX"))
            cu_gen.block("\t\t\t\tdata->p_last_update[sid] = t;")

        cu_gen.block("\t\t\t}")
        cu_gen.block("\t\t}")
        cu_gen.block("\t\t__syncthreads();")
        cu_gen.block("\t}")
        cu_gen.block("}")
        cu_gen.blank_line()

        if self.post_learn:
            # Nothing to do with delay
            cu_gen.block("__global__ void learn_{}_post({} * data, int num, int start_id, int t)"
                         .format(self.name.lower(), self.name.capitalize()))
            cu_gen.block("{")
            cu_gen.block("\tint block_idx = blockIdx.x;")
            cu_gen.block("\tint delay_idx = t%(MAX_DELAY+1);")
            cu_gen.block("\tint fired_size = g_fired_table_sizes[delay_idx];")
            cu_gen.block("\tint num_per_block = (fired_size - 1) / gridDim.x + 1;")
            cu_gen.block("\tint block_nums_minus_1 = (fired_size - 1) / num_per_block;")
            cu_gen.block("\tint fired_size_block = 0;")
            cu_gen.block("\tif (block_idx == block_nums_minus_1) {")
            cu_gen.block("\t\tfired_size_block = fired_size - block_idx * num_per_block;")
            cu_gen.block("\t} else if (block_idx < block_nums_minus_1) {")
            cu_gen.block("\t\tfired_size_block = num_per_block;")
            cu_gen.block("\t} else {")
            cu_gen.block("\t\tfired_size_block = 0;")
            cu_gen.block("\t}")
            cu_gen.blank_line()
            cu_gen.block("\tfor (int idx = 0; idx < fired_size_block; idx++) {")
            cu_gen.block("\t\tint nid = g_fired_table[delay_idx * FIRED_TABLE_SIZE + (block_idx) * num_per_block + idx];")
            cu_gen.block("\t\tint start_loc = g_connection_{}->rev_delay_start[nid];".format(self.name.lower()))
            cu_gen.block("\t\tint synapse_num = g_connection_{}->rev_delay_num[nid];".format(self.name.lower()))
            cu_gen.block("\t\tfor (int j=threadIdx.x; j<synapse_num; j+=blockDim.x) {")
            cu_gen.block("\t\t\tint sid = g_connection_{}->rev_map2sid[j+start_loc];".format(self.name.lower()))
            cu_gen.block("\t\t\tdata->p_apre[sid] *= exp((data->p_last_update[sid] - t) / (data->p_tau_pre[sid]));")
            cu_gen.block("\t\t\tdata->p_apost[sid] *= exp((data->p_last_update[sid] - t) / (data->p_tau_post[sid]));")
            cu_gen.block("\t\t\tdata->p_apost[sid] += data->p_d_apost[sid];")
            cu_gen.block("\t\t\tdata->p_weight[sid] = _clip(data->p_weight[sid] + data->p_apre[sid], %s, %s);" % ("G_MIN", "G_MAX"))
            cu_gen.block("\t\t\tdata->p_last_update[sid] = t;")
            cu_gen.block("\t\t}")
            cu_gen.block("\t}")
            cu_gen.block("\t__syncthreads();")
            cu_gen.block("}")
            cu_gen.blank_line()

            # cu_gen.block("__global__ void update_{}_gpu({} *data, int num, int start_id, CConnection * connection)"
            #              .format(self.name.lower(), self.name))
            # cu_gen.block("{")
            # cu_gen.block("\tfor (int delta_t={}; delta_t<={}; delta_t++) {{".format("MIN_DELAY", "MAX_DELAY"))
            # cu_gen.block("\t\tint block_idx = blockIdx.x;")
            # cu_gen.block("\t\tint time_idx = (gCurrentIdx + {} - delta_t) % ({} + 1);".format("MAX_DELAY", "MAX_DELAY"))
            # cu_gen.block("\t\tint firedSize = g_fired_tableSizes[time_idx];")
            # cu_gen.block("\t\tint num_per_block = (firedSize - 1) / gridDim.x + 1;")
            # cu_gen.block("\t\tint block_nums_minus_1 = (firedSize - 1) / num_per_block;")
            # cu_gen.block("")
            # cu_gen.block("\t\tint fired_size_block = 0;")
            # cu_gen.block("\t\tif (block_idx == block_nums_minus_1) {")
            # cu_gen.block("\t\tfired_size_block = firedSize - block_idx * num_per_block;")
            # cu_gen.block("\t\t} else if (block_idx < block_nums_minus_1) {")
            # cu_gen.block("\t\tfired_size_block = num_per_block;")
            # cu_gen.block("\t\t} else {")
            # cu_gen.block("\t\tfired_size_block = 0;")
            # cu_gen.block("\t\t}")
            # cu_gen.blank_line()
            # cu_gen.block("\t\tfor (int idx = 0; idx < fired_size_block; idx++) {")
            # cu_gen.block("\t\t\tint nid = g_fired_table[time_idx * g_fired_tableCap + (block_idx) * num_per_block + idx];")
            # cu_gen.block("\t\t\tint start_loc = connection->rev_delayStart[delta_t + nid * MAX_DELAY];")
            # cu_gen.block("\t\t\tint synapseNum = connection->rev_delayNum[delta_t + nid * MAX_DELAY];")
            # cu_gen.block("\t\t\tif (threadIdx.x == 0) {")
            # cu_gen.block("\t\t\tgLayerInput[nid]++;")
            # cu_gen.block("\t\t\t}")
            # cu_gen.block("\t\t\tfor (int j=threadIdx.x; j < synapseNum; j += blockDim.x) {")
            # cu_gen.block("\t\t\tint sid = connection->rev_map2sid[j+start_loc];")
            # cu_gen.block("")
            # cu_gen.block("\t\t\tdata->p_apre[sid] *= exp((data->p_last_update[sid] - t) / (data->p_tau_pre[sid]));")
            # cu_gen.block("\t\t\tdata->p_apost[sid] *= exp((data->p_last_update[sid] - t) / (data->p_tau_post[sid]));")
            # cu_gen.block("\t\t\tdata->p_apost[sid] += data->p_d_apost[sid];")
            # cu_gen.block("\t\t\tdata->p_weight[sid] = _clip(weight + data->p_pre[sid], %s, %s);" % ("gMin", "gMax"))
            # cu_gen.block("\t\t\tdata->p_last_update[sid] = gCurrentCycle;")
            # cu_gen.block("\t\t\t}")
            # cu_gen.block("\t\t}")
            # cu_gen.block("\t\t__syncthreads();")
            # cu_gen.block("\t}")
            # cu_gen.block("}")
            # cu_gen.blank_line()

        cu_gen.block("void update_{}({} *data, int num, int start_id, int t)"
                     .format(self.name.lower(), self.name.capitalize()))
        cu_gen.block("{")
        # cu_gen.block("\t\tfind_%s_gpu<<<size=>gridSize, size->blockSize>>>((%s*)data, num, start_id);" %
        #              (self.name.lower(), self.name.capitalize()))
        cu_gen.block("\tupdate_{}_gpu<<<{}_GRID_SIZE, {}_BLOCK_SIZE>>>(({}*)data, num, start_id, t);"
                     .format(self.name.lower(), self.name.upper(), self.name.upper(), self.name.capitalize()))
        if self.post_learn:
            cu_gen.block("\tlearn_{}_post<<<{}_GRID_SIZE, {}_BLOCK_SIZE>>>(({}*)data, num, start_id, t);"
                         .format(self.name.lower(), self.name.upper(), self.name.upper(), self.name.capitalize()))
        cu_gen.block("}")
        cu_gen.blank_line()


        cu_gen.close()
        return
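
The read index used in update_<name>_gpu, (t + MAX_DELAY + 1 - delta_t) % (MAX_DELAY + 1), selects the ring-buffer slot of neurons that fired delta_t steps earlier; the extra MAX_DELAY + 1 term only keeps the left operand non-negative in C. A small Python check of that claim, assuming slots are written at index t % (MAX_DELAY + 1) as in learn_<name>_post above:

    MAX_DELAY, MIN_DELAY = 4, 1
    for t in range(100):
        for delta_t in range(MIN_DELAY, MAX_DELAY + 1):
            read_slot = (t + MAX_DELAY + 1 - delta_t) % (MAX_DELAY + 1)
            write_slot = (t - delta_t) % (MAX_DELAY + 1)
            assert read_slot == write_slot  # slot written delta_t steps ago
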