def _run_opencl(self, inMat, ctx, queueList):
    for mat in inMat:
        assert self.in_size == mat.shape[1]
    iter = len(inMat)

    kernel_src = kernel_code_feedforward

    # Calculate work items
    local_x = 256
    local_y = 1
    global_x_list = []
    for mat in inMat:
        # Round the global size up to a whole number of workgroups
        global_x = mat.shape[0]
        if global_x % local_x:
            global_x = (global_x // local_x + 1) * local_x
        global_x_list.append(global_x)
    global_y = 1

    # Build the kernel (builds for the first time, then uses cached version)
    prg = build_program(
        ctx, kernel_src,
        extra="-DWORK_ITEMS={} -DIN_MAT_Y={}".format(
            local_x, inMat[0].shape[1]))

    Wtr = np.transpose(self.W)

    # Allocate OpenCL buffers
    cl_inMatList = []
    buffer_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.USE_HOST_PTR
    for x in xrange(iter):
        cl_inMatList.append(
            cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(inMat[x])))
    cl_W = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(Wtr))
    cl_b = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.b))
    cl_outList = []
    for x in xrange(iter):
        cl_outList.append(
            cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                      inMat[x].shape[0] * self.W.shape[1] * inMat[x].itemsize))

    # Run the kernel (one enqueue per input matrix/queue)
    for x in xrange(iter):
        prg.run_layer(queueList[x], (global_x_list[x], global_y),
                      (local_x, local_y),
                      np.int32(inMat[x].shape[0]), np.int32(Wtr.shape[0]),
                      cl_inMatList[x], cl_W, cl_b, cl_outList[x])
        queueList[x].flush()

    # Copy results back to host (blocking call)
    outList = []
    for x in xrange(iter):
        outList.append(
            np.zeros((inMat[x].shape[0], self.W.shape[1]), dtype=dtype))
        cl.enqueue_copy(queueList[x], outList[x], cl_outList[x])

    return outList
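# For orientation, a minimal NumPy sketch of what the kernel launch above is
# expected to compute per input matrix. This is a hypothetical helper, not
# part of the original layer API: it assumes run_layer evaluates the affine
# map inMat . W + b, and omits any activation that kernel_code_feedforward
# may apply on top of that.
def _run_numpy_reference(self, inMat):
    """Hypothetical CPU reference for _run_opencl (sketch only)."""
    for mat in inMat:
        assert self.in_size == mat.shape[1]
    # Same output shapes as the OpenCL path: (n_events, out_size) per chunk.
    return [(mat.dot(self.W) + self.b).astype(dtype) for mat in inMat]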
def _run_opencl(self, inMat, ctx, queueList):
    for mat in inMat:
        assert self.in_size == mat.shape[1]
    iter = len(inMat)
    outList = []
    for x in xrange(iter):
        outList.append(
            np.zeros((inMat[x].shape[0], self.out_size), dtype=dtype))

    kernel_src = kernel_code_lstm

    # Build the kernel (builds for the first time, then uses cached version)
    prg = build_program(
        ctx, kernel_src,
        extra='-DWORK_ITEMS={} -DL_WX={} -DL_WY={}'.format(
            self.iW.shape[1], self.lW.shape[0], self.lW.shape[1]))

    # Allocate OpenCL buffers
    cl_inMatList = []
    buffer_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.USE_HOST_PTR
    for x in xrange(iter):
        cl_inMatList.append(
            cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(inMat[x])))
    cl_iW = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.iW))
    cl_lW = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.lW))
    cl_b = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.b))
    cl_p = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.p))
    cl_outList = []
    cl_outvW = []
    for x in xrange(iter):
        cl_outList.append(
            cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                      outList[x].shape[0] * self.out_size * inMat[x].itemsize))
        # Intermediate buffer for the input projection inMat . iW
        cl_outvW.append(
            cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                      outList[x].shape[0] * self.iW.shape[1] * inMat[x].itemsize))

    # Run the kernels: run_dot computes the input projections for all events
    # in parallel; run_lstm_layer then applies the sequential recurrence.
    for x in xrange(iter):
        prg.run_dot(queueList[x],
                    (inMat[x].shape[0] * self.iW.shape[1], 1),
                    (self.iW.shape[1], 1),
                    np.int32(self.iW.shape[0]), np.int32(self.iW.shape[1]),
                    cl_inMatList[x], cl_iW, cl_outvW[x])
        queueList[x].flush()
    for x in xrange(iter):
        prg.run_lstm_layer(queueList[x],
                           (self.iW.shape[1], 1), (self.iW.shape[1], 1),
                           np.int32(inMat[x].shape[0]),
                           cl_outvW[x], cl_lW, cl_b, cl_p, cl_outList[x])
        queueList[x].flush()

    # Copy results back to host (blocking call)
    for x in xrange(iter):
        outRavel = np.ravel(outList[x])
        cl.enqueue_copy(queueList[x], outRavel, cl_outList[x])
        outList[x] = np.copy(np.reshape(outRavel, outList[x].shape))

    return outList
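# A rough CPU sketch of the two-kernel split above (hypothetical helper, not
# part of the original class): run_dot precomputes the input projection
# inMat . iW for all events in parallel, and run_lstm_layer then walks the
# events sequentially. The per-event gate arithmetic (gate ordering,
# activations, peephole wiring) lives in kernel_code_lstm, so it is
# abstracted here as a caller-supplied `lstm_step`.
def _run_numpy_sketch(self, inMat, lstm_step):
    """Hypothetical reference for the run_dot / run_lstm_layer pipeline."""
    vW = inMat.dot(self.iW)  # what run_dot leaves in cl_outvW
    out = np.zeros((inMat.shape[0], self.out_size), dtype=dtype)
    prev_out = np.zeros(self.out_size, dtype=dtype)
    prev_state = np.zeros(self.out_size, dtype=dtype)
    for i in xrange(inMat.shape[0]):
        # Add the recurrent contribution and bias; lstm_step applies the
        # gates and returns the new hidden output and cell state.
        sumW = vW[i] + prev_out.dot(self.lW) + self.b
        prev_out, prev_state = lstm_step(sumW, prev_state, self.p)
        out[i] = prev_out
    return out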
def decode_profile_opencl(ctx, queue_list, post_list, trans_list=None,
                          log=False, slip=0.0, max_workgroup_size=256):
    """ Viterbi-style decoding with per-event transition weights (profile)

    :param ctx: OpenCL context.
    :param queue_list: OpenCL command queues, one per posterior matrix.
    :param post_list: list of posterior probability matrices (kmers by
        event), one per queue.
    :param trans_list: list of per-event, log-scaled transition weight
        arrays (e.g. a :class:`ndarray` per queue). None == no transition
        weights.
    :param log: Posterior probabilities are in log-space.
    :param slip: slip penalty.
    :param max_workgroup_size: maximum OpenCL workgroup size to use.
    """
    fp_type = np.float32  # floating point type used in the kernel
    iter = len(queue_list)

    if trans_list is None:
        # No weights supplied: pass zero (log-space) weights for every event
        # so each kernel still receives a well-formed transition buffer.
        trans_list = [np.zeros((len(post_list[x]), 3)) for x in xrange(iter)]
    else:
        for x in xrange(iter):
            trans = np.copy(trans_list[x])
            trans[:, 1] -= _STEP_FACTOR
            trans[:, 2] -= _SKIP_FACTOR
            trans_list[x] = trans

    lpostList = []
    if fp_type == np.float32:
        slip = np.float32(slip)
        for x in xrange(iter):
            lpostList.append(post_list[x].astype(np.float32))
            trans_list[x] = np.float32(trans_list[x])
    else:
        slip = np.float64(slip)
        for x in xrange(iter):
            lpostList.append(post_list[x].copy())

    # Allocate OpenCL buffers
    cl_postList = []
    cl_transList = []
    cl_state_seqList = []
    cl_pscore_maxList = []
    state_seqList = []
    pscore_max = np.zeros(1, dtype=fp_type)
    for x in xrange(iter):
        cl_postList.append(
            cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
                      hostbuf=np.ravel(lpostList[x])))
        cl_transList.append(
            cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.USE_HOST_PTR,
                      hostbuf=np.ravel(trans_list[x])))
        state_seqList.append(np.zeros(len(lpostList[x]), dtype=np.int32))
        cl_state_seqList.append(
            cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                      len(lpostList[x]) * state_seqList[x].itemsize))
        cl_pscore_maxList.append(
            cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 1 * pscore_max.itemsize))

    local_x = global_x = max_workgroup_size
    local_y = global_y = 1

    # Build the kernel (builds for the first time, then uses cached version)
    prg = build_program(
        ctx, kernel_code,
        extra=" -DWORK_ITEMS={} -DNUM_STATES={}".format(
            local_x, lpostList[0].shape[1]))

    # Run the kernel (one enqueue per posterior matrix/queue)
    for x in xrange(iter):
        prg.decode(queue_list[x], (global_x, global_y), (local_x, local_y),
                   np.int32(lpostList[x].shape[0]), slip,
                   cl_postList[x], cl_transList[x],
                   cl_state_seqList[x], cl_pscore_maxList[x])
        queue_list[x].flush()

    # Copy results back to host (blocking calls)
    pscore_maxList = []
    for x in xrange(iter):
        cl.enqueue_copy(queue_list[x], pscore_max, cl_pscore_maxList[x])
        pscore_maxList.append(pscore_max[0])
        cl.enqueue_copy(queue_list[x], state_seqList[x], cl_state_seqList[x])

    return pscore_maxList, state_seqList
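# A minimal usage sketch for the decoder above (hypothetical example, not
# part of the original module). It assumes pyopencl is imported as cl and
# that `posteriors` is a list of (n_events, n_states) arrays, one per
# command queue.
def _example_decode_usage(posteriors):
    """Drive decode_profile_opencl with one queue per posterior matrix."""
    ctx = cl.create_some_context()
    queues = [cl.CommandQueue(ctx) for _ in posteriors]
    scores, state_seqs = decode_profile_opencl(ctx, queues, posteriors)
    return scores, state_seqs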