Example #1
    def _run_opencl(self, inMat, ctx, queueList):
        for mat in inMat:
            assert self.in_size == mat.shape[1]
        iter = len(inMat)

        kernel_src = kernel_code_feedforward

        # Calculate work items
        local_x = 256
        local_y = 1
        global_x_list = []
        for mat in inMat:
            global_x = mat.shape[0]
            if global_x % local_x:
                # Round up to the next multiple of the work-group size
                global_x = (global_x // local_x + 1) * local_x
            global_x_list.append(global_x)
        global_y = 1

        # Build the kernel (builds for the first time, then uses cached version)
        prg = build_program(ctx,
                            kernel_src,
                            extra="-DWORK_ITEMS={} -DIN_MAT_Y={}".format(
                                local_x, inMat[0].shape[1]))

        Wtr = np.transpose(self.W)

        # Allocate OpenCL buffers
        cl_inMatList = []
        buffer_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.USE_HOST_PTR
        for x in xrange(iter):
            cl_inMatList.append(
                cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(inMat[x])))
        cl_W = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(Wtr))
        cl_b = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.b))
        cl_outList = []
        for x in xrange(iter):
            cl_outList.append(
                cl.Buffer(
                    ctx, cl.mem_flags.WRITE_ONLY,
                    inMat[x].shape[0] * self.W.shape[1] * inMat[x].itemsize))

        # Run the kernel
        for x in xrange(iter):
            prg.run_layer(queueList[x], (global_x_list[x], global_y),
                          (local_x, local_y), np.int32(inMat[x].shape[0]),
                          np.int32(Wtr.shape[0]), cl_inMatList[x], cl_W, cl_b,
                          cl_outList[x])
            queueList[x].flush()

        # Copy results back to host (blocking call)
        outList = []
        for x in xrange(iter):
            outList.append(
                np.zeros((inMat[x].shape[0], self.W.shape[1]), dtype=dtype))
            cl.enqueue_copy(queueList[x], outList[x], cl_outList[x])
        return outList
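
This example and the ones below rely on a build_program helper that is only described by its comment ("builds for the first time, then uses cached version"); its implementation is not part of the listing. A minimal sketch, assuming a simple module-level cache keyed on the context, kernel source and build options (the helper name matches the calls above, everything else is an assumption):

import pyopencl as cl

_program_cache = {}


def build_program(ctx, kernel_src, extra=""):
    """Compile kernel_src for ctx once, then reuse the built program."""
    key = (id(ctx), kernel_src, extra)
    if key not in _program_cache:
        # The -D macros (e.g. WORK_ITEMS, IN_MAT_Y) are passed as build options.
        _program_cache[key] = cl.Program(ctx, kernel_src).build(
            options=extra.split())
    return _program_cache[key]

PyOpenCL also caches compiled binaries on disk, so rebuilding from identical source and options is cheap even without an explicit cache like this.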
Example #2
    def _run_opencl(self, inMat, ctx, queueList):
        for mat in inMat:
            assert self.in_size == mat.shape[1]
        iter = len(inMat)
        
        outList = []
        for x in xrange(iter):
            outList.append(np.zeros((inMat[x].shape[0], self.out_size), dtype=dtype))
            
        kernel_src = kernel_code_lstm
        
        # Build the kernel (builds for the first time, then uses cached version)
        prg = build_program(ctx,
                            kernel_src,
                            extra='-DWORK_ITEMS={} -DL_WX={} -DL_WY={}'.format(
                                self.iW.shape[1], self.lW.shape[0],
                                self.lW.shape[1]))
        
        # Allocate OpenCL buffers
        cl_inMatList = []
        buffer_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.USE_HOST_PTR
        for x in xrange(iter):
            cl_inMatList.append(
                cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(inMat[x])))
        cl_iW = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.iW))
        cl_lW = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.lW))
        cl_b  = cl.Buffer(ctx, buffer_flags, hostbuf=self.b)
        cl_p  = cl.Buffer(ctx, buffer_flags, hostbuf=np.ravel(self.p))
        cl_outList = []
        cl_outvW = []
        for x in xrange(iter):
            cl_outList.append(
                cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                          outList[x].shape[0] * self.out_size * inMat[x].itemsize))
            cl_outvW.append(
                cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                          outList[x].shape[0] * self.iW.shape[1] * inMat[x].itemsize))

        # Run the kernel
        for x in xrange(iter):
            prg.run_dot(queueList[x], (inMat[x].shape[0] * self.iW.shape[1], 1),
                        (self.iW.shape[1], 1), np.int32(self.iW.shape[0]),
                        np.int32(self.iW.shape[1]), cl_inMatList[x], cl_iW,
                        cl_outvW[x])
            queueList[x].flush()
        for x in xrange(iter):
            prg.run_lstm_layer(queueList[x], (self.iW.shape[1], 1),
                               (self.iW.shape[1], 1),
                               np.int32(inMat[x].shape[0]), cl_outvW[x], cl_lW,
                               cl_b, cl_p, cl_outList[x])
            queueList[x].flush()
        
        # Copy results back to host (blocking call)
        for x in xrange(iter):
            outRavel = np.ravel(outList[x])
            cl.enqueue_copy(queueList[x], outRavel, cl_outList[x])
            outList[x] = np.copy(np.reshape(outRavel, outList[x].shape))
        return outList
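
Neither layer example shows how the ctx and queueList arguments are produced. A minimal host-side sketch, assuming one context and one command queue per input chunk so that each chunk's kernels can be enqueued and flushed independently (the chunk count below is a placeholder, not taken from the examples):

import pyopencl as cl

ctx = cl.create_some_context()
n_chunks = 4  # placeholder: matches len(inMat) in the calls above
queueList = [cl.CommandQueue(ctx) for _ in range(n_chunks)]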
Example #3
def decode_profile_opencl(ctx,
                          queue_list,
                          post_list,
                          trans_list=None,
                          log=False,
                          slip=0.0,
                          max_workgroup_size=256):
    """  Viterbi-style decoding with per-event transition weights
    (profile)
    :param post_list: list of matrices of posterior probabilities of kmers
        by event, one per command queue in queue_list.
    :param trans_list: list of generators (e.g. a :class:`ndarray`) producing
        per-transition log-scaled weights. None == no transition weights.
    :param log: Posterior probabilities are in log-space.
    """

    fp_type = np.float32  # floating point type used in the kernel
    iter = len(queue_list)

    if trans_list is None:
        # No transition weights supplied: use zero weights for every event.
        trans_list = []
        for x in xrange(iter):
            trans_list.append(np.zeros((len(post_list[x]), 3)))
    else:
        for x in xrange(iter):
            trans = trans_list[x]
            trans = np.copy(trans)
            trans[:, 1] -= _STEP_FACTOR
            trans[:, 2] -= _SKIP_FACTOR
            trans_list[x] = trans

    lpostList = []
    if fp_type == np.float32:
        slip = np.float32(slip)
        for x in xrange(iter):
            lpostList.append(post_list[x].copy().astype(np.float32))
            trans_list[x] = np.float32(trans_list[x])
    else:
        slip = np.float64(slip)
        for x in xrange(iter):
            lpostList.append(post_list[x].copy())

    cl_postList = []
    cl_transList = []
    cl_state_seqList = []
    cl_pscore_maxList = []
    state_seqList = []
    pscore_max = np.zeros(1, dtype=fp_type)
    for x in xrange(iter):
        cl_postList.append(
            cl.Buffer(ctx,
                      cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
                      hostbuf=np.ravel(lpostList[x])))
        cl_transList.append(
            cl.Buffer(ctx,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.USE_HOST_PTR,
                      hostbuf=np.ravel(trans_list[x])))
        state_seqList.append(np.zeros(len(lpostList[x]), dtype=np.int32))
        cl_state_seqList.append(
            cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                      len(lpostList[x]) * state_seqList[x].itemsize))
        cl_pscore_maxList.append(
            cl.Buffer(ctx, cl.mem_flags.READ_WRITE, 1 * pscore_max.itemsize))

    local_x = global_x = max_workgroup_size
    local_y = global_y = 1

    prg = build_program(ctx,
                        kernel_code,
                        extra=" -DWORK_ITEMS={} -DNUM_STATES={}".format(
                            local_x, lpostList[0].shape[1]))

    for x in xrange(iter):
        prg.decode(queue_list[x], (global_x, global_y), (local_x, local_y),
                   np.int32(lpostList[x].shape[0]), slip, cl_postList[x],
                   cl_transList[x], cl_state_seqList[x], cl_pscore_maxList[x])
        queue_list[x].flush()

    pscore_maxList = []
    for x in xrange(iter):
        cl.enqueue_copy(queue_list[x], pscore_max, cl_pscore_maxList[x])
        pscore_maxList.append(pscore_max[0])
        cl.enqueue_copy(queue_list[x], state_seqList[x], cl_state_seqList[x])

    return pscore_maxList, state_seqList
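
A usage sketch for decode_profile_opencl, assuming the module-level kernel_code, build_program and scaling constants it depends on are available; the posterior matrices are random placeholders and the event count, state count and slip value are illustrative only:

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue_list = [cl.CommandQueue(ctx) for _ in range(2)]

# Two chunks of 500 events over 64 kmer states, row-normalised to probabilities.
post_list = [np.random.rand(500, 64).astype(np.float32) for _ in range(2)]
post_list = [p / p.sum(axis=1, keepdims=True) for p in post_list]

scores, state_seqs = decode_profile_opencl(ctx, queue_list, post_list, slip=5.0)
# scores[i] is the maximum path score for chunk i; state_seqs[i] holds one
# decoded kmer index per event.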