def test_pydotprint_cond_highlight():
    """
    This is a REALLY PARTIAL TEST.

    It was written to help debug stuff.
    """

    # Skip test if pydot is not available.
    if not theano.printing.pydot_imported:
        raise SkipTest('pydot not available')

    x = tensor.dvector()
    f = theano.function([x], x * 2)
    f([1, 2, 3, 4])

    s = StringIO()
    new_handler = logging.StreamHandler(s)
    new_handler.setLevel(logging.DEBUG)
    orig_handler = theano.logging_default_handler

    theano.theano_logger.removeHandler(orig_handler)
    theano.theano_logger.addHandler(new_handler)
    try:
        theano.printing.pydotprint(f,
                                   cond_highlight=True,
                                   print_output_file=False)
    finally:
        theano.theano_logger.addHandler(orig_handler)
        theano.theano_logger.removeHandler(new_handler)

    assert (s.getvalue() == 'pydotprint: cond_highlight is set but there'
            ' is no IfElse node in the graph\n')
Example #2
def get_correct_indentation_diff(code, filename):
    """
    Generate a diff to make code correctly indented.

    :param code: a string containing a file's worth of Python code
    :param filename: the filename being considered (used in diff generation only)
    :returns: a unified diff to make code correctly indented, or
              None if code is already correctly indented
    """
    code_buffer = StringIO(code)
    output_buffer = StringIO()
    reindenter = reindent.Reindenter(code_buffer)
    reindenter.run()
    reindenter.write(output_buffer)
    reindent_output = output_buffer.getvalue()
    output_buffer.close()
    if code != reindent_output:
        diff_generator = difflib.unified_diff(code.splitlines(True), reindent_output.splitlines(True),
                                              fromfile=filename, tofile=filename + " (reindented)")
        # work around http://bugs.python.org/issue2142
        diff_tuple = map(clean_diff_line_for_python_bug_2142, diff_generator)
        diff = "".join(diff_tuple)
        return diff
    else:
        return None
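For reference, a small invented example of the difflib.unified_diff call
used above:

import difflib

before = "def f():\n  return 1\n"
after = "def f():\n    return 1\n"
diff = "".join(difflib.unified_diff(
    before.splitlines(True), after.splitlines(True),
    fromfile="example.py", tofile="example.py (reindented)"))
print(diff)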
Example #4
    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * dot(x, y) + beta * z_in
        # inplace version: set z_out = z_in
        # not inplace version: copy z_in to z_out
        z_in, a, x, y, b = inputs
        z_out, = outputs
        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO()

        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];
        float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0];

        if (%(inplace)s
            && ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0)
                || ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0)
                    && (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1))))
        {
            // Work inplace on the input
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
        }
        else if (%(z_out)s
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] ==
                    CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0)
                    || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0)
                        && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1))))
        {
            // Work on the output
            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
                %(fail)s;
            }
        }
        else
        {
            // Copy
            Py_XDECREF(%(z_out)s);
            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
            if (!%(z_out)s)
            {
                %(fail)s;
            }
        }

        if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s,
                              %(name)s_beta, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()
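The two-stage pattern used throughout these c_code methods, building a
%-style template in a StringIO and then substituting with locals(), can be
shown in miniature; the names below are illustrative only:

from io import StringIO

def render(name, inplace):
    # Build the C-code template first...
    sio = StringIO()
    sio.write("""
    float %(name)s_alpha = 1.0f;
    if (%(inplace)s) { /* work in place */ }
    """)
    # ...then fill the %(...)s placeholders from the local namespace,
    # exactly as `return sio.getvalue() % locals()` does above.
    return sio.getvalue() % locals()

print(render("gemv", int(True)))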
Example #6
    def c_src_kernel_Ccontiguous(self, node, nodename):
        nd = node.outputs[0].type.ndim
        sio = StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(node.inputs):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % (
            self.scalar_op.__class__.__name__, nodename)
        #declare inputs
        for ipos, i in enumerate(node.inputs):
            print >> sio, "\t,", "const float * i%i_data" % ipos
        #declare outputs
        for ipos, i in enumerate(node.outputs):
            print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
        print >> sio, "    const int numThreads = blockDim.x * gridDim.x;"

        # For each input that is a scalar which has been broadcasted to a tensor,
        #     load it into a local variable
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print >> sio, "    const float ii_i%i_value = i%i_data[0];" % (
                    ipos, ipos)

        #loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(self.scalar_op, [
                scalar.Scalar(dtype=input.type.dtype)()
                for input in node.inputs
            ], [
                scalar.Scalar(dtype=output.type.dtype)()
                for output in node.outputs
            ]),
            nodename + '_scalar_',
            # ['i%i_data[i]' % ipos for ipos, i in enumerate(node.inputs)]
            get_str_list_logical_scalar(node, data_str='i%i_data[i]'),
            ['o%i_data[i]' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  #TODO: set a failure code somehow!!!
        print >> sio, "       ", task_code
        print >> sio, "    }"
        print >> sio, "}"

        #print sio.getvalue()
        return sio.getvalue()
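A condensed sketch of the same StringIO-driven code generation, emitting
just the parameter list of a two-input elementwise kernel (names invented;
write() is used in place of the Python 2 print >> chevron):

from io import StringIO

def kernel_signature(opname, n_inputs, n_outputs):
    sio = StringIO()
    sio.write("static __global__ void kernel_%s (unsigned int numEls\n"
              % opname)
    for ipos in range(n_inputs):
        sio.write("\t, const float * i%i_data\n" % ipos)
    for opos in range(n_outputs):
        sio.write("\t, float * o%i_data\n" % opos)
    sio.write("\t)\n{\n")
    return sio.getvalue()

print(kernel_signature("add", 2, 1))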
Example #7
    def run(replay, log=None):

        if not replay:
            log = StringIO()
        else:
            log = StringIO(log)
        record = Record(replay=replay, file_object=log)

        disturb_mem.disturb_mem()

        mode = RecordMode(record=record)

        b = sharedX(np.zeros((2, )), name='b')
        channels = OrderedDict()

        disturb_mem.disturb_mem()

        v_max = b.max(axis=0)
        v_min = b.min(axis=0)
        v_range = v_max - v_min

        updates = []
        for i, val in enumerate([
                v_max.max(),
                v_max.min(),
                v_range.max(),
        ]):
            disturb_mem.disturb_mem()
            s = sharedX(0., name='s_' + str(i))
            updates.append((s, val))

        for var in theano.gof.graph.ancestors(update for _, update in updates):
            if var.name is not None and var.name != 'b':
                if var.name[0] != 's' or len(var.name) != 2:
                    var.name = None

        for key in channels:
            updates.append((s, channels[key]))
        f = theano.function([],
                            mode=mode,
                            updates=updates,
                            on_unused_input='ignore',
                            name='f')
        for output in f.maker.fgraph.outputs:
            mode.record.handle_line(var_descriptor(output) + '\n')
        disturb_mem.disturb_mem()
        f()

        mode.record.f.flush()

        if not replay:
            return log.getvalue()
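The run() helper implements a record/replay determinism check: the first
call records a trace into a StringIO, the second call replays against that
trace and fails on any divergence. A stripped-down sketch of the protocol,
with the Record class used above replaced by a hypothetical minimal
recorder:

from io import StringIO

class MiniRecord(object):
    """Record lines on the first pass, verify them on the second."""
    def __init__(self, replay, file_object):
        self.replay = replay
        self.f = file_object

    def handle_line(self, line):
        if self.replay:
            expected = self.f.readline()
            assert line == expected, (line, expected)
        else:
            self.f.write(line)

def mini_run(replay, log=None):
    log = StringIO(log) if replay else StringIO()
    record = MiniRecord(replay=replay, file_object=log)
    record.handle_line('some deterministic descriptor\n')
    if not replay:
        return log.getvalue()

first = mini_run(False)
mini_run(True, first)  # raises AssertionError if the two runs diverge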
Example #9
def test_pydotprint_long_name():
    """This is a REALLY PARTIAL TEST.

    It prints a graph where some variable and apply nodes have different
    long names but identical shortened names.
    We should not merge those nodes in the dot graph.

    """

    # Skip test if pydot is not available.
    if not theano.printing.pydot_imported:
        raise SkipTest('pydot not available')

    x = tensor.dvector()
    mode = theano.compile.mode.get_default_mode().excluding("fusion")
    f = theano.function([x], [x * 2, x + x], mode=mode)
    f([1, 2, 3, 4])

    s = StringIO()
    new_handler = logging.StreamHandler(s)
    new_handler.setLevel(logging.DEBUG)
    orig_handler = theano.logging_default_handler

    theano.printing.pydotprint(f,
                               max_label_size=5,
                               print_output_file=False,
                               assert_nb_all_strings=6)
Example #10
def test_pydotprint_variables():
    """
    This is a REALLY PARTIAL TEST.

    It was written to help debug stuff.

    It makes sure the code runs.
    """

    # Skip test if pydot is not available.
    if not theano.printing.pydot_imported:
        raise SkipTest('pydot not available')

    x = tensor.dvector()

    s = StringIO()
    new_handler = logging.StreamHandler(s)
    new_handler.setLevel(logging.DEBUG)
    orig_handler = theano.logging_default_handler

    theano.theano_logger.removeHandler(orig_handler)
    theano.theano_logger.addHandler(new_handler)
    try:
        theano.printing.pydotprint(x * 2)
        theano.printing.pydotprint_variables(x * 2)
    finally:
        theano.theano_logger.addHandler(orig_handler)
        theano.theano_logger.removeHandler(new_handler)
Example #11
def test_dnn_tag():
    """
    Test that if cuDNN isn't available we raise an error, and that if it
    is available, we use it.
    """
    x = T.ftensor4()
    old = theano.config.on_opt_error
    theano.config.on_opt_error = "raise"

    sio = StringIO()
    handler = logging.StreamHandler(sio)
    logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler)
    # Silence original handler when intentionally generating warning messages
    logging.getLogger('theano').removeHandler(theano.logging_default_handler)
    raised = False
    try:
        f = theano.function([x],
                            max_pool_2d(x, ds=(2, 2), ignore_border=True),
                            mode=mode_with_gpu.including("cudnn"))
    except (AssertionError, RuntimeError):
        assert not cuda.dnn.dnn_available()
        raised = True
    finally:
        theano.config.on_opt_error = old
        logging.getLogger('theano.compile.tests.test_dnn').removeHandler(
            handler)
        logging.getLogger('theano').addHandler(theano.logging_default_handler)

    if not raised:
        assert cuda.dnn.dnn_available()
        assert any([
            isinstance(n.op, cuda.dnn.GpuDnnPool)
            for n in f.maker.fgraph.toposort()
        ])
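The closing assertion's idiom, scanning a compiled function's optimized
graph for a given Op with toposort(), works for any Theano function. A
small sketch, assuming Theano is installed:

import theano
import theano.tensor as T

x = T.dvector()
f = theano.function([x], x * 2)
# f.maker.fgraph.toposort() yields the Apply nodes of the optimized graph.
op_names = [type(n.op).__name__ for n in f.maker.fgraph.toposort()]
print(op_names)
assert any('Elemwise' in name for name in op_names)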
Example #12
    def c_src_kernel_Ccontiguous(self, node, nodename):
        sio = StringIO()
        # print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(node.inputs):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % (self.scalar_op.__class__.__name__, nodename)
        # declare inputs
        for ipos, i in enumerate(node.inputs):
            print >> sio, "\t,", "const float * i%i_data" % ipos
        # declare outputs
        for ipos, i in enumerate(node.outputs):
            print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
        print >> sio, "    const int numThreads = blockDim.x * gridDim.x;"

        # For each input that is a scalar which has been broadcasted to a tensor,
        #     load it into a local variable
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print >> sio, "    const float ii_i%i_value = i%i_data[0];" % (ipos, ipos)

        # loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
                Apply(self.scalar_op,
                    [scalar.Scalar(dtype=input.type.dtype).make_variable()
                     for input in node.inputs],
                    [scalar.Scalar(dtype=output.type.dtype).make_variable()
                     for output in node.outputs])
                , nodename + '_scalar_'
                #, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)]
                , get_str_list_logical_scalar(node, data_str='i%i_data[i]')
                , ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)]
                , sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print >> sio, "       ", task_code
        print >> sio, "    }"
        print >> sio, "}"

        # print sio.getvalue()
        return sio.getvalue()
Example #13
def profile_main():
    # This is the main function for profiling.
    # We've renamed our original main() above to real_main().
    import cProfile
    import pstats
    from theano.compat.six import StringIO
    prof = cProfile.Profile()
    prof = prof.runctx("real_main()", globals(), locals())
    stream = StringIO()
    # Send the report into the StringIO buffer rather than stdout.
    stats = pstats.Stats(prof, stream=stream)
    stats.sort_stats("time")  # or "cumulative"
    stats.print_stats(80)  # 80 = how many entries to print
    print(stream.getvalue())
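A usage sketch for the wrapper above; real_main is a hypothetical stand-in
for whatever entry point is being profiled:

def real_main():
    # Stand-in for the application's original main().
    total = 0
    for i in range(100000):
        total += i * i
    return total

profile_main()  # prints the 80 most time-consuming entries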
Example #14
def test_pickle_aliased_memory():
    M = Module()
    M.x = (T.dmatrix())
    M.y = (T.dmatrix())
    a = T.dmatrix()
    M.f = Method([a], a + M.x + M.y)
    M.g = Method([a], a * M.x * M.y)

    mode = get_mode()
    m = M.make(x=numpy.zeros((4, 5)), y=numpy.ones((2, 3)), mode=mode)
    m.y = m.x[:]

    #m's x and y memory is aliased....
    m.x[0, 0] = 3.14
    assert m.y[0, 0] == 3.14

    import logging
    from theano.compat.six import StringIO

    sio = StringIO()
    handler = logging.StreamHandler(sio)
    logging.getLogger('theano.compile.function_module').addHandler(handler)
    # Silence original handler when intentionally generating warning messages
    logging.getLogger('theano').removeHandler(theano.logging_default_handler)
    try:
        m.f.pickle_aliased_memory_strategy = 'warn'
        m.g.pickle_aliased_memory_strategy = 'warn'
        m_dup = cPickle.loads(cPickle.dumps(m, protocol=-1))
        assert sio.getvalue().startswith('aliased relat')
    finally:
        logging.getLogger('theano.compile.function_module').removeHandler(
            handler)
        logging.getLogger('theano').addHandler(theano.logging_default_handler)

    try:
        m.f.pickle_aliased_memory_strategy = 'raise'
        m.g.pickle_aliased_memory_strategy = 'raise'
        m_dup = cPickle.loads(cPickle.dumps(m, protocol=-1))
    except AliasedMemoryError, e:
        return
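The aliasing that these pickle strategies guard against can be checked
directly with NumPy; a minimal sketch:

import numpy

x = numpy.zeros((4, 5))
y = x[:]            # a slice is a view: y aliases x's memory
x[0, 0] = 3.14
assert y[0, 0] == 3.14
assert numpy.may_share_memory(x, y)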
Example #17
def get_parse_error(code):
    """
    Checks code for ambiguous tabs or other basic parsing issues.

    :param code: a string containing a file's worth of Python code
    :returns: a string containing a description of the first parse error encountered,
              or None if the code is ok
    """
    # note that this uses non-public elements from stdlib's tabnanny, because tabnanny
    # is (very frustratingly) written only to be used as a script, and using it that way
    # in this context would require writing temporary files, running subprocesses, etc.
    code_buffer = StringIO(code)
    try:
        tabnanny.process_tokens(tokenize.generate_tokens(code_buffer.readline))
    except tokenize.TokenError, err:
        return "Could not parse code: %s" % err
Example #18
def test_dnn_tag():
    """
    Test that if cuDNN isn't available we raise an error, and that if it
    is available, we use it.
    """
    x = T.ftensor4()
    old = theano.config.on_opt_error
    theano.config.on_opt_error = "raise"

    sio = StringIO()
    handler = logging.StreamHandler(sio)
    logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler)
    # Silence original handler when intentionally generating warning messages
    logging.getLogger('theano').removeHandler(theano.logging_default_handler)
    raised = False
    try:
        f = theano.function([x],
                            max_pool_2d(x, ds=(2, 2)),
                            mode=mode_with_gpu.including("cudnn"))
    except RuntimeError, e:
        assert not cuda.dnn.dnn_available()
        raised = True
    finally:
        # Restore the config flag and the original logging handlers
        # (mirrors the cleanup in the newer version of this test above).
        theano.config.on_opt_error = old
        logging.getLogger('theano.compile.tests.test_dnn').removeHandler(
            handler)
        logging.getLogger('theano').addHandler(theano.logging_default_handler)

    if not raised:
        assert cuda.dnn.dnn_available()
Example #19
    def c_code(self, node, nodename, inp, out, sub):
        typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype)
        typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype)
        typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype)
        itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize
        itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize
        itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize
        itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize
        itemsize_sm = numpy.dtype(node.outputs[1].dtype).itemsize
        itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize
        x, b, y_idx = inp
        nll, sm, am = out
        dtype_x = node.inputs[0].dtype
        dtype_b = node.inputs[1].dtype
        dtype_y_idx = node.inputs[2].dtype
        dtype_nll = node.outputs[0].dtype
        dtype_sm = node.outputs[1].dtype
        dtype_am = node.outputs[2].dtype
        classname = self.__class__.__name__
        fail = sub['fail']
        sio = StringIO()
        print >> sio, """
        if (PyGpuArray_NDIM(%(y_idx)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
            %(fail)s;
        }
        if (PyGpuArray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
            %(fail)s;
        }
        if (PyGpuArray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(x)s)[0] !=
            PyGpuArray_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
            || (PyGpuArray_DIMS(%(nll)s)[0] !=
                PyGpuArray_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
                                %(typecode_x)s,
                                GA_C_ORDER,
                                pygpu_default_context(), Py_None);
            if (!%(nll)s) {
                %(fail)s
            }
        }
        if ((NULL == %(sm)s)
            || (PyGpuArray_DIMS(%(sm)s)[0] !=
                PyGpuArray_DIMS(%(x)s)[0])
            || (PyGpuArray_DIMS(%(sm)s)[1] !=
                PyGpuArray_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s),
                                %(typecode_b)s,
                                GA_C_ORDER,
                                pygpu_default_context(), Py_None);
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc sm output");
                // no need to decref cnda_nll, the cleanup code should do it up
                %(fail)s;
            }
        }
        if ((NULL == %(am)s)
            || (PyGpuArray_DIMS(%(am)s)[0] !=
                PyGpuArray_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(am)s);
            %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s),
                                %(typecode_y_idx)s,
                                GA_C_ORDER,
                                pygpu_default_context(), Py_None);
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc am output");
                // no need to decref nll and sm,
                // the cleanup code should do it up
                %(fail)s;
            }
        }
        {
            int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256;
     //TODO: launch more threads per row and do parallel sum and max reductions
            int n_threads = 1;
            int n_shared_bytes = 0; //n_threads * sizeof(dtype);


            k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>(
                PyGpuArray_DIMS(%(x)s)[0],
                PyGpuArray_DIMS(%(x)s)[1],
                (npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) +
                                   %(x)s->ga.offset),
                PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s,
                PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s,
                (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) +
                                   %(b)s->ga.offset),
                PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s,
                (npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) +
                                   %(y_idx)s->ga.offset),
                PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s,
                (npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) +
                                   %(nll)s->ga.offset),
                PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s,
                (npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) +
                                   %(sm)s->ga.offset),
                PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s,
                PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s,
                (npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) +
                                   %(am)s->ga.offset),
                PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s);
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
                             "The kernel was launched with %%d threads,"
                             " %%d blocks and %%d shared memory\\n",
                             cudaGetErrorString(err),
                             n_threads, n_blocks, n_shared_bytes);
                // no need to decref output vars the cleanup code will do it
                %(fail)s;
            }
        }
        """ % locals()
        return sio.getvalue()
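Note the %%s and %%d escapes inside the kernel-launch error message: the
whole template goes through % locals(), so any literal % that must survive
for PyErr_Format has to be doubled. A tiny demonstration (values invented):

template = """
PyErr_Format(PyExc_RuntimeError,
             "Cuda error: %(classname)s: %%s.",
             cudaGetErrorString(err));
"""
classname = "GpuCrossentropySoftmaxArgmax"  # illustrative value
print(template % locals())
# After substitution, %%s has become the %s that PyErr_Format consumes.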
Example #20
    def c_code(self, node, name, inputs, outputs, sub):
        # z_out = alpha * dot(x, y) + beta * z_in
        # inplace version: set z_out = z_in
        # not inplace version: copy z_in to z_out
        z_in, a, x, y, b = inputs
        z_out, = outputs
        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO()

        print >> sio, """

        #define REAL float
        float %(name)s_a = (PyArray_TYPE(%(a)s) == NPY_FLOAT)
        ? (REAL)(((float*)%(a)s->data)[0])
        : (REAL)(((double*)%(a)s->data)[0]);

        float %(name)s_b = (PyArray_TYPE(%(b)s) == NPY_FLOAT) ?
        (REAL)(((float*)%(b)s->data)[0])
        : (REAL)(((double*)%(b)s->data)[0]);
        #undef REAL

        if (%(inplace)s
            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
            && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
                || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
        {
            // The input has an appropriate layout, we work inplace
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
        }
        else if (%(z_out)s
                && (%(z_out)s->nd == 2)
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
                    == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
                    == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
                && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
                    || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
        {
            // The existing output has an appropriate layout,
            // copy the input data into it, then work inplace
            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
                %(fail)s;
            }
        }
        else
        {
            // Copy the input, use the copy as output
            Py_XDECREF(%(z_out)s);
            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
            if (!%(z_out)s)
            {
                %(fail)s;
            }
        }

        if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
        {
            %(fail)s;
        }
        """

        return sio.getvalue() % locals()
Example #21
    def c_code(self, node, nodename, inp, out, sub):
        x, b, y_idx = inp
        nll, sm, am = out
        classname = self.__class__.__name__
        fail = sub['fail']
        sio = StringIO()
        print >> sio, """
        if (CudaNdarray_NDIM(%(y_idx)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[0] !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
            || (CudaNdarray_HOST_DIMS(%(nll)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(nll)s)
            {
                %(fail)s;
            }
        }
        if ((NULL == %(sm)s)
            || (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
                CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(sm)s)[1] !=
                CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2,
                CudaNdarray_HOST_DIMS(%(x)s));
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc sm output");
                // no need to decref cnda_nll, the cleanup code should do it up
                %(fail)s;
            }
        }
        if ((NULL == %(am)s)
            || (CudaNdarray_HOST_DIMS(%(am)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(am)s);
            %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc am output");
                // no need to decref nll and sm,
                // the cleanup code should do it up
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                                    NUM_VECTOR_OP_BLOCKS);
     //TODO: launch more threads per row and do parallel sum and max reductions
            int n_threads = 1;
            int n_shared_bytes = 0; //n_threads * sizeof(float);

            k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
                CudaNdarray_HOST_DIMS(%(x)s)[0],
                CudaNdarray_HOST_DIMS(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(x)s),
                CudaNdarray_HOST_STRIDES(%(x)s)[0],
                CudaNdarray_HOST_STRIDES(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(b)s),
                CudaNdarray_HOST_STRIDES(%(b)s)[0],
                CudaNdarray_DEV_DATA(%(y_idx)s),
                CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
                CudaNdarray_DEV_DATA(%(nll)s),
                CudaNdarray_HOST_STRIDES(%(nll)s)[0],
                CudaNdarray_DEV_DATA(%(sm)s),
                CudaNdarray_HOST_STRIDES(%(sm)s)[0],
                CudaNdarray_HOST_STRIDES(%(sm)s)[1],
                CudaNdarray_DEV_DATA(%(am)s),
                CudaNdarray_HOST_STRIDES(%(am)s)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
                             "The kernel was launched with %%d threads,"
                             " %%d blocks and %%d shared memory\\n",
                             cudaGetErrorString(err),
                             n_threads, n_blocks, n_shared_bytes);
                // no need to decref output vars the cleanup code will do it
                %(fail)s;
            }
        }
        """ % locals()
        return sio.getvalue()
Example #22
def debugprint(obj, depth=-1, print_type=False,
               file=None, ids='CHAR', stop_on_name=False):
    """Print a computation graph as text to stdout or a file.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type print_type: boolean
    :param print_type: whether to print the type of printed objects
    :type file: None, 'str', or file-like object
    :param file: print to this file ('str' means to return a string)
    :type ids: str
    :param ids: How do we print the identifier of the variable
                id - print the python id value
                int - print integer character
                CHAR - print capital character
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.

    :returns: string if `file` == 'str', else file arg

    Each line printed represents a Variable in the graph.
    The indentation of lines corresponds to its depth in the symbolic graph.
    The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which case
    the Op is printed).
    The second part of the text is an identifier of the Variable.
    If print_type is True, we add a part containing the type of the Variable

    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time. Later, just the Variable
    identifier is printed.

    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds to.

    """
    if file == 'str':
        _file = StringIO()
    elif file is None:
        _file = sys.stdout
    else:
        _file = file
    done = dict()
    results_to_print = []
    order = []
    if isinstance(obj, (list, tuple)):
        lobj = obj
    else:
        lobj = [obj]
    for obj in lobj:
        if isinstance(obj, gof.Variable):
            results_to_print.append(obj)
        elif isinstance(obj, gof.Apply):
            results_to_print.extend(obj.outputs)
        elif isinstance(obj, Function):
            results_to_print.extend(obj.maker.fgraph.outputs)
            order = obj.maker.fgraph.toposort()
        elif isinstance(obj, gof.FunctionGraph):
            results_to_print.extend(obj.outputs)
            order = obj.toposort()
        elif isinstance(obj, (int, long, float, numpy.ndarray)):
            print obj
        elif isinstance(obj, (theano.In, theano.Out)):
            results_to_print.append(obj.variable)
        else:
            raise TypeError("debugprint cannot print an object of this type",
                            obj)

    scan_ops = []
    for r in results_to_print:
        # Add the parent scan op to the list as well
        if (hasattr(r.owner, 'op') and
            isinstance(r.owner.op, theano.scan_module.scan_op.Scan)):
            scan_ops.append(r)

        debugmode.debugprint(r, depth=depth, done=done, print_type=print_type,
                             file=_file, order=order, ids=ids,
                             scan_ops=scan_ops, stop_on_name=stop_on_name)
    if len(scan_ops) > 0:
        print >> _file, ""
        new_prefix = ' >'
        new_prefix_child = ' >'
        print >> _file, "Inner graphs of the scan ops:"

        for s in scan_ops:
            print >> _file, ""
            debugmode.debugprint(s, depth=depth, done=done,
                                 print_type=print_type,
                                 file=_file, ids=ids,
                                 scan_ops=scan_ops, stop_on_name=stop_on_name)
            if hasattr(s.owner.op, 'fn'):
                # If the op was compiled, print the optimized version.
                outputs = s.owner.op.fn.maker.fgraph.outputs
            else:
                outputs = s.owner.op.outputs
            for idx, i in enumerate(outputs):
                if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
                    if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
                        scan_ops.append(i)

                debugmode.debugprint(r=i, prefix=new_prefix,
                                     depth=depth, done=done,
                                     print_type=print_type, file=_file,
                                     ids=ids, stop_on_name=stop_on_name,
                                     prefix_child=new_prefix_child,
                                     scan_ops=scan_ops)

    if file is _file:
        return file
    elif file == 'str':
        return _file.getvalue()
    else:
        _file.flush()
Example #23
    def c_extract(self, name, sub, check_input=True, check_broadcast=True):
        sio = StringIO()
        fail = sub['fail']
        nd = self.ndim
        print >> sio, """
        assert(py_%(name)s->ob_refcnt >= 2); // There should be at least one ref from the container object,
        // and one ref from the local scope.

        if (CudaNdarray_Check(py_%(name)s))
        {
            //fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
            %(name)s = (CudaNdarray*)py_%(name)s;
            //std::cerr << "c_extract " << %(name)s << '\\n';
        """ % locals()
        if (check_input):
            print >> sio, """
                if (%(name)s->nd != %(nd)s)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "c_extract: Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s",
                                 %(name)s->nd);
                    %(name)s = NULL;
                    %(fail)s;
                }
                //std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
            """ % locals()
            for i, b in enumerate(self.broadcastable):
                if b and check_broadcast:
                    print >> sio, """
                if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "c_extract: Some CudaNdarray has dim %%i on broadcastable dimension %%i",
                                 CudaNdarray_HOST_DIMS(%(name)s)[%(i)s], %(i)s);
                    %(name)s = NULL;
                    %(fail)s;
                }
                //std::cerr << "c_extract " << %(name)s << "dim check %(i)s passed\\n";
                //std::cerr << "c_extract " << %(name)s << "checking bcast %(i)s <" << %(name)s->str<< ">\\n";
                //std::cerr << "c_extract " << %(name)s->str[%(i)s] << "\\n";
                if (CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s])
                {
                    //std::cerr << "c_extract bad stride detected...\\n";
                    PyErr_Format(PyExc_RuntimeError,
                                 "c_extract: Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i",
                                 CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s], %(i)s);
                    %(name)s = NULL;
                    %(fail)s;
                }
                //std::cerr << "c_extract " << %(name)s << "bcast check %(i)s passed\\n";
                    """ % locals()
            print >> sio, """
                assert(%(name)s);
                Py_INCREF(py_%(name)s);
            }
            else if (py_%(name)s == Py_None)
            {
                PyErr_SetString(PyExc_TypeError,
                                "expected a CudaNdarray, not None");
                %(name)s = NULL;
                %(fail)s;
            }
            else
            {
                //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
                PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
                %(name)s = NULL;
                %(fail)s;
            }
            //std::cerr << "c_extract done " << %(name)s << '\\n';
            """ % locals()
        else:
            print >> sio, """
                assert(%(name)s);
                Py_INCREF(py_%(name)s);
            }
            """ % locals()
        #print sio.getvalue()
        return sio.getvalue()
Example #25
def debugprint(obj,
               depth=-1,
               print_type=False,
               file=None,
               ids='CHAR',
               stop_on_name=False):
    """Print a computation graph as text to stdout or a file.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type print_type: boolean
    :param print_type: whether to print the type of printed objects
    :type file: None, 'str', or file-like object
    :param file: print to this file ('str' means to return a string)
    :type ids: str
    :param ids: How do we print the identifier of the variable
                id - print the python id value
                int - print integer character
                CHAR - print capital character
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.

    :returns: string if `file` == 'str', else file arg

    Each line printed represents a Variable in the graph.
    The indentation of lines corresponds to its depth in the symbolic graph.
    The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which case
    the Op is printed).
    The second part of the text is an identifier of the Variable.
    If print_type is True, we add a part containing the type of the Variable

    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time. Later, just the Variable
    identifier is printed.

    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds to.

    """
    if file == 'str':
        _file = StringIO()
    elif file is None:
        _file = sys.stdout
    else:
        _file = file
    done = dict()
    results_to_print = []
    order = []
    if isinstance(obj, gof.Variable):
        results_to_print.append(obj)
    elif isinstance(obj, gof.Apply):
        results_to_print.extend(obj.outputs)
    elif isinstance(obj, Function):
        results_to_print.extend(obj.maker.fgraph.outputs)
        order = obj.maker.fgraph.toposort()
    elif isinstance(obj, (list, tuple)):
        results_to_print.extend(obj)
    elif isinstance(obj, gof.FunctionGraph):
        results_to_print.extend(obj.outputs)
        order = obj.toposort()
    elif isinstance(obj, (int, long, float, numpy.ndarray)):
        print obj
    else:
        raise TypeError("debugprint cannot print an object of this type", obj)
    for r in results_to_print:
        debugmode.debugprint(r,
                             depth=depth,
                             done=done,
                             print_type=print_type,
                             file=_file,
                             order=order,
                             ids=ids,
                             stop_on_name=stop_on_name)
    if file is _file:
        return file
    elif file == 'str':
        return _file.getvalue()
    else:
        _file.flush()
Example #26
def test_debugprint():
    A = tensor.matrix(name='A')
    B = tensor.matrix(name='B')
    C = A + B
    C.name = 'C'
    D = tensor.matrix(name='D')
    E = tensor.matrix(name='E')

    F = D + E
    G = C + F

    # just test that it works
    debugprint(G)

    # test ids=int
    s = StringIO()
    debugprint(G, file=s, ids='int')
    s = s.getvalue()
    # The additional white space is needed!
    reference = '\n'.join([
        "Elemwise{add,no_inplace} [@0] ''   ",
        " |Elemwise{add,no_inplace} [@1] 'C'   ",
        " | |A [@2]",
        " | |B [@3]",
        " |Elemwise{add,no_inplace} [@4] ''   ",
        "   |D [@5]",
        "   |E [@6]",
    ]) + '\n'

    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference

    # test ids=CHAR
    s = StringIO()
    debugprint(G, file=s, ids='CHAR')
    s = s.getvalue()
    # The additional white space is needed!
    reference = "\n".join([
        "Elemwise{add,no_inplace} [@A] ''   ",
        " |Elemwise{add,no_inplace} [@B] 'C'   ",
        " | |A [@C]",
        " | |B [@D]",
        " |Elemwise{add,no_inplace} [@E] ''   ",
        "   |D [@F]",
        "   |E [@G]",
    ]) + '\n'

    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference

    # test ids=CHAR, stop_on_name=True
    s = StringIO()
    debugprint(G, file=s, ids='CHAR', stop_on_name=True)
    s = s.getvalue()
    # The additional trailing whitespace is needed!
    reference = '\n'.join([
        "Elemwise{add,no_inplace} [@A] ''   ",
        " |Elemwise{add,no_inplace} [@B] 'C'   ",
        " |Elemwise{add,no_inplace} [@C] ''   ",
        "   |D [@D]",
        "   |E [@E]",
    ]) + '\n'

    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference

    # test ids=
    s = StringIO()
    debugprint(G, file=s, ids='')
    s = s.getvalue()
    # The additional trailing whitespace is needed!
    reference = '\n'.join([
        "Elemwise{add,no_inplace}  ''   ",
        " |Elemwise{add,no_inplace}  'C'   ",
        " | |A ",
        " | |B ",
        " |Elemwise{add,no_inplace}  ''   ",
        "   |D ",
        "   |E ",
    ]) + '\n'
    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference
Example #27
    def c_code(self, node, nodename, inputs, outputs, sub):
        d = dict(sub)
        nd = node.outputs[0].type.ndim
        d.update(locals())
        sio = StringIO()
        nin = len(inputs)
        nout = len(outputs)
        fail = sub['fail']
        opname = str(self.scalar_op)
        initial_dims = ','.join('1' for i in xrange(nd))
        if 1 or self.scalar_op == scalar.pow:
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s START\\n";
        //standard elemwise size checks
            """ % locals()
        if nd > 0:
            print >> sio, """
            int dims[%(nd)s] = {%(initial_dims)s};
            """ % locals()
        else:
            print >> sio, """
            int *dims = NULL;
            """

        #declare the broadcast pattern of each input
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is node.inputs[id]
                continue

            # with python 2.4 (at least), if a broadcastable pattern is made of
            # numpy.bool_ instead of bool, calling int() once is not enough.
            broadcasts = map(int, map(int, node.inputs[id].broadcastable))
            broadcasts = ', '.join(map(str, broadcasts))
            nd = node.inputs[id].ndim
            if nd > 0:
                print >> sio, """
                int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
                """ % locals()
            else:
                print >> sio, """
                int *broadcasts_%(iname)s = NULL;
                """ % locals()
            emitted_inames[iname] = node.inputs[id]

        #check that all inputs have valid dimensions
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != %(iname)s->nd)
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i", %(iname)s->nd);
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(id)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             CudaNdarray_HOST_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True

        #check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            if idx not in self.inplace_pattern.keys():
                print >> sio, """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = (CudaNdarray*)CudaNdarray_New();
            if (!%(oname)s)
            {
                //error string already set
                %(fail)s;
            }
            if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
            {
                //error string already set
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                print >> sio, """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             CudaNdarray_HOST_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()

        print >> sio, """
        {
            //new block so that failure gotos don't skip over variable initialization
            //std::cerr << "calling callkernel\\n";
            if (callkernel_%(nodename)s(1, 0, dims
            """ % locals()
        for iname in inputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
            """ % locals()
        for oname in outputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
            """ % locals()
        print >> sio, """
                        ))
            {
                 // error
            """
        for oname in outputs:
            print >> sio, """
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                """ % locals()
        print >> sio, """
                %(fail)s;
            }
            else // no error
            {
            }
        }
        //std::cerr << "C_CODE %(opname)s END\\n";
        """ % locals()
        #print sio.getvalue()
        return sio.getvalue()
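
The shape negotiation in the generated C above can be sketched in plain Python (a simplified model, not the Theano API; names are illustrative):

def negotiate_output_dims(shapes, broadcastables):
    """Mimic the generated checks: grow each output dim from 1, rejecting
    any input whose non-broadcastable extent disagrees with it."""
    nd = len(shapes[0])
    dims = [1] * nd
    for shape, bcast in zip(shapes, broadcastables):
        for i in range(nd):
            if dims[i] == 1:
                dims[i] = shape[i]
            if not (bcast[i] and shape[i] == 1) and dims[i] != shape[i]:
                raise ValueError("input dimension mis-match on axis %d" % i)
    return dims

# e.g. negotiate_output_dims([(3, 1), (3, 4)], [(False, True), (False, False)])
# returns [3, 4]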
Example #28
    def c_code(self, node, name, inputs, outputs, sub):
        #z_out = alpha * dot(x,y) + beta * z_in
        #inplace version: set z_out = z_in
        #not inplace version: copy z_in to z_out
        z_in, a, x, y, b = inputs
        z_out, = outputs
        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO()

        print >> sio, """

        #define REAL float
        float %(name)s_a = (PyArray_TYPE(%(a)s) == NPY_FLOAT)
        ? (REAL)(((float*)%(a)s->data)[0])
        : (REAL)(((double*)%(a)s->data)[0]);

        float %(name)s_b = (PyArray_TYPE(%(b)s) == NPY_FLOAT) ?
        (REAL)(((float*)%(b)s->data)[0])
        : (REAL)(((double*)%(b)s->data)[0]);
        #undef REAL

        if (%(inplace)s
            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
            && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
                || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
        {
            // The input has an appropriate layout, we work inplace
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
        }
        else if (%(z_out)s
                && (%(z_out)s->nd == 2)
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
                    == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
                    == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
                && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
                    || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
        {
            // The existing output has an appropriate layout,
            // copy the input data into it, then work inplace
            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
                %(fail)s;
            }
        }
        else
        {
            // Copy the input, use the copy as output
            Py_XDECREF(%(z_out)s);
            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
            if (!%(z_out)s)
            {
                %(fail)s;
            }
        }

        if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s))
        {
            %(fail)s;
        }
        """

        return sio.getvalue() % locals()
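
For reference, the update delegated to CudaNdarray_gemm at the end is the standard GEMM; a NumPy sketch of the same semantics (illustrative only, not the CUDA implementation):

import numpy as np

def gemm(z_in, alpha, x, y, beta, inplace=False):
    """z_out = alpha * dot(x, y) + beta * z_in, optionally reusing z_in."""
    z_out = z_in if inplace else z_in.copy()
    z_out *= beta
    z_out += alpha * np.dot(x, y)
    return z_out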
Example #29
    def c_src_kernel(self, node, nodename, nd):
        sio = StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(node.inputs):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
            self.scalar_op.__class__.__name__, nodename, nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i
                                           for i in xrange(nd))
        #declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data" % ipos] +
                          ["int i%i_str_%i" % (ipos, d) for d in xrange(nd)])
            print >> sio, "\t,", s
        #declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data" % ipos] +
                          ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
            print >> sio, "\t,", s
            #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
            #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
        print >> sio, "    const int numThreads = blockDim.x * gridDim.x;"

        # For each input that is a scalar which has been broadcasted to a tensor,
        #     load it into a local variable
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print >> sio, "    const float ii_i%i_value = i%i_data[0];" % (
                    ipos, ipos)

        #loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # calculate the data pointers for all arguments
        print >> sio, "        int ii = i;"
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print >> sio, "        const float * ii_i%i_data = i%i_data;" % (
                    ipos, ipos)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "        float * ii_o%i_data = o%i_data;" % (ipos,
                                                                       ipos)
        for d in xrange(nd - 1, -1, -1):
            if d > 0:
                print >> sio, "        int pos%i = ii %% dim%i;" % (d, d)
                print >> sio, "        ii = ii / dim%i;" % d
            else:
                print >> sio, "        int pos%i = ii;" % d

            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print >> sio, "        ii_i%i_data += pos%i * i%i_str_%i;" % (
                        ipos, d, ipos, d)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (
                    ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        #TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(self.scalar_op, [
                scalar.Scalar(dtype=input.type.dtype).make_variable()
                for input in node.inputs
            ], [
                scalar.Scalar(dtype=output.type.dtype).make_variable()
                for output in node.outputs
            ]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node),
            ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print >> sio, "       ", task_code
        print >> sio, "    }"

        #indent = " "*(4*d+7)
        #for ipos, i in enumerate(node.inputs):
        #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print >> sio, "}"

        #print sio.getvalue()
        return sio.getvalue()
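
The index arithmetic emitted inside the loop (peel the flat element index into per-dimension positions, then advance each pointer by position * stride) can be modelled in Python (illustrative, not Theano code):

def strided_offset(flat_index, dims, strides):
    """Offset in elements of the element numbered flat_index, peeling
    dimensions from the last, exactly as the generated kernel does."""
    offset = 0
    ii = flat_index
    for d in range(len(dims) - 1, 0, -1):
        offset += (ii % dims[d]) * strides[d]
        ii //= dims[d]
    return offset + ii * strides[0]  # pos0 is whatever remains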
Example #30
    def c_src_callkernel(self, node, nodename):
        #
        # This function serves three main goals:
        #
        # The first is stride unpacking:
        # it accepts input and output arguments as
        #    float *, int *
        # pairs, and it constructs a kernel function call where inputs and
        # outputs are passed as
        #    float *, int, int, int ...
        #
        # The second is to recognize when any dimensions can be collapsed as
        # being contiguous. That means we can merge those dimensions with
        # another one for all inputs/outputs and get the same results
        # (confusing... read the code).
        #
        # The third is to make a special case for scalar elements: we allow
        # them to be collapsed. In both the c-contiguous and non-contiguous
        # cases, we use registers to lower the number of memory accesses.

        #TODO: make a special case for broadcasting, to store the data in shared memory.

        nd = node.outputs[0].type.ndim
        nb_inputs = len(node.inputs)
        nb_outputs = len(node.outputs)
        d = dict()
        #input_params and output_params go into the function declaration/definition
        input_params = ", ".join(
            "const float * i%i_data, const int * i%i_str" % (ipos, ipos)
            for ipos in xrange(len(node.inputs)))
        output_params = ", ".join("float * o%i_data, const int * o%i_str" %
                                  (ipos, ipos)
                                  for ipos in xrange(len(node.outputs)))

        #input_args and output_args go into the recursive call.
        input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos)
                               for ipos in xrange(len(node.inputs)))
        output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos)
                                for ipos in xrange(len(node.outputs)))

        prod_dims = '*'.join(["dims[%i]" % di for di in xrange(nd)] + ['1'])

        scalar_op = self.scalar_op.__class__.__name__

        sio = StringIO()
        print >> sio, """
        static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[])
        {
            //can we collapse dims[i] and dims[i-1]
            for(int i=nd-1;i>0;i--){
                if(strides[i]*dims[i]==strides[i-1]){//dimensions i-1 and i are contiguous with each other
                    collapse[i]=1;
                }else collapse[i]=0;
            }
        }
        """ % locals()
        print >> sio, """
        static int callkernel_%(nodename)s(unsigned int numEls, const int d,
            const int * dims,
            %(input_params)s,
            %(output_params)s)
        {
            numEls = %(prod_dims)s;
        """ % locals()
        if self.verbose:
            print >> sio, """
                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s     w numEls" << numEls << " dims"<< d << "\\n";
            """ % locals()
            print >> sio, 'std::cerr << ' + " << ' ' <<  ".join(
                ['"  "'] + list("dims[%i]" % di
                                for di in xrange(nd)) + ["'\\n';"])
        if self.verbose > 1:
            for ipos in xrange(len(node.inputs)):
                print >> sio, """
                std::cerr << "   %(ipos)s data strides" <<
                """ % locals() + " << ' ' <<  ".join(
                    ["i%s_data" % ipos] +
                    list("i%s_str[%i]" % (ipos, di)
                         for di in xrange(nd))) + ''' << "\\n"; '''

            for ipos in xrange(len(node.outputs)):
                print >> sio, """
                std::cerr << "   %(ipos)s data strides" <<
                """ % locals() + " << ' ' <<  ".join(
                    ["o%s_data" % ipos] +
                    list("o%s_str[%i]" % (ipos, di)
                         for di in xrange(nd))) + ''' << "\\n"; '''
    # collapse dimensions that are broadcast in all inputs.
    # This needs to be done before the contiguous collapse, as that one
    # would break it. Update both the dimensions and the strides.
        if nd > 0:
            print >> sio, "int local_dims[%(nd)s];" % locals()
        else:
            print >> sio, "int *local_dims=NULL;"
        if nb_inputs > 0 and nd > 0:
            print >> sio, """
            int local_str[%(nb_inputs)s][%(nd)s];
            int local_ostr[%(nb_outputs)s][%(nd)s];
            """ % locals()
        else:
            print >> sio, """
            int local_str[1][1];
            int local_ostr[1][1];
            """
        print >> sio, """
        int nd_collapse = %(nd)s;
        for(int i=0;i<%(nd)s;i++){//init new dim
          local_dims[i]=dims[i];
        }
        """ % locals()
        for ipos in xrange(len(node.inputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_str[%(ipos)s][i]=i%(ipos)s_str[i];
            }
            """ % locals()
        for ipos in xrange(len(node.outputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
            }
            """ % locals()
        if self.verbose > 2:
            print >> sio, 'std::cerr <<"before broadcast collapse\\n";'
            print >> sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals(
                )
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(node.inputs)):
                print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<'%locals() + \
                        ' << " " << '.join(["local_str[%s][%s]"% (ipos, x) for x in xrange(nd)])+'<<"\\n";'
            for ipos in xrange(len(node.outputs)):
                print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals() + \
                        ' << " " << '.join(["local_ostr[%s][%s]"% (ipos, x) for x in xrange(nd)])+'<<"\\n";'

        print >> sio, """
        for(int id=0;id<nd_collapse;id++){

          bool all_broadcast=true;
          for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
            if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          for(int input_id=0;input_id<%(nb_outputs)s;input_id++){
            if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          if(all_broadcast){
            for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
            for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_str[input_id][j-1]=local_str[input_id][j];
              }
            }
            for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_ostr[output_id][j-1]=local_ostr[output_id][j];
              }
            }
            nd_collapse--; id--;
          }
        }
        """ % locals()

        if self.verbose > 2:
            print >> sio, 'std::cerr <<"after broadcast collapse\\n";'
            print >> sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals(
                )
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(node.inputs)):
                print >> sio, 'std::cerr << " local_str %(ipos)s: " <<' % locals(
                ) + ' << " " << '.join(
                    ["local_str[%s][%s]" % (ipos, x)
                     for x in xrange(nd)]) + '<<"\\n";'
            for ipos in xrange(len(node.outputs)):
                print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<' % locals(
                ) + ' << " " << '.join(
                    ["local_ostr[%s][%s]" % (ipos, x)
                     for x in xrange(nd)]) + '<<"\\n";'
    # collapse contiguous dimensions (ignoring scalars; generic version that
    # can collapse any dimensions: right, left or middle).
    # This is a good idea because it means fewer index calculations on the GPU.

        if nd > 0:
            print >> sio, "int nd_collapse_[%(nd)s] = {" % locals() + ','.join(
                ['1' for x in xrange(nd)]) + "};"
        else:
            print >> sio, "int *nd_collapse_ = NULL;"
        for ipos in xrange(len(node.inputs)):
            if not _logical_scalar(node.inputs[ipos]):
                if nd > 0:
                    print >> sio, """
                        int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals(
                    ) + ','.join(['1' for x in xrange(nd)]) + "};"
                else:
                    print >> sio, """
                        int *nd_collapse_%(ipos)s = NULL;""" % locals()
                print >> sio, """
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
if(nd_collapse_%(ipos)s[i]==0)
nd_collapse_[i]=0;
}
                """ % locals()
                if self.verbose > 1:
                    print >> sio, """
                    std::cerr<< "nd_collapse_%(ipos)s "<<
                    """ % locals()
                    print >> sio, ' << " " << '.join([
                        "nd_collapse_%s[" % ipos + str(i) + "]"
                        for i in xrange(nd)
                    ])
                    print >> sio, '<< "\\n";'

    # update the local strides.
        for ipos in xrange(len(node.inputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
                }
            }
            """ % locals()

        for ipos in xrange(len(node.outputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
                }
            }
            """ % locals()

    # update the local dims.
        print >> sio, """
        for(int i=nd_collapse-1;i>0;i--){
          if(nd_collapse_[i]==1){
            local_dims[i-1]*=local_dims[i];//set new dims
            for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
          }
        }
        """ % locals()

        #update the new number of dimensions
        print >> sio, """
        for(int i=1, end=nd_collapse;i<end;i++){
          if(nd_collapse_[i]==1)nd_collapse--;
        }
        if(nd_collapse == 1 """ % locals()
        l = [
            "local_str[%s][nd_collapse-1]==1 " % ipos
            for ipos in xrange(len(node.inputs))
            if not _logical_scalar(node.inputs[ipos])
        ]
        l += [
            "local_ostr[%s][nd_collapse-1]==1 " % ipos
            for ipos in xrange(len(node.outputs))
            if not _logical_scalar(node.outputs[ipos])
        ]
        if len(l) > 0:
            print >> sio, " && ", " && ".join(l)
        print >> sio, """){nd_collapse=0;} """

        if self.verbose:
            print >> sio, 'std::cerr <<"after can_collapse\\n";'
            print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals(
            )
        if self.verbose > 1:
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals(
                )
            print >> sio, 'std::cerr << "\\n";'

            for ipos in xrange(len(node.inputs)):
                print >> sio, 'std::cerr << " local_str %(ipos)s: " <<' % locals(
                ) + ' << " " << '.join(
                    ["local_str[%s][%s]" % (ipos, x)
                     for x in xrange(nd)]) + '<<"\\n";'
            for ipos in xrange(len(node.outputs)):
                print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<' % locals(
                ) + ' << " " << '.join(
                    ["local_ostr[%s][%s]" % (ipos, x)
                     for x in xrange(nd)]) + '<<"\\n";'

        def launch_Ccontiguous(nodename, scalar_op, sync=True):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(node.inputs)):
                kernel_call_args.append("i%i_data" % ipos)
            for ipos in xrange(len(node.outputs)):
                kernel_call_args.append("o%i_data" % ipos)
            kernel_call_args = ", ".join(kernel_call_args)
            verb = ""
            if self.verbose:
                verb = 'std::cerr << "   Running ccontiguous version\\n";'
            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls,  (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
                kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);

                //std::cerr << "calling callkernel returned\\n";
                """ % locals()
            if sync:
                print >> sio, """
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n    n_blocks=%%i threads_per_block=%%i\\n   Call: %%s\\n",
                         "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
                         n_blocks, threads_per_block,
                         "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;

                }
                %(verb)s
                return 0;
                """ % locals()
            else:
                print >> sio, " return 0; " % locals()

        def launch_General(nodename, scalar_op, force_nd, sync=True):
            # kernel_call_args are used to invoke the cuda kernel
            local = "local_"
            kernel_call_args = ["numEls"]
            kernel_call_args.extend(local + "dims[%i]" % di
                                    for di in xrange(force_nd))
            for ipos in xrange(len(node.inputs)):
                kernel_call_args += ["i%i_data" % ipos] + list(
                    local + "str[%i][%i]" % (ipos, di)
                    for di in xrange(force_nd))
                #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
            for ipos in xrange(len(node.outputs)):
                kernel_call_args += ["o%i_data" % ipos] + list(
                    local + "ostr[%i][%i]" % (ipos, di)
                    for di in xrange(force_nd))
                #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
            if self.verbose:
                print >> sio, """
                    std::cerr << "   Running general version with %(force_nd)s  dims\\n";
                    """ % locals()
                print >> sio, "std::cerr << " + ' << " " << '.join(
                    kernel_call_args) + ' << "\\n";'
                #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;

            kernel_call_args = ", ".join(kernel_call_args)

            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);

                kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
                """ % locals()
            if sync:
                print >> sio, """
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n    n_blocks=%%i threads_per_block=%%i\\n   Call: %%s\\n",
                         "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
                         n_blocks, threads_per_block,
                         "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;

                }
                return 0;
                """ % locals()
            else:
                print >> sio, " return 0; " % locals()

        print >> sio, "if(numEls==0) return 0;"
        print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" % locals(
        )
        print >> sio, "case 0: {"
        launch_Ccontiguous(nodename, scalar_op, self.sync)
        print >> sio, "        } break;"
        for i in xrange(1, nd + 1):
            print >> sio, "case " + str(i) + ": {"
            launch_General(nodename, scalar_op, i, self.sync)
            print >> sio, "        } break;"

        print >> sio, "}"  #end case
        print >> sio, "return -2;"  # should not get to this point
        print >> sio, "}"  #end fct

        #N.B. cudaGetLastError is called by c_code
        return sio.getvalue()
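
The collapse test emitted above as can_collapse_<nodename> marks a dimension as mergeable with its left neighbour when the two are laid out contiguously; the same predicate in Python (illustrative):

def can_collapse(dims, strides):
    """collapse[i] is True when dims i-1 and i can be merged: stepping
    dims[i] times by strides[i] lands exactly one strides[i-1] away."""
    collapse = [False] * len(dims)
    for i in range(len(dims) - 1, 0, -1):
        collapse[i] = (strides[i] * dims[i] == strides[i - 1])
    return collapse

# e.g. a C-contiguous (2, 3) array has element strides (3, 1):
# can_collapse((2, 3), (3, 1)) -> [False, True], so it collapses to 1-d.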
Example #31
    def c_src_kernel_tiling_less_registers(self, node, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        nd = node.outputs[0].type.ndim
        n_in = len(node.inputs)
        n_out = len(node.outputs)
        sio = StringIO()

        if nd not in (2, ):
            return sio.getvalue()

        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(node.inputs):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
            self.scalar_op.__class__.__name__, nodename,
            'tiling%i_less_registers' % nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i
                                           for i in xrange(nd))
        #declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data_0" % ipos] +
                          list("int i%i_str_%i" % (ipos, d)
                               for d in xrange(nd)))
            print >> sio, "\t,", s
        #declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data_0" % ipos] +
                          list("int o%i_str_%i" % (ipos, d)
                               for d in xrange(nd)))
            print >> sio, "\t,", s
            #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
            #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"

        # TODO: Setting these to true makes the function fail SOMETIMES.  I don't know why yet.
        use_shared_stride = False
        use_shared_limits = False

        def decl_limits(nd):
            if use_shared_limits:
                print >> sio, "__shared__ float * limits[%(nd)s];" % locals()

        def stride(io, p, d):
            if use_shared_stride:
                return "s%s_str[%i][%i]" % (io, p, d)
            else:
                return "%s%i_str_%i" % (io, p, d)

        def limits(d):
            if use_shared_limits:
                return "limits[%i]" % d
            else:
                return "limits%i" % d

        def decl_shared_stride(nin, nout, nd):
            if not use_shared_stride:
                return
            print >> sio, """
            __shared__ int si_str[%(nin)s][%(nd)s];
            __shared__ int so_str[%(nout)s][%(nd)s];
            if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
            """ % locals()
            for i in xrange(nin):
                for d in xrange(nd):
                    print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals(
                    )
            for i in xrange(n_out):
                for d in xrange(nd):
                    print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals(
                    )
            print >> sio, "} __syncthreads();"

        def calc_limit(d):
            s = stride('o', 0, d)
            lname = limits(d)
            if use_shared_limits:
                print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
                if d == 0:
                    print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(
                    )
                else:
                    dm1 = d - 1
                    print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(
                    )
                print >> sio, "} __syncthreads();"
            else:
                if d == 0:
                    print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals(
                    )
                else:
                    dm1 = d - 1
                    print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals(
                    )

        def decl_ptrs(d, offset):
            dm1 = d - 1
            assert dm1 >= 0
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(
                )
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals(
                )

        def inc_ptrs(d, amt):
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(
                )
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals(
                )

        def while_limit(d):
            lname = limits(d)
            print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals()

        def end_while(d):
            print >> sio, "}"

        def task_code(d):
            print >> sio, self.scalar_op.c_code(
                Apply(self.scalar_op, [
                    scalar.Scalar(dtype=input.type.dtype).make_variable()
                    for input in node.inputs
                ], [
                    scalar.Scalar(dtype=output.type.dtype).make_variable()
                    for output in node.outputs
                ]),
                nodename + '_scalar_', [
                    'i%i_data_%i[0]' % (ipos, d)
                    for ipos, i in enumerate(node.inputs)
                ], [
                    'o%i_data_%i[0]' % (ipos, d)
                    for ipos, i in enumerate(node.outputs)
                ],
                sub=dict(fail='return;'))  #TODO: set a failure code somehow!!!

        if nd == 4:
            decl_shared_stride(n_in, n_out, nd)
            decl_limits(nd)
            calc_limit(0)
            inc_ptrs(0, 'blockIdx.x')
            while_limit(0)
            if 1:
                calc_limit(1)
                decl_ptrs(1, 'blockIdx.y')
                while_limit(1)
                if 1:
                    calc_limit(2)
                    decl_ptrs(2, 'threadIdx.y')
                    while_limit(2)
                    if 1:
                        calc_limit(3)
                        decl_ptrs(3, 'threadIdx.x')
                        while_limit(3)
                        if 1:
                            task_code(3)
                            inc_ptrs(3, 'blockDim.x')
                        end_while(3)
                        inc_ptrs(2, 'blockDim.y')
                    end_while(2)
                    inc_ptrs(1, 'gridDim.y')
                end_while(1)
                inc_ptrs(0, 'gridDim.x')
            end_while(0)

        print >> sio, "}"
        print sio.getvalue()
        return sio.getvalue()
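
Each while-loop this generator emits advances a raw pointer until it passes a precomputed limit (base + dim * stride). A Python model of one loop level, assuming positive strides just as the generated code does:

def pointer_walk(base, dim, stride, start, step):
    """Yield the offsets one CUDA thread visits at one loop level."""
    limit = base + dim * stride
    p = base + start * stride
    while p < limit:
        yield p
        p += step * stride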
Example #32
    def c_code(self, node, name, inputs, outputs, sub):
        #z_out = alpha * outer(x,y) + z_in  (rank-1 *GER update)
        #inplace version: set z_out = z_in
        #not inplace version: copy z_in to z_out
        z_in, a, x, y = inputs
        z_out, = outputs
        inplace = int(self.inplace)
        fail = sub['fail']
        sio = StringIO()

        print >> sio, """
        float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0];

        if (%(inplace)s
            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0)
            && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0)
            && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1)
                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1)
                || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1)
                || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1)))
        {
            // The input has an appropriate layout, we work inplace
            Py_XDECREF(%(z_out)s);
            %(z_out)s = %(z_in)s;
            Py_INCREF(%(z_out)s);
        }
        else if (%(z_out)s
                && (%(z_out)s->nd == 2)
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[0]
                    == CudaNdarray_HOST_DIMS(%(z_in)s)[0])
                && (CudaNdarray_HOST_DIMS(%(z_out)s)[1]
                    == CudaNdarray_HOST_DIMS(%(z_in)s)[1])
                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0)
                && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0)
                && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1)
                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1)
                    || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1)
                    || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1)))
        {
            // The existing output has an appropriate layout,
            // copy the input data into it, then work inplace
            if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s))
            {
                %(fail)s;
            }
        }
        else
        {
            // Copy the input, use the copy as output
            Py_XDECREF(%(z_out)s);
            %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s);
            if (!%(z_out)s)
            {
                %(fail)s;
            }
        }

        if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s))
        {
            %(fail)s;
        }
        """
        return sio.getvalue() % locals()
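
CudaNdarray_sger, called at the end, performs the BLAS rank-1 (GER) update; a NumPy sketch of the semantics (illustrative):

import numpy as np

def ger(alpha, x, y, z_in, inplace=False):
    """z_out = alpha * outer(x, y) + z_in, optionally reusing z_in."""
    z_out = z_in if inplace else z_in.copy()
    z_out += alpha * np.outer(x, y)
    return z_out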
Example #34
    def c_src_kernel_tiling(self, node, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        #The kernel is intended to be structured roughly like this:
        """
        static __global__ void kernel()
        {
            for (int v = blockIdx.x; v < dim0; v += gridDim.x)
            {
                for (int w = blockIdx.y; w < dim1; w += gridDim.y)
                {
                    for (int x = threadIdx.x; x < dim2; x += blockDim.x)
                    {
                        for (int y = threadIdx.y; y < dim3; y += blockDim.y)
                        {
                            for (int z = threadIdx.z; z < dim4; z += blockDim.z)
                            {
                                out[v * out_stride[0] + ...] = f(in1[...],  in2[...])
                            }
                        }
                    }
                }
            }
        }

        """

        nd = node.outputs[0].type.ndim
        sio = StringIO()
        #print 'C_SRC_KERNEL', sio.getvalue()

        if nd in (4, ):
            # print some leading comments to make the code easier to read
            for ipos, i in enumerate(node.inputs):
                print >> sio, "//    Input  ", ipos, str(i.type)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "//    Output ", ipos, str(i.type)
            print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
                self.scalar_op.__class__.__name__, nodename, 'tiling%i' % nd)
            if (nd):
                print >> sio, "\t,", ", ".join("const int dim%i" % i
                                               for i in xrange(nd))
            #declare inputs
            for ipos, i in enumerate(node.inputs):
                s = ", ".join(["const float * i%i_data" % ipos] +
                              list("int i%i_str_%i" % (ipos, d)
                                   for d in xrange(nd)))
                print >> sio, "\t,", s
            #declare outputs
            for ipos, i in enumerate(node.outputs):
                s = ", ".join(["float * o%i_data" % ipos] +
                              list("int o%i_str_%i" % (ipos, d)
                                   for d in xrange(nd)))
                print >> sio, "\t,", s
                #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
                #print >> sio, "\t,", "float * o%i_data" % ipos
            print >> sio, "\t)\n{"

            # For each input that is a scalar which has been broadcasted to a tensor,
            #     load it into a local variable
            print >> sio, "    __shared__ float value0[%i];" % len(node.inputs)
            print >> sio, "    __shared__ int shared_dims[%(nd)s];" % locals()
            #print >> sio, "    __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
            print >> sio, "    if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
            for ipos, i in enumerate(node.inputs):
                if _logical_scalar(i):
                    print >> sio, "    value0[%i] = i%i_data[0];" % (ipos,
                                                                     ipos)
            for ipos in xrange(nd):
                print >> sio, "    shared_dims[%i] = dim%i;" % (ipos, ipos)
            print >> sio, "    }"
            print >> sio, "    __syncthreads();"

            if (nd == 4):
                print >> sio, """
                for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
                {
                    for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                    {
                        //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                        for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                        {
                            //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                            for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                            {
                """
            else:
                raise NotImplementedError()

            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print >> sio, "        const float * ii_i%i_data = i%i_data;" % (
                        ipos, ipos)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "        float * ii_o%i_data = o%i_data;" % (
                    ipos, ipos)
            for d in xrange(nd):
                for ipos, i in enumerate(node.inputs):
                    if not _logical_scalar(i):
                        print >> sio, "        ii_i%i_data += pos%i * i%i_str_%i;" % (
                            ipos, d, ipos, d)
                for ipos, i in enumerate(node.outputs):
                    print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (
                        ipos, d, ipos, d)

            # perform the scalar operation on the input and output references
            #TODO: What if the scalar_op needs support_code??
            task_code = self.scalar_op.c_code(
                Apply(self.scalar_op, [
                    scalar.Scalar(dtype=input.type.dtype).make_variable()
                    for input in node.inputs
                ], [
                    scalar.Scalar(dtype=output.type.dtype).make_variable()
                    for output in node.outputs
                ]),
                nodename + '_scalar_',
                get_str_list_logical_scalar(node, value_str='value0[%i]'), [
                    'ii_o%i_data[0]' % ipos
                    for ipos, i in enumerate(node.outputs)
                ],
                sub=dict(fail='return;'))  #TODO: set a failure code somehow!!!
            print >> sio, "       ", task_code

            print >> sio, "    }" * nd

            #TODO: insert runtime stride checks that select the best loop order either here, or in
            # the host code that launched the  kernel (host code probably better spot)

            #indent = " "*(4*d+7)
            #for ipos, i in enumerate(node.inputs):
            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
            print >> sio, "}"

        print sio.getvalue()
        return sio.getvalue()
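
The four nested loops emitted for nd == 4 give each CUDA thread a strided subset of positions: blocks cover dims 0-1 and threads cover dims 2-3. A Python model of the positions one thread visits (the grid/block coordinates are hypothetical parameters here):

def tile_positions(dims, block_idx, grid_dim, thread_idx, block_dim):
    """Note that threadIdx.y walks dim 2 and threadIdx.x walks dim 3,
    matching the loops emitted above."""
    for pos0 in range(block_idx[0], dims[0], grid_dim[0]):
        for pos1 in range(block_idx[1], dims[1], grid_dim[1]):
            for pos2 in range(thread_idx[1], dims[2], block_dim[1]):
                for pos3 in range(thread_idx[0], dims[3], block_dim[0]):
                    yield (pos0, pos1, pos2, pos3)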
Example #35
0
    def c_src_kernel_tiling(self, node, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        # The kernel is intended to be structured roughly like this:
        """
        static __global__ void kernel()
        {
            for (int v = blockIdx.y; v < dim0; v += gridDim.x)
            {
                for (int w = blockIdx.y; w < dim1; w += gridDim.y)
                {
                    for (int x = threadIdx.x; x < dim2; x += blockDim.x)
                    {
                        for (int y = threadIdx.y; y < dim3; y += blockDim.y)
                        {
                            for (int z = threadIdx.z; z < dim4; z += blockDim.z)
                            {
                                out[v * out_stride[0] + ...] = f(in1[...],  in2[...])
                            }
                        }
                    }
                }
            }
        }

        """

        nd = node.outputs[0].type.ndim
        sio = StringIO()
        # print 'C_SRC_KERNEL', sio.getvalue()

        if nd in (4,):
            # print some leading comments to make the code easier to read
            for ipos, i in enumerate(node.inputs):
                print >> sio, "//    Input  ", ipos, str(i.type)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "//    Output ", ipos, str(i.type)
            print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
                    self.scalar_op.__class__.__name__,
                    nodename,
                    'tiling%i'%nd)
            if (nd):
                print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
            # declare inputs
            for ipos, i in enumerate(node.inputs):
                s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
                print >> sio, "\t,", s
            # declare outputs
            for ipos, i in enumerate(node.outputs):
                s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
                print >> sio, "\t,", s
                #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
                #print >> sio, "\t,", "float * o%i_data" % ipos
            print >> sio, "\t)\n{"

            # For each input that is a scalar which has been broadcasted to a tensor,
            #     load it into a local variable
            print >> sio, "    __shared__ float value0[%i];" % len(node.inputs)
            print >> sio, "    __shared__ int shared_dims[%(nd)s];" % locals()
            #print >> sio, "    __shared__ int shared_i_str[%(n_in)s][%(nd)s]"
            print >> sio, "    if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
            for ipos, i in enumerate(node.inputs):
                if _logical_scalar(i):
                    print >> sio, "    value0[%i] = i%i_data[0];" % (ipos, ipos)
            for ipos in xrange(nd):
                print >> sio, "    shared_dims[%i] = dim%i;" % (ipos, ipos)
            print >> sio, "    }"
            print >> sio, "    __syncthreads();"

            if (nd == 4):
                print >> sio, """
                for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x)
                {
                    for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y)
                    {
                        //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x)
                        for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y)
                        {
                            //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y)
                            for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x)
                            {
                """
            else:
                raise NotImplementedError()

            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print >> sio, "        const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "        float * ii_o%i_data = o%i_data;" % (ipos, ipos)
            for d in xrange(nd):
                for ipos, i in enumerate(node.inputs):
                    if not _logical_scalar(i):
                        print >> sio, "        ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
                for ipos, i in enumerate(node.outputs):
                    print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)

            # perform the scalar operation on the input and output references
            # TODO: What if the scalar_op needs support_code??
            task_code = self.scalar_op.c_code(
                    Apply(self.scalar_op,
                        [scalar.Scalar(dtype=input.type.dtype).make_variable()
                         for input in node.inputs],
                        [scalar.Scalar(dtype=output.type.dtype).make_variable()
                         for output in node.outputs])
                    , nodename + '_scalar_'
                    , get_str_list_logical_scalar(node, value_str='value0[%i]')
                    , ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)]
                    , sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
            print >> sio, "       ", task_code

            print >> sio, "    }" * nd

            # TODO: insert runtime stride checks that select the best loop order either here, or in
            # the host code that launched the  kernel (host code probably better spot)

            #indent = " "*(4*d+7)
            # for ipos, i in enumerate(node.inputs):
                #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
            print >> sio, "}"

        print sio.getvalue()
        return sio.getvalue()
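# A minimal Python sketch (not Theano code) of the strided addressing the
# generated kernel above performs: the element at multi-index `pos` of a
# tensor with per-dimension element strides `strides` lives at
# data + sum(pos[d] * strides[d]).
def _strided_offset(pos, strides):
    return sum(p * s for p, s in zip(pos, strides))

assert _strided_offset((1, 2), (3, 1)) == 5  # element [1][2] of a C-contiguous (N, 3) tensor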
Example #36
    def c_extract(self, name, sub, check_input=True):
        sio = StringIO()
        fail = sub['fail']
        nd = self.ndim
        print >> sio, """
        assert(py_%(name)s->ob_refcnt >= 2); // There should be at least one ref from the container object,
        // and one ref from the local scope.

        if (CudaNdarray_Check(py_%(name)s))
        {
            //fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
            %(name)s = (CudaNdarray*)py_%(name)s;
            //std::cerr << "c_extract " << %(name)s << '\\n';
        """ % locals()
        if(check_input):
            print >> sio, """
                if (%(name)s->nd != %(nd)s)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "c_extract: Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s",
                                 %(name)s->nd);
                    %(name)s = NULL;
                    %(fail)s;
                }
                //std::cerr << "c_extract " << %(name)s << " nd check passed\\n";
            """ % locals()
            for i, b in enumerate(self.broadcastable):
                if b:
                    print >> sio, """
                if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1)
                {
                    PyErr_Format(PyExc_RuntimeError,
                                 "c_extract: Some CudaNdarray has dim %%i on broadcastable dimension %%i",
                                 CudaNdarray_HOST_DIMS(%(name)s)[%(i)s], %(i)s);
                    %(name)s = NULL;
                    %(fail)s;
                }
                //std::cerr << "c_extract " << %(name)s << "dim check %(i)s passed\\n";
                //std::cerr << "c_extract " << %(name)s << "checking bcast %(i)s <" << %(name)s->str<< ">\\n";
                //std::cerr << "c_extract " << %(name)s->str[%(i)s] << "\\n";
                if (CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s])
                {
                    //std::cerr << "c_extract bad stride detected...\\n";
                    PyErr_Format(PyExc_RuntimeError,
                                 "c_extract: Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i",
                                 CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s], %(i)s);
                    %(name)s = NULL;
                    %(fail)s;
                }
                //std::cerr << "c_extract " << %(name)s << "bcast check %(i)s passed\\n";
                    """ % locals()
            print >> sio, """
                assert(%(name)s);
                Py_INCREF(py_%(name)s);
            }
            else if (py_%(name)s == Py_None)
            {
                PyErr_SetString(PyExc_TypeError,
                                "expected a CudaNdarray, not None");
                %(name)s = NULL;
                %(fail)s;
            }
            else
            {
                //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt));
                PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray");
                %(name)s = NULL;
                %(fail)s;
            }
            //std::cerr << "c_extract done " << %(name)s << '\\n';
            """ % locals()
        else:
            print >> sio, """
                assert(%(name)s);
                Py_INCREF(py_%(name)s);
            }
            """ % locals()
        #print sio.getvalue()
        return sio.getvalue()
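# A hedged Python sketch (illustrative names, not the Theano API) of the
# checks the generated c_extract code above performs on a CudaNdarray:
def _check_extract(nd_expected, broadcastable, dims, strides):
    if len(dims) != nd_expected:
        raise RuntimeError("rank %i, supposed to have rank %i" % (len(dims), nd_expected))
    for i, b in enumerate(broadcastable):
        if b and dims[i] != 1:
            raise RuntimeError("dim %i on broadcastable dimension %i" % (dims[i], i))
        if b and strides[i] != 0:
            raise RuntimeError("nonzero stride %i on broadcastable dimension %i" % (strides[i], i))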
Example #37
    def c_src_kernel_tiling_less_registers(self, node, nodename):
        """ The kernel applies to problems with <= 5 dimensions """

        nd = node.outputs[0].type.ndim
        n_in = len(node.inputs)
        n_out = len(node.outputs)
        sio = StringIO()

        if nd not in (2,):
            return sio.getvalue()

        # print some leading comments to make the code easier to read
        for ipos, i in enumerate(node.inputs):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
                self.scalar_op.__class__.__name__,
                nodename,
                'tiling%i_less_registers'%nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd))
        # declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data_0" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
        # declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd)))
            print >> sio, "\t,", s
            #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
            #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"

        # TODO: Setting these to true makes the function fail SOMETIMES.  I don't know why yet.
        use_shared_stride = False
        use_shared_limits = False

        def decl_limits(nd):
            if use_shared_limits:
                print >> sio, "__shared__ float * limits[%(nd)s];" % locals()

        def stride(io, p, d):
            if use_shared_stride:
                return "s%s_str[%i][%i]" % (io, p, d)
            else:
                return "%s%i_str_%i" % (io, p, d)
        def limits(d):
            if use_shared_limits:
                return "limits[%i]" % d
            else:
                return "limits%i" % d

        def decl_shared_stride(nin, nout, nd):
            if not use_shared_stride:
                return
            print >> sio, """
            __shared__ int si_str[%(nin)s][%(nd)s];
            __shared__ int so_str[%(nout)s][%(nd)s];
            if ((threadIdx.x == 0) && (threadIdx.y == 0)) {
            """ % locals()
            for i in xrange(nin):
                for d in xrange(nd):
                    print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals()
            for i in xrange(n_out):
                for d in xrange(nd):
                    print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals()
            print >> sio, "} __syncthreads();"

        def calc_limit(d):
            s = stride('o', 0, d)
            lname = limits(d)
            if use_shared_limits:
                print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {"
                if d == 0:
                    print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals()
                else:
                    dm1 = d - 1
                    print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals()
                print >> sio, "} __syncthreads();"
            else:
                if d == 0:
                    print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals()
                else:
                    dm1 = d - 1
                    print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals()

        def decl_ptrs(d, offset):
            dm1 = d - 1
            assert dm1 >= 0
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals()
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals()

        def inc_ptrs(d, amt):
            for i in xrange(n_in):
                s = stride('i', i, d)
                print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals()
            for i in xrange(n_out):
                s = stride('o', i, d)
                print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals()

        def while_limit(d):
            lname = limits(d)
            print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals()

        def end_while(d):
            print >> sio, "}"

        def task_code(d):
            print >> sio, self.scalar_op.c_code(
                Apply(self.scalar_op,
                    [scalar.Scalar(dtype=input.type.dtype).make_variable()
                     for input in node.inputs],
                    [scalar.Scalar(dtype=output.type.dtype).make_variable()
                     for output in node.outputs])
                , nodename + '_scalar_'
                , ['i%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.inputs)]
                , ['o%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.outputs)]
                , sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!

        if nd == 4:
            decl_shared_stride(n_in, n_out, nd)
            decl_limits(nd)
            calc_limit(0)
            inc_ptrs(0, 'blockIdx.x')
            while_limit(0)
            if 1:
                calc_limit(1)
                decl_ptrs(1, 'blockIdx.y')
                while_limit(1)
                if 1:
                    calc_limit(2)
                    decl_ptrs(2, 'threadIdx.y')
                    while_limit(2)
                    if 1:
                        calc_limit(3)
                        decl_ptrs(3, 'threadIdx.x')
                        while_limit(3)
                        if 1:
                            task_code(3)
                            inc_ptrs(3, 'blockDim.x')
                        end_while(3)
                        inc_ptrs(2, 'blockDim.y')
                    end_while(2)
                    inc_ptrs(1, 'gridDim.y')
                end_while(1)
                inc_ptrs(0, 'gridDim.x')
            end_while(0)

        print >> sio, "}"
        print sio.getvalue()
        return sio.getvalue()
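# A rough Python sketch of the limit-based iteration the kernel above
# generates, with plain integers standing in for device pointers
# (illustrative only; one dimension shown):
def _walk_dim(data, stride, dim, start, step):
    limit = data + dim * stride       # one past the last element
    p = data + start * stride         # this thread/block's first element
    visited = []
    while p < limit:
        visited.append(p)             # the scalar task code would run here
        p += step * stride
    return visited

assert _walk_dim(0, 1, 10, start=2, step=4) == [2, 6]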
Example #38
    def __str__(self):
        sio = StringIO()
        _config_print(self.__class__, sio)
        return sio.getvalue()
Example #39
def test_debugprint():
    A = tensor.matrix(name='A')
    B = tensor.matrix(name='B')
    C = A + B
    C.name = 'C'
    D = tensor.matrix(name='D')
    E = tensor.matrix(name='E')

    F = D + E
    G = C + F

    # just test that it works
    debugprint(G)

    # test ids=int
    s = StringIO()
    debugprint(G, file=s, ids='int')
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace} [@0] ''   
 |Elemwise{add,no_inplace} [@1] 'C'   
 | |A [@2]
 | |B [@3]
 |Elemwise{add,no_inplace} [@4] ''   
   |D [@5]
   |E [@6]
"""

    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference

    # test ids=CHAR
    s = StringIO()
    debugprint(G, file=s, ids='CHAR')
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace} [@A] ''   
 |Elemwise{add,no_inplace} [@B] 'C'   
 | |A [@C]
 | |B [@D]
 |Elemwise{add,no_inplace} [@E] ''   
   |D [@F]
   |E [@G]
"""

    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference

    # test ids=CHAR, stop_on_name=True
    s = StringIO()
    debugprint(G, file=s, ids='CHAR', stop_on_name=True)
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace} [@A] ''   
 |Elemwise{add,no_inplace} [@B] 'C'   
 |Elemwise{add,no_inplace} [@C] ''   
   |D [@D]
   |E [@E]
"""

    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference

    # test ids=
    s = StringIO()
    debugprint(G, file=s, ids='')
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace}  ''   
 |Elemwise{add,no_inplace}  'C'   
 | |A 
 | |B 
 |Elemwise{add,no_inplace}  ''   
   |D 
   |E 
"""
    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference
Example #40
    def __str__(self):
        sio = StringIO()
        self.print_summary(sio)
        return sio.getvalue()
Example #41
    def c_src_callkernel(self, node, nodename):
        #
        # This function serves three main goals:
        #
        # The first is stride unpacking:
        # it accepts input and output arguments as
        #    float *, int *
        # pairs, and it constructs a kernel function call where inputs and outputs are passed
        # like
        #    float *, int, int, int ...
        #
        # The second is to recognize when any dimensions can be collapsed as
        # being contiguous. That means we can merge those dimensions with another
        # one for all inputs/outputs and get the same results (confusing... read code)
        #
        # The third is to make a special case for scalar elements. We allow them to be collapsed.
        # In both the c-contiguous and the non-contiguous case, we use registers to lower the number of memory accesses.
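        #
        # As a concrete illustration of the first goal (a sketch, not the
        # literal generated text): for nd == 2, the (float *, const int *)
        # pair (i0_data, i0_str) received by callkernel is forwarded to the
        # kernel as (i0_data, i0_str[0], i0_str[1]), so the kernel takes one
        # int per dimension instead of a pointer.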

        # TODO: make a special case for broadcasting, to store the data in shared memory.

        nd = node.outputs[0].type.ndim
        nb_inputs = len(node.inputs)
        nb_outputs = len(node.outputs)
        d = dict()
        # input_params and output_params go into the function declaration/definition
        input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos)
                for ipos in xrange(len(node.inputs)))
        output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos)
                for ipos in xrange(len(node.outputs)))

        # input_args and output_args go into the recursive call.
        input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos)
                for ipos in xrange(len(node.inputs)))
        output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos)
                for ipos in xrange(len(node.outputs)))

        prod_dims = '*'.join(["dims[%i]"%di for di in xrange(nd)]+['1'])

        scalar_op = self.scalar_op.__class__.__name__

        sio = StringIO()
        print >> sio, """
        static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[])
        {
            //can we collapse dims[i] and dims[i-1]
            for(int i=nd-1;i>0;i--){
                if(strides[i]*dims[i]==strides[i-1]){//dims i-1 and i are contiguous with each other, so they can be merged
                    collapse[i]=1;
                }else collapse[i]=0;
            }
        }
        """ % locals()
        print >> sio, """
        static int callkernel_%(nodename)s(unsigned int numEls, const int d,
            const int * dims,
            %(input_params)s,
            %(output_params)s)
        {
            numEls = %(prod_dims)s;
        """ % locals()
        if self.verbose:
            print >> sio, """
                std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s     w numEls" << numEls << " dims"<< d << "\\n";
            """ % locals()
            print >> sio, 'std::cerr << ' + " << ' ' <<  ".join(['"  "']+list("dims[%i]"%di
                for di in xrange(nd)) + ["'\\n';"])
        if self.verbose > 1:
            for ipos in xrange(len(node.inputs)):
                print >> sio, """
                std::cerr << "   %(ipos)s data strides" <<
                """ % locals() + " << ' ' <<  ".join(["i%s_data"%ipos]
                + list("i%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''

            for ipos in xrange(len(node.outputs)):
                print >> sio, """
                std::cerr << "   %(ipos)s data strides" <<
                """ % locals() + " << ' ' <<  ".join(["o%s_data"%ipos]
                    + list("o%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; '''
        # collapse dimensions that are broadcast in all inputs.
        # This needs to be done before the contiguous collapse, as that would break it.
        # Handle both the dimensions and the strides.
        if nd > 0:
            print >> sio, "int local_dims[%(nd)s];" % locals()
        else:
            print >> sio, "int *local_dims=NULL;"
        if nb_inputs > 0 and nd > 0:
            print >> sio, """
            int local_str[%(nb_inputs)s][%(nd)s];
            int local_ostr[%(nb_outputs)s][%(nd)s];
            """ % locals()
        else:
            print >> sio, """
            int local_str[1][1];
            int local_ostr[1][1];
            """
        print >> sio, """
        int nd_collapse = %(nd)s;
        for(int i=0;i<%(nd)s;i++){//init new dim
          local_dims[i]=dims[i];
        }
        """ % locals()
        for ipos in xrange(len(node.inputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_str[%(ipos)s][i]=i%(ipos)s_str[i];
            }
            """ % locals()
        for ipos in xrange(len(node.outputs)):
            print >> sio, """
            for(int i=0;i<%(nd)s;i++){//init new strides
              local_ostr[%(ipos)s][i]=o%(ipos)s_str[i];
            }
            """ % locals()
        if self.verbose > 2:
            print >>sio, 'std::cerr <<"before broadcast collapse\\n";'
            print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals()
            print >> sio, 'std::cerr << "\\n";'
            if nd > 0:
                for ipos in xrange(len(node.inputs)):
                    print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<'%locals() + \
                        ' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";'
                for ipos in xrange(len(node.outputs)):
                    print >> sio, 'std::cerr << " local_ostr outputs %(ipos)s: " <<'%locals() + \
                        ' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";'

        print >> sio, """
        for(int id=0;id<nd_collapse;id++){

          bool all_broadcast=true;
          for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
            if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
            if(local_ostr[output_id][id]!=0 || local_dims[id]!=1) all_broadcast= false;
          }
          if(all_broadcast){
            for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
            for(int input_id=0;input_id<%(nb_inputs)s;input_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_str[input_id][j-1]=local_str[input_id][j];
              }
            }
            for(int output_id=0;output_id<%(nb_outputs)s;output_id++){
              for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array
                local_ostr[output_id][j-1]=local_ostr[output_id][j];
              }
            }
            nd_collapse--; id--;
          }
        }
        """%locals()

        if self.verbose > 2:
            print >>sio, 'std::cerr <<"after broadcast collapse\\n";'
            print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; '
            print >> sio, 'std::cerr << "local_dims";'
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
            print >> sio, 'std::cerr << "\\n";'
            if nd > 0:
                for ipos in xrange(len(node.inputs)):
                    print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";'
                for ipos in xrange(len(node.outputs)):
                    print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";'
        # collapse contiguous dimensions (ignoring scalars; generic version that can collapse any dimensions: right, left, or middle)
        # this is a good idea because it means fewer index calculations on the GPU.

        if nd > 0:
            print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() + ','.join(['1' for x in xrange(nd)]) + "};"
        else:
            print >> sio, "int *nd_collapse_ = NULL;"
        for ipos in xrange(len(node.inputs)):
            if not _logical_scalar(node.inputs[ipos]):
                if nd > 0:
                    print >> sio, """
                        int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() + ','.join(['1' for x in xrange(nd)]) + "};"
                else:
                    print >> sio, """
                        int *nd_collapse_%(ipos)s = NULL;"""%locals()
                print >> sio, """
can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s);
for(int i=0;i<nd_collapse;i++){
if(nd_collapse_%(ipos)s[i]==0)
nd_collapse_[i]=0;
}
                """ % locals()
                if self.verbose > 1:
                    print >>sio, """
                    std::cerr<< "nd_collapse_%(ipos)s "<<
                    """%locals()
                    print >>sio, ' << " " << '.join(["nd_collapse_%s[" % ipos + str(i)+"]" for i in xrange(nd)])
                    print >>sio, '<< "\\n";'

        # update the local strides.
        for ipos in xrange(len(node.inputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j];
                }
            }
            """%locals()

        for ipos in xrange(len(node.outputs)):
            print >> sio, """
            for(int i=nd_collapse-1;i>0;i--){
              if(nd_collapse_[i]==1){
                local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides
                for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array
                  local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j];
                }
            }
            """%locals()

        # update the local dims.
        print >> sio, """
        for(int i=nd_collapse-1;i>0;i--){
          if(nd_collapse_[i]==1){
            local_dims[i-1]*=local_dims[i];//set new dims
            for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array
              local_dims[j-1]=local_dims[j];
          }
        }
        """%locals()

        # update the new number of dims
        print >> sio, """
        for(int i=1, end=nd_collapse;i<end;i++){
          if(nd_collapse_[i]==1)nd_collapse--;
        }
        if(nd_collapse == 1 """%locals()
        l = ["local_str[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.inputs)) if not _logical_scalar(node.inputs[ipos])]
        l += ["local_ostr[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.outputs)) if not _logical_scalar(node.outputs[ipos])]
        if len(l) > 0:
            print >> sio, " && ", " && ".join(l)
        print >> sio, """){nd_collapse=0;} """

        if self.verbose:
            print >> sio, 'std::cerr <<"after can_collapse\\n";'
            print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals()
        if self.verbose > 1:
            for d in xrange(nd):
                print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals()
            print >> sio, 'std::cerr << "\\n";'
            if nd > 0:
                for ipos in xrange(len(node.inputs)):
                    print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";'
                for ipos in xrange(len(node.outputs)):
                    print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";'

        def launch_Ccontiguous(nodename, scalar_op, sync=True):
            kernel_call_args = ["numEls"]
            for ipos in xrange(len(node.inputs)):
                kernel_call_args.append("i%i_data"%ipos)
            for ipos in xrange(len(node.outputs)):
                kernel_call_args.append("o%i_data"%ipos)
            kernel_call_args = ", ".join(kernel_call_args)
            verb = ""
            if self.verbose:
                verb = 'std::cerr << "   Running ccontiguous version\\n";'
            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls,  (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);
                kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);

                //std::cerr << "calling callkernel returned\\n";
                """ % locals()
            if sync:
                print >> sio, """
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n    n_blocks=%%i threads_per_block=%%i\\n   Call: %%s\\n",
                         "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
                         n_blocks, threads_per_block,
                         "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;

                }
                %(verb)s
                return 0;
                """ % locals()
            else:
                print >> sio, " return 0; " % locals()

        def launch_General(nodename, scalar_op, force_nd, sync=True):
            # kernel_call_args are used to invoke the cuda kernel
            local = "local_"
            kernel_call_args = ["numEls"]
            kernel_call_args.extend(local+"dims[%i]"%di for di in xrange(force_nd))
            for ipos in xrange(len(node.inputs)):
                kernel_call_args += ["i%i_data"%ipos] + list(local+"str[%i][%i]"%(ipos, di) for di in xrange(force_nd))
                #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, i%i_data" % (strides, ipos))
            for ipos in xrange(len(node.outputs)):
                kernel_call_args += ["o%i_data"%ipos] + list(local+"ostr[%i][%i]"%(ipos, di) for di in xrange(force_nd))
                #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd))
                #kernel_call_args.append( "%s, o%i_data" % (strides, ipos))
            if self.verbose:
                print >> sio, """
                    std::cerr << "   Running general version with %(force_nd)s  dims\\n";
                    """%locals()
                print >> sio, "std::cerr << " + ' << " " << '.join(kernel_call_args)+' << "\\n";'
                # std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n;

            kernel_call_args = ", ".join(kernel_call_args)

            print >> sio, """
                //first use at least a full warp
                int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE

                //next start adding multiprocessors
                int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS

                // next start adding more warps per multiprocessor
                if (threads_per_block * n_blocks < numEls)
                    threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK);

                kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s);
                """ % locals()
            if sync:
                print >> sio, """
                CNDA_THREAD_SYNC;
                cudaError_t err = cudaGetLastError();
                if( cudaSuccess != err)
                {
                    PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n    n_blocks=%%i threads_per_block=%%i\\n   Call: %%s\\n",
                         "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err),
                         n_blocks, threads_per_block,
                         "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)");
                    return -1;

                }
                return 0;
                """ % locals()
            else:
                print >> sio, " return 0; " % locals()
        print >> sio, "if(numEls==0) return 0;"
        print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals()
        print >> sio, "case 0: {"
        launch_Ccontiguous(nodename, scalar_op, self.sync)
        print >> sio, "        } break;"
        for i in xrange(1, nd+1):
            print >> sio, "case "+str(i)+": {"
            launch_General(nodename, scalar_op, i, self.sync)
            print >> sio, "        } break;"

        print >> sio, "}"  # end case
        print >> sio, "return -2;"  # should not get to this point
        print >> sio, "}"  # end fct

        # N.B. cudaGetLastError is called by c_code
        return sio.getvalue()
Example #42
0
def debugprint(obj, depth=-1, print_type=False,
               file=None, ids='CHAR', stop_on_name=False):
    """Print a computation graph as text to stdout or a file.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type print_type: boolean
    :param print_type: whether to print the type of printed objects
    :type file: None, 'str', or file-like object
    :param file: print to this file ('str' means to return a string)
    :type ids: str
    :param ids: How do we print the identifier of the variable
                id - print the python id value
                int - print an integer identifier
                CHAR - print a capital-letter identifier
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.

    :returns: string if `file` == 'str', else file arg

    Each line printed represents a Variable in the graph.
    The indentation of lines corresponds to its depth in the symbolic graph.
    The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which case
    the Op is printed).
    The second part of the text is an identifier of the Variable.
    If print_type is True, we add a part containing the type of the Variable

    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time. Later, just the Variable
    identifier is printed.

    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds to.

    """
    if file == 'str':
        _file = StringIO()
    elif file is None:
        _file = sys.stdout
    else:
        _file = file
    done = dict()
    results_to_print = []
    order = []
    if isinstance(obj, (list, tuple)):
        lobj = obj
    else:
        lobj = [obj]
    for obj in lobj:
        if isinstance(obj, gof.Variable):
            results_to_print.append(obj)
        elif isinstance(obj, gof.Apply):
            results_to_print.extend(obj.outputs)
        elif isinstance(obj, Function):
            results_to_print.extend(obj.maker.fgraph.outputs)
            order = obj.maker.fgraph.toposort()
        elif isinstance(obj, gof.FunctionGraph):
            results_to_print.extend(obj.outputs)
            order = obj.toposort()
        elif isinstance(obj, (int, long, float, numpy.ndarray)):
            print obj
        else:
            raise TypeError("debugprint cannot print an object of this type",
                            obj)

    scan_ops = []
    for r in results_to_print:
        #Add the parent scan op to the list as well
        if hasattr(r.owner, 'op') and isinstance(r.owner.op, theano.scan_module.scan_op.Scan):
            scan_ops.append(r)

        debugmode.debugprint(r, depth=depth, done=done, print_type=print_type,
                             file=_file, order=order, ids=ids,
                             scan_ops=scan_ops, stop_on_name=stop_on_name)
    if len(scan_ops) > 0:
        print >> _file, ""
        new_prefix = ' >'
        new_prefix_child = ' >'
        print >> _file, "Inner graphs of the scan ops:"

        for s in scan_ops:
            print >> _file, ""
            debugmode.debugprint(s, depth=depth, done=done, print_type=print_type,
                                 file=_file, ids=ids,
                                 scan_ops=scan_ops, stop_on_name=stop_on_name)

            for idx, i in enumerate(s.owner.op.outputs):
                if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
                    if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
                        scan_ops.append(i)

                debugmode.debugprint(r=i, prefix=new_prefix, depth=depth, done=done,
                                     print_type=print_type, file=_file,
                                     ids=ids, stop_on_name=stop_on_name,
                                     prefix_child=new_prefix_child, scan_ops=scan_ops)

    if file is _file:
        return file
    elif file == 'str':
        return _file.getvalue()
    else:
        _file.flush()
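# A hedged usage sketch for the debugprint above (assumes theano is
# installed and that `tensor` and `debugprint` are the names defined in
# this module):
def _debugprint_demo():
    x = tensor.vector('x')
    text = debugprint(2 * x, ids='int', file='str')  # file='str' returns a string
    assert 'Elemwise' in text
    return text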
Example #43
    def c_src_kernel(self, node, nodename, nd):
        sio = StringIO()
        # print 'C_SRC_KERNEL', sio.getvalue()

        for ipos, i in enumerate(node.inputs):
            print >> sio, "//    Input  ", ipos, str(i.type)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "//    Output ", ipos, str(i.type)
        print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % (
            self.scalar_op.__class__.__name__, nodename, nd)
        if (nd):
            print >> sio, "\t,", ", ".join("const int dim%i" % i
                                           for i in xrange(nd))
        # declare inputs
        for ipos, i in enumerate(node.inputs):
            s = ", ".join(["const float * i%i_data" % ipos] +
                          ["int i%i_str_%i" % (ipos, d) for d in xrange(nd)])
            print >> sio, "\t,", s
        # declare outputs
        for ipos, i in enumerate(node.outputs):
            s = ", ".join(["float * o%i_data" % ipos] +
                          ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)])
            print >> sio, "\t,", s
            #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd))
            #print >> sio, "\t,", "float * o%i_data" % ipos
        print >> sio, "\t)\n{"
        print >> sio, "    const int idx = blockIdx.x * blockDim.x + threadIdx.x;"
        print >> sio, "    const int numThreads = blockDim.x * gridDim.x;"

        # For each input that is a scalar which has been broadcasted to a tensor,
        #     load it into a local variable
        for ipos, i in enumerate(node.inputs):
            if _logical_scalar(i):
                print >> sio, "    const float ii_i%i_value = i%i_data[0];" % (ipos, ipos)

        # loop over the elements to be treated by this kernel call
        print >> sio, "    for (int i = idx; i < numEls; i += numThreads) {"
        # calculate the data pointers for all arguments
        print >> sio, "        int ii = i;"
        for ipos, i in enumerate(node.inputs):
            if not _logical_scalar(i):
                print >> sio, "        const float * ii_i%i_data = i%i_data;" % (ipos, ipos)
        for ipos, i in enumerate(node.outputs):
            print >> sio, "        float * ii_o%i_data = o%i_data;" % (ipos, ipos)
        for d in xrange(nd-1, -1, -1):
            if d > 0:
                print >> sio, "        int pos%i = ii %% dim%i;" % (d, d)
                print >> sio, "        ii = ii / dim%i;" % d
            else:
                print >> sio, "        int pos%i = ii;" % d

            for ipos, i in enumerate(node.inputs):
                if not _logical_scalar(i):
                    print >> sio, "        ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d)
            for ipos, i in enumerate(node.outputs):
                print >> sio, "        ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d)

        # perform the scalar operation on the input and output references
        # TODO: What if the scalar_op needs support_code??
        task_code = self.scalar_op.c_code(
            Apply(self.scalar_op,
                  [scalar.Scalar(dtype=input.type.dtype).make_variable()
                   for input in node.inputs],
                  [scalar.Scalar(dtype=output.type.dtype).make_variable()
                   for output in node.outputs]),
            nodename + '_scalar_',
            get_str_list_logical_scalar(node),
            ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)],
            sub=dict(fail='return;'))  # TODO: set a failure code somehow!!!
        print >> sio, "       ", task_code
        print >> sio, "    }"

        #indent = " "*(4*d+7)
        # for ipos, i in enumerate(node.inputs):
            #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', ''
        print >> sio, "}"

        # print sio.getvalue()
        return sio.getvalue()
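# A minimal Python sketch (not Theano code) of the flat-index unraveling the
# kernel above performs, with the last dimension varying fastest:
def _unravel(i, dims):
    pos = [0] * len(dims)
    for d in range(len(dims) - 1, 0, -1):
        pos[d] = i % dims[d]
        i //= dims[d]
    pos[0] = i
    return pos

assert _unravel(7, (2, 3, 4)) == [0, 1, 3]  # 0*12 + 1*4 + 3 == 7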
Example #44
def test_debugprint():
    A = tensor.matrix(name='A')
    B = tensor.matrix(name='B')
    C = A + B
    C.name = 'C'
    D = tensor.matrix(name='D')
    E = tensor.matrix(name='E')

    F = D + E
    G = C + F

    # just test that it works
    debugprint(G)

    # test ids=int
    s = StringIO()
    debugprint(G, file=s, ids='int')
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace} [@0] ''   
 |Elemwise{add,no_inplace} [@1] 'C'   
 | |A [@2]
 | |B [@3]
 |Elemwise{add,no_inplace} [@4] ''   
   |D [@5]
   |E [@6]
"""

    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference

    # test ids=CHAR
    s = StringIO()
    debugprint(G, file=s, ids='CHAR')
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace} [@A] ''   
 |Elemwise{add,no_inplace} [@B] 'C'   
 | |A [@C]
 | |B [@D]
 |Elemwise{add,no_inplace} [@E] ''   
   |D [@F]
   |E [@G]
"""

    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference

    # test ids=CHAR, stop_on_name=True
    s = StringIO()
    debugprint(G, file=s, ids='CHAR', stop_on_name=True)
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace} [@A] ''   
 |Elemwise{add,no_inplace} [@B] 'C'   
 |Elemwise{add,no_inplace} [@C] ''   
   |D [@D]
   |E [@E]
"""

    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference

    # test ids=
    s = StringIO()
    debugprint(G, file=s, ids='')
    s = s.getvalue()
    # The additional white space is needed!
    reference = """Elemwise{add,no_inplace}  ''   
 |Elemwise{add,no_inplace}  'C'   
 | |A 
 | |B 
 |Elemwise{add,no_inplace}  ''   
   |D 
   |E 
"""
    if s != reference:
        print '--' + s + '--'
        print '--' + reference + '--'

    assert s == reference
Example #45
    def c_code(self, node, nodename, inputs, outputs, sub):
        d = dict(sub)
        nd = node.outputs[0].type.ndim
        d.update(locals())
        sio = StringIO()
        nin = len(inputs)
        nout = len(outputs)
        fail = sub['fail']
        opname = str(self.scalar_op)
        initial_dims = ','.join('1' for i in xrange(nd))
        if 1 or self.scalar_op == scalar.pow:
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s START\\n";
        //standard elemwise size checks
            """ % locals()
        if nd > 0:
            print >> sio, """
            int dims[%(nd)s] = {%(initial_dims)s};
            """ % locals()
        else:
            print >> sio, """
            int *dims = NULL;
            """

        # declare the broadcast pattern of each input
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                assert emitted_inames[iname] is node.inputs[id]
                continue

            # with python 2.4 (at least), if a broadcastable pattern is made of
            # numpy.bool_ instead of bool, calling int() once is not enough.
            broadcasts = map(int, map(int, node.inputs[id].broadcastable))
            broadcasts = ', '.join(map(str, broadcasts))
            nd = node.inputs[id].ndim
            if nd > 0:
                print >> sio, """
                int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s};
                """ % locals()
            else:
                print >> sio, """
                int *broadcasts_%(iname)s = NULL;
                """ % locals()
            emitted_inames[iname] = node.inputs[id]

        # check that all inputs have matching dimensions
        emitted_inames = {}
        for id, iname in enumerate(inputs):
            if iname in emitted_inames:
                continue
            print >> sio, """
        //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n";
        if (%(nd)s != %(iname)s->nd)
        {
            PyErr_Format(PyExc_TypeError,
                         "need %(nd)s dims, not %%i", %(iname)s->nd);
            %(fail)s;
        }
        for (int i = 0; i< %(nd)s; ++i)
        {
            dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i];
            if ((!(broadcasts_%(iname)s[i] &&
                 CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) &&
                (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i]))
            {
                //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n";
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Input dimension mis-match. Input"
                             " %(id)d (indices start at 0) has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             CudaNdarray_HOST_DIMS(%(iname)s)[i],
                             dims[i]
                            );
                %(fail)s;
            }
        }
            """ % locals()
            emitted_inames[iname] = True

        # check that all outputs have valid dimensions
        for idx, oname in enumerate(outputs):
            if idx not in self.inplace_pattern:
                print >> sio, """
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
            }
        }
        if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s))
        {
            Py_XDECREF(%(oname)s);
            %(oname)s = NULL;
        }
        if (NULL == %(oname)s)
        {
            %(oname)s = (CudaNdarray*)CudaNdarray_New();
            if (!%(oname)s)
            {
                //error string already set
                %(fail)s;
            }
            if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims))
            {
                //error string already set
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()
            else:
                input_idx = self.inplace_pattern[idx]
                iname = inputs[input_idx]
                print >> sio, """
        Py_XDECREF(%(oname)s);
        %(oname)s = %(iname)s;
        Py_INCREF(%(oname)s);
        for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) {
            if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i])
            {
                PyErr_Format(PyExc_ValueError,
                             "GpuElemwise. Output dimension mis-match. Output"
                             " %(idx)d (indices start at 0), working inplace"
                             " on input %(input_idx)s, has shape[%%i] == %%i"
                             ", but the output's size on that axis is %%i.",
                             i,
                             CudaNdarray_HOST_DIMS(%(oname)s)[i],
                             dims[i]
                            );
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                %(fail)s;
            }
        }
        //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n";
        //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n";
        """ % locals()

        print >> sio, """
        {
            //new block so that failure gotos don't skip over variable initialization
            //std::cerr << "calling callkernel\\n";
            if (callkernel_%(nodename)s(1, 0, dims
            """ % locals()
        for iname in inputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s)
            """ % locals()
        for oname in outputs:
            print >> sio, """
                        , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s)
            """ % locals()
        print >> sio, """
                        ))
            {
                 // error
            """
        for oname in outputs:
            print >> sio, """
                Py_DECREF(%(oname)s);
                %(oname)s = NULL;
                """ % locals()
        print >> sio, """
                %(fail)s;
            }
            else // no error
            {
            }
        }
        //std::cerr << "C_CODE %(opname)s END\\n";
        """ % locals()
        # print sio.getvalue()
        return sio.getvalue()
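# A hedged Python sketch (illustrative helper, not the Theano API) of the
# output-shape computation that the dimension checks above enforce:
def _broadcast_dims(shapes):
    dims = [1] * len(shapes[0])
    for shape in shapes:
        for i, s in enumerate(shape):
            if dims[i] == 1:
                dims[i] = s
            elif s != 1 and s != dims[i]:
                raise ValueError("Input dimension mis-match on axis %i" % i)
    return dims

assert _broadcast_dims([(1, 5), (3, 1)]) == [3, 5]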
Example #46
    def __str__(self):
        sio = StringIO()
        self.print_summary(sio)
        return sio.getvalue()
Example #47
def debugprint(obj, depth=-1, print_type=False,
               file=None, ids='CHAR', stop_on_name=False,
               done=None):
    """Print a computation graph as text to stdout or a file.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type print_type: boolean
    :param print_type: whether to print the type of printed objects
    :type file: None, 'str', or file-like object
    :param file: print to this file ('str' means to return a string)
    :type ids: str
    :param ids: How do we print the identifier of the variable
                id - print the python id value
                int - print an integer identifier
                CHAR - print a capital-letter identifier
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.
    :type done: None or dict
    :param done: A dict where we store the ids of printed node.
        Useful to have multiple call to debugprint share the same ids.

    :returns: string if `file` == 'str', else file arg

    Each line printed represents a Variable in the graph.
    The indentation of lines corresponds to its depth in the symbolic graph.
    The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which case
    the Op is printed).
    The second part of the text is an identifier of the Variable.
    If print_type is True, we add a part containing the type of the Variable

    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time. Later, just the Variable
    identifier is printed.

    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds to.

    """
    if not isinstance(depth, int):
        raise TypeError("depth parameter must be an int")
    if file == 'str':
        _file = StringIO()
    elif file is None:
        _file = sys.stdout
    else:
        _file = file
    if done is None:
        done = dict()
    results_to_print = []
    profile_list = []
    order = []
    if isinstance(obj, (list, tuple)):
        lobj = obj
    else:
        lobj = [obj]
    for obj in lobj:
        if isinstance(obj, gof.Variable):
            results_to_print.append(obj)
            profile_list.append(None)
        elif isinstance(obj, gof.Apply):
            results_to_print.extend(obj.outputs)
            profile_list.extend([None for item in obj.outputs])
        elif isinstance(obj, Function):
            results_to_print.extend(obj.maker.fgraph.outputs)
            profile_list.extend(
                [obj.profile for item in obj.maker.fgraph.outputs])
            order = obj.maker.fgraph.toposort()
        elif isinstance(obj, gof.FunctionGraph):
            results_to_print.extend(obj.outputs)
            profile_list.extend([None for item in obj.outputs])
            order = obj.toposort()
        elif isinstance(obj, (int, long, float, np.ndarray)):
            print(obj)
        elif isinstance(obj, (theano.In, theano.Out)):
            results_to_print.append(obj.variable)
            profile_list.append(None)
        else:
            raise TypeError("debugprint cannot print an object of this type",
                            obj)

    scan_ops = []
    for r, p in zip(results_to_print, profile_list):
        # Add the parent scan op to the list as well
        if (hasattr(r.owner, 'op') and
                isinstance(r.owner.op, theano.scan_module.scan_op.Scan)):
            scan_ops.append(r)

        if p is not None:
            print("""
Timing Info
-----------
--> <time> <% time> - <total time> <% total time>

<time>         computation time for this node
<% time>       fraction of total computation time for this node
<total time>   time for this node + total times for this node's ancestors
<% total time> total time for this node over total computation time

N.B.:
* Times include the node time and the function overhead.
* <total time> and <% total time> may over-count computation times
  if inputs to a node share a common ancestor and should be viewed as a
  loose upper bound. Their intended use is to help rule out potential nodes
  to remove when optimizing a graph because their <total time> is very low.
""", file=_file)

        debugmode.debugprint(r, depth=depth, done=done, print_type=print_type,
                             file=_file, order=order, ids=ids,
                             scan_ops=scan_ops, stop_on_name=stop_on_name,
                             profile=p)
    if len(scan_ops) > 0:
        print("", file=_file)
        new_prefix = ' >'
        new_prefix_child = ' >'
        print("Inner graphs of the scan ops:", file=_file)

        for s in scan_ops:
            print("", file=_file)
            debugmode.debugprint(s, depth=depth, done=done,
                                 print_type=print_type,
                                 file=_file, ids=ids,
                                 scan_ops=scan_ops, stop_on_name=stop_on_name)
            if hasattr(s.owner.op, 'fn'):
                # If the op was compiled, print the optimized version.
                outputs = s.owner.op.fn.maker.fgraph.outputs
            else:
                outputs = s.owner.op.outputs
            for idx, i in enumerate(outputs):
                if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
                    if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
                        scan_ops.append(i)

                debugmode.debugprint(r=i, prefix=new_prefix,
                                     depth=depth, done=done,
                                     print_type=print_type, file=_file,
                                     ids=ids, stop_on_name=stop_on_name,
                                     prefix_child=new_prefix_child,
                                     scan_ops=scan_ops)

    if file is _file:
        return file
    elif file == 'str':
        return _file.getvalue()
    else:
        _file.flush()
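# A hedged usage sketch: passing the same `done` dict to successive calls
# keeps identifiers consistent across the printed graphs (assumes `tensor`
# and this module's `debugprint` are available):
def _shared_ids_demo():
    x = tensor.vector('x')
    done = {}
    first = debugprint(x * 2, file='str', done=done)
    second = debugprint(x + 1, file='str', done=done)
    return first, second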
Example #48
def test_debugprint():
    A = tensor.matrix(name='A')
    B = tensor.matrix(name='B')
    C = A + B
    C.name = 'C'
    D = tensor.matrix(name='D')
    E = tensor.matrix(name='E')

    F = D + E
    G = C + F

    # just test that it works
    debugprint(G)

    # test ids=int
    s = StringIO()
    debugprint(G, file=s, ids='int')
    s = s.getvalue()
    # The additional white space is needed!
    reference = '\n'.join([
        "Elemwise{add,no_inplace} [@0] ''   ",
        " |Elemwise{add,no_inplace} [@1] 'C'   ",
        " | |A [@2]",
        " | |B [@3]",
        " |Elemwise{add,no_inplace} [@4] ''   ",
        "   |D [@5]",
        "   |E [@6]",
    ]) + '\n'

    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference

    # test ids=CHAR
    s = StringIO()
    debugprint(G, file=s, ids='CHAR')
    s = s.getvalue()
    # The additional white space is needed!
    reference = "\n".join([
        "Elemwise{add,no_inplace} [@A] ''   ",
        " |Elemwise{add,no_inplace} [@B] 'C'   ",
        " | |A [@C]",
        " | |B [@D]",
        " |Elemwise{add,no_inplace} [@E] ''   ",
        "   |D [@F]",
        "   |E [@G]",
    ]) + '\n'

    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference

    # test ids=CHAR, stop_on_name=True
    s = StringIO()
    debugprint(G, file=s, ids='CHAR', stop_on_name=True)
    s = s.getvalue()
    # The additional white space is needed!
    reference = '\n'.join([
        "Elemwise{add,no_inplace} [@A] ''   ",
        " |Elemwise{add,no_inplace} [@B] 'C'   ",
        " |Elemwise{add,no_inplace} [@C] ''   ",
        "   |D [@D]",
        "   |E [@E]",
    ]) + '\n'

    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference

    # test ids=
    s = StringIO()
    debugprint(G, file=s, ids='')
    s = s.getvalue()
    # The additional white space is needed!
    reference = '\n'.join([
        "Elemwise{add,no_inplace}  ''   ",
        " |Elemwise{add,no_inplace}  'C'   ",
        " | |A ",
        " | |B ",
        " |Elemwise{add,no_inplace}  ''   ",
        "   |D ",
        "   |E ",
    ]) + '\n'
    if s != reference:
        print('--' + s + '--')
        print('--' + reference + '--')

    assert s == reference
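One mode the test above does not exercise is ids='id', which embeds raw Python
id values and therefore varies between runs; a hedged sketch (hypothetical, not
part of the original test) can only assert on the structure of the output:

# Hypothetical extension: ids='id' embeds Python object ids,
# so only the line count is stable enough to check.
s = StringIO()
debugprint(G, file=s, ids='id')
assert len(s.getvalue().splitlines()) == 7  # same 7-node structure as above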
Example #49
def debugprint(obj, depth=-1, print_type=False,
               file=None, ids='CHAR', stop_on_name=False):
    """Print a computation graph as text to stdout or a file.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type print_type: boolean
    :param print_type: whether to print the type of printed objects
    :type file: None, 'str', or file-like object
    :param file: print to this file ('str' means to return a string)
    :type ids: str
    :param ids: How to print the identifier of the variable:
                id - print the Python id value
                int - print a sequential integer
                CHAR - print a capital letter
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.

    :returns: string if `file` == 'str', else file arg

    Each line printed represents a Variable in the graph.
    The indentation of lines corresponds to its depth in the symbolic graph.
    The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which case
    the Op is printed).
    The second part of the text is an identifier of the Variable.
    If print_type is True, we add a part containing the type of the Variable.

    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time. Later, just the Variable
    identifier is printed.

    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds to.

    """
    if file == 'str':
        _file = StringIO()
    elif file is None:
        _file = sys.stdout
    else:
        _file = file
    done = dict()
    results_to_print = []
    order = []
    if isinstance(obj, gof.Variable):
        results_to_print.append(obj)
    elif isinstance(obj, gof.Apply):
        results_to_print.extend(obj.outputs)
    elif isinstance(obj, Function):
        results_to_print.extend(obj.maker.fgraph.outputs)
        order = obj.maker.fgraph.toposort()
    elif isinstance(obj, (list, tuple)):
        results_to_print.extend(obj)
    elif isinstance(obj, gof.FunctionGraph):
        results_to_print.extend(obj.outputs)
        order = obj.toposort()
    else:
        raise TypeError("debugprint cannot print an object of this type", obj)
    for r in results_to_print:
        debugmode.debugprint(r, depth=depth, done=done, print_type=print_type,
                             file=_file, order=order, ids=ids,
                             stop_on_name=stop_on_name)
    if file is _file:
        return file
    elif file == 'str':
        return _file.getvalue()
    else:
        _file.flush()
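A short usage sketch (not from the original source; it assumes theano.tensor is
imported as tensor) of the file='str' mode documented above, which returns the
rendered graph as a string instead of writing to stdout:

# Hypothetical usage: capture the printed graph as a string.
a = tensor.vector('a')
out = (a + 1) * 2
text = debugprint(out, ids='CHAR', file='str')
assert 'Elemwise' in text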
Example #50
def debugprint(obj,
               depth=-1,
               print_type=False,
               file=None,
               ids='CHAR',
               stop_on_name=False,
               done=None):
    """Print a computation graph as text to stdout or a file.

    :type obj: Variable, Apply, or Function instance
    :param obj: symbolic thing to print
    :type depth: integer
    :param depth: print graph to this depth (-1 for unlimited)
    :type print_type: boolean
    :param print_type: whether to print the type of printed objects
    :type file: None, 'str', or file-like object
    :param file: print to this file ('str' means to return a string)
    :type ids: str
    :param ids: How to print the identifier of the variable:
                id - print the Python id value
                int - print a sequential integer
                CHAR - print a capital letter
                "" - don't print an identifier
    :param stop_on_name: When True, if a node in the graph has a name,
                         we don't print anything below it.
    :type done: None or dict
    :param done: A dict where we store the ids of printed nodes.
        Useful to make multiple calls to debugprint share the same ids.

    :returns: string if `file` == 'str', else file arg

    Each line printed represents a Variable in the graph.
    The indentation of lines corresponds to its depth in the symbolic graph.
    The first part of the text identifies whether it is an input
    (if a name or type is printed) or the output of some Apply (in which case
    the Op is printed).
    The second part of the text is an identifier of the Variable.
    If print_type is True, we add a part containing the type of the Variable.

    If a Variable is encountered multiple times in the depth-first search,
    it is only printed recursively the first time. Later, just the Variable
    identifier is printed.

    If an Apply has multiple outputs, then a '.N' suffix will be appended
    to the Apply's identifier, to indicate which output a line corresponds to.

    """
    if not isinstance(depth, int):
        raise TypeError("depth parameter must be an int")
    if file == 'str':
        _file = StringIO()
    elif file is None:
        _file = sys.stdout
    else:
        _file = file
    if done is None:
        done = dict()
    results_to_print = []
    profile_list = []
    order = []
    if isinstance(obj, (list, tuple)):
        lobj = obj
    else:
        lobj = [obj]
    for obj in lobj:
        if isinstance(obj, gof.Variable):
            results_to_print.append(obj)
            profile_list.append(None)
        elif isinstance(obj, gof.Apply):
            results_to_print.extend(obj.outputs)
            profile_list.extend([None for item in obj.outputs])
        elif isinstance(obj, Function):
            results_to_print.extend(obj.maker.fgraph.outputs)
            profile_list.extend(
                [obj.profile for item in obj.maker.fgraph.outputs])
            order = obj.maker.fgraph.toposort()
        elif isinstance(obj, gof.FunctionGraph):
            results_to_print.extend(obj.outputs)
            profile_list.extend([None for item in obj.outputs])
            order = obj.toposort()
        elif isinstance(obj, (int, long, float, np.ndarray)):
            print obj
        elif isinstance(obj, (theano.In, theano.Out)):
            results_to_print.append(obj.variable)
            profile_list.append(None)
        else:
            raise TypeError("debugprint cannot print an object of this type",
                            obj)

    scan_ops = []
    for r, p in zip(results_to_print, profile_list):
        # Collect results produced by a Scan op so that their
        # inner graphs can be printed below.
        if (hasattr(r.owner, 'op')
                and isinstance(r.owner.op, theano.scan_module.scan_op.Scan)):
            scan_ops.append(r)

        if p is not None:
            print >> _file, """
Timing Info
-----------
--> <time> <% time> - <total time> <% total time>

<time>         computation time for this node
<% time>       fraction of total computation time for this node
<total time>   time for this node + total times for this node's ancestors
<% total time> total time for this node over total computation time

N.B.:
* Times include the node time and the function overhead.
* <total time> and <% total time> may over-count computation times
  if inputs to a node share a common ancestor and should be viewed as a
  loose upper bound. Their intended use is to help rule out potential nodes
  to remove when optimizing a graph because their <total time> is very low.
"""

        debugmode.debugprint(r,
                             depth=depth,
                             done=done,
                             print_type=print_type,
                             file=_file,
                             order=order,
                             ids=ids,
                             scan_ops=scan_ops,
                             stop_on_name=stop_on_name,
                             profile=p)
    if len(scan_ops) > 0:
        print >> _file, ""
        new_prefix = ' >'
        new_prefix_child = ' >'
        print >> _file, "Inner graphs of the scan ops:"

        for s in scan_ops:
            print >> _file, ""
            debugmode.debugprint(s,
                                 depth=depth,
                                 done=done,
                                 print_type=print_type,
                                 file=_file,
                                 ids=ids,
                                 scan_ops=scan_ops,
                                 stop_on_name=stop_on_name)
            if hasattr(s.owner.op, 'fn'):
                # If the op was compiled, print the optimized version.
                outputs = s.owner.op.fn.maker.fgraph.outputs
            else:
                outputs = s.owner.op.outputs
            for i in outputs:
                if hasattr(i, 'owner') and hasattr(i.owner, 'op'):
                    if isinstance(i.owner.op, theano.scan_module.scan_op.Scan):
                        scan_ops.append(i)

                debugmode.debugprint(r=i,
                                     prefix=new_prefix,
                                     depth=depth,
                                     done=done,
                                     print_type=print_type,
                                     file=_file,
                                     ids=ids,
                                     stop_on_name=stop_on_name,
                                     prefix_child=new_prefix_child,
                                     scan_ops=scan_ops)

    if file is _file:
        return file
    elif file == 'str':
        return _file.getvalue()
    else:
        _file.flush()
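A sketch (hypothetical, not from the original source) of the done parameter
this variant adds: per the docstring, passing the same dict to successive calls
lets them share identifiers, so a node printed by the first call keeps the same
id in the second:

# Hypothetical usage: share identifiers across two debugprint calls.
shared_done = dict()
x = tensor.vector('x')
y = x + 1
z = y * 2
first = debugprint(y, done=shared_done, file='str')
second = debugprint(z, done=shared_done, file='str')
# y's identifier in `second` matches the one assigned in `first`.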
Example #51
    def c_code(self, node, nodename, inp, out, sub):
        x, b, y_idx = inp
        nll, sm, am = out
        classname = self.__class__.__name__
        fail = sub['fail']
        sio = StringIO()
        print >> sio, """
        if (CudaNdarray_NDIM(%(y_idx)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(x)s) != 2)
        {
            PyErr_SetString(PyExc_ValueError, "x not 2d tensor");
            %(fail)s;
        }
        if (CudaNdarray_NDIM(%(b)s) != 1)
        {
            PyErr_SetString(PyExc_ValueError, "b not 1d tensor");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[0] !=
            CudaNdarray_HOST_DIMS(%(y_idx)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,y_idx arguments");
            %(fail)s;
        }
        if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0])
        {
            PyErr_SetString(PyExc_ValueError,
                            "dimension mismatch in x,b arguments");
            %(fail)s;
        }
        if ((NULL == %(nll)s) //initial condition
            || (CudaNdarray_HOST_DIMS(%(nll)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(nll)s);
            %(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(nll)s)
            {
                %(fail)s;
            }
        }
        if ((NULL == %(sm)s)
            || (CudaNdarray_HOST_DIMS(%(sm)s)[0] !=
                CudaNdarray_HOST_DIMS(%(x)s)[0])
            || (CudaNdarray_HOST_DIMS(%(sm)s)[1] !=
                CudaNdarray_HOST_DIMS(%(x)s)[1]))
        {
            Py_XDECREF(%(sm)s);
            %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2,
                CudaNdarray_HOST_DIMS(%(x)s));
            if(!%(sm)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc sm output");
                // no need to decref cnda_nll, the cleanup code should do it
                %(fail)s;
            }
        }
        if ((NULL == %(am)s)
            || (CudaNdarray_HOST_DIMS(%(am)s)[0] !=
                CudaNdarray_HOST_DIMS(%(y_idx)s)[0]))
        {
            Py_XDECREF(%(am)s);
            %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1,
                CudaNdarray_HOST_DIMS(%(y_idx)s));
            if(!%(am)s)
            {
                PyErr_SetString(PyExc_MemoryError,
                                "failed to alloc am output");
                // no need to decref nll and sm,
                // the cleanup code should do it
                %(fail)s;
            }
        }
        {
            int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0],
                                    NUM_VECTOR_OP_BLOCKS);
            // TODO: launch more threads per row and do parallel sum
            //       and max reductions
            int n_threads = 1;
            int n_shared_bytes = 0; //n_threads * sizeof(float);

            k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>(
                CudaNdarray_HOST_DIMS(%(x)s)[0],
                CudaNdarray_HOST_DIMS(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(x)s),
                CudaNdarray_HOST_STRIDES(%(x)s)[0],
                CudaNdarray_HOST_STRIDES(%(x)s)[1],
                CudaNdarray_DEV_DATA(%(b)s),
                CudaNdarray_HOST_STRIDES(%(b)s)[0],
                CudaNdarray_DEV_DATA(%(y_idx)s),
                CudaNdarray_HOST_STRIDES(%(y_idx)s)[0],
                CudaNdarray_DEV_DATA(%(nll)s),
                CudaNdarray_HOST_STRIDES(%(nll)s)[0],
                CudaNdarray_DEV_DATA(%(sm)s),
                CudaNdarray_HOST_STRIDES(%(sm)s)[0],
                CudaNdarray_HOST_STRIDES(%(sm)s)[1],
                CudaNdarray_DEV_DATA(%(am)s),
                CudaNdarray_HOST_STRIDES(%(am)s)[0]);
            CNDA_THREAD_SYNC;
            cudaError_t err = cudaGetLastError();
            if (cudaSuccess != err)
            {
                PyErr_Format(PyExc_RuntimeError,
                             "Cuda error: %(classname)s %(nodename)s: %%s.\\n"
                             "The kernel was launched with %%d threads,"
                             " %%d blocks and %%d shared memory\\n",
                             cudaGetErrorString(err),
                             n_threads, n_blocks, n_shared_bytes);
                // no need to decref output vars, the cleanup code will do it
                %(fail)s;
            }
        }
        """ % locals()
        return sio.getvalue()
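The CUDA source above is assembled with Python %-style templating: every
%(name)s placeholder is filled from locals(), so the names unpacked at the top
of c_code (x, b, nll, fail, ...) are spliced into the generated C code, and
literal percent signs (e.g. in the PyErr_Format call) must be escaped as %%.
A standalone sketch of the same pattern (illustrative only; all names are
made up):

# Illustrative sketch of the '%(name)s ... % locals()' templating used above.
from StringIO import StringIO

def render(nodename, fail):
    x = 'input0'  # stands in for the unpacked input names
    sio = StringIO()
    print >> sio, """
    // generated for node %(nodename)s
    if (NULL == %(x)s) { %(fail)s; }  // %% renders as a single percent sign
    """ % locals()
    return sio.getvalue()

print render('node_7', 'goto fail_label')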