def test_pydotprint_cond_highlight(): """ This is a REALLY PARTIAL TEST. I did them to help debug stuff. """ # Skip test if pydot is not available. if not theano.printing.pydot_imported: raise SkipTest('pydot not available') x = tensor.dvector() f = theano.function([x], x * 2) f([1, 2, 3, 4]) s = StringIO() new_handler = logging.StreamHandler(s) new_handler.setLevel(logging.DEBUG) orig_handler = theano.logging_default_handler theano.theano_logger.removeHandler(orig_handler) theano.theano_logger.addHandler(new_handler) try: theano.printing.pydotprint(f, cond_highlight=True, print_output_file=False) finally: theano.theano_logger.addHandler(orig_handler) theano.theano_logger.removeHandler(new_handler) assert (s.getvalue() == 'pydotprint: cond_highlight is set but there' ' is no IfElse node in the graph\n')
def get_correct_indentation_diff(code, filename): """ Generate a diff to make code correctly indented. :param code: a string containing a file's worth of Python code :param filename: the filename being considered (used in diff generation only) :returns: a unified diff to make code correctly indented, or None if code is already correctedly indented """ code_buffer = StringIO(code) output_buffer = StringIO() reindenter = reindent.Reindenter(code_buffer) reindenter.run() reindenter.write(output_buffer) reindent_output = output_buffer.getvalue() output_buffer.close() if code != reindent_output: diff_generator = difflib.unified_diff(code.splitlines(True), reindent_output.splitlines(True), fromfile=filename, tofile=filename + " (reindented)") # work around http://bugs.python.org/issue2142 diff_tuple = map(clean_diff_line_for_python_bug_2142, diff_generator) diff = "".join(diff_tuple) return diff else: return None
def c_code(self, node, name, inputs, outputs, sub): #z_out = alpha * dot(x,y) + beta * z_in #inplace version, set set z_out = z_in #not inplace version, we copy z_in to z_out. z_in, a, x, y, b = inputs z_out, = outputs inplace = int(self.inplace) fail = sub['fail'] sio = StringIO() print >> sio, """ float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0]; float %(name)s_beta = ((dtype_%(b)s*)(%(b)s->data))[0]; if (%(inplace)s && ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] > 0) || ((CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 0) && (CudaNdarray_HOST_DIMS(%(z_in)s)[0] == 1)))) { // Work inplace on the input Py_XDECREF(%(z_out)s); %(z_out)s = %(z_in)s; Py_INCREF(%(z_out)s); } else if (%(z_out)s && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] > 0) || ((CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 0) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == 1)))) { // Work on the output if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) { %(fail)s; } } else { // Copy Py_XDECREF(%(z_out)s); %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s); if (!%(z_out)s) { %(fail)s; } } if (CudaNdarray_sgemv(%(name)s_alpha, %(x)s, %(y)s, %(name)s_beta, %(z_out)s)) { %(fail)s; } """ return sio.getvalue() % locals()
def c_src_kernel_Ccontiguous(self, node, nodename): nd = node.outputs[0].type.ndim sio = StringIO() #print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename) #declare inputs for ipos, i in enumerate(node.inputs): print >> sio, "\t,", "const float * i%i_data" % ipos #declare outputs for ipos, i in enumerate(node.outputs): print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;" print >> sio, " const int numThreads = blockDim.x * gridDim.x;" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable for ipos, i in enumerate(node.inputs): if _logical_scalar(i): print >> sio, " const float ii_i%i_value = i%i_data[0];" % ( ipos, ipos) #loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code( Apply(self.scalar_op, [ scalar.Scalar(dtype=input.type.dtype)() for input in node.inputs ], [ scalar.Scalar(dtype=output.type.dtype)() for output in node.outputs ]), nodename + '_scalar_' #, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)] , get_str_list_logical_scalar(node, data_str='i%i_data[i]'), ['o%i_data[i]' % ipos for ipos, i in enumerate(node.outputs)], sub=dict(fail='return;')) #TODO: set a failure code somehow!!! print >> sio, " ", task_code print >> sio, " }" print >> sio, "}" #print sio.getvalue() return sio.getvalue()
def run(replay, log=None): if not replay: log = StringIO() else: log = StringIO(log) record = Record(replay=replay, file_object=log) disturb_mem.disturb_mem() mode = RecordMode(record=record) b = sharedX(np.zeros((2, )), name='b') channels = OrderedDict() disturb_mem.disturb_mem() v_max = b.max(axis=0) v_min = b.min(axis=0) v_range = v_max - v_min updates = [] for i, val in enumerate([ v_max.max(), v_max.min(), v_range.max(), ]): disturb_mem.disturb_mem() s = sharedX(0., name='s_' + str(i)) updates.append((s, val)) for var in theano.gof.graph.ancestors(update for _, update in updates): if var.name is not None and var.name is not 'b': if var.name[0] != 's' or len(var.name) != 2: var.name = None for key in channels: updates.append((s, channels[key])) f = theano.function([], mode=mode, updates=updates, on_unused_input='ignore', name='f') for output in f.maker.fgraph.outputs: mode.record.handle_line(var_descriptor(output) + '\n') disturb_mem.disturb_mem() f() mode.record.f.flush() if not replay: return log.getvalue()
def run(replay, log=None): if not replay: log = StringIO() else: log = StringIO(log) record = Record(replay=replay, file_object=log) disturb_mem.disturb_mem() mode = RecordMode(record=record) b = sharedX(np.zeros((2,)), name='b') channels = OrderedDict() disturb_mem.disturb_mem() v_max = b.max(axis=0) v_min = b.min(axis=0) v_range = v_max - v_min updates = [] for i, val in enumerate([ v_max.max(), v_max.min(), v_range.max(), ]): disturb_mem.disturb_mem() s = sharedX(0., name='s_' + str(i)) updates.append((s, val)) for var in theano.gof.graph.ancestors(update for _, update in updates): if var.name is not None and var.name is not 'b': if var.name[0] != 's' or len(var.name) != 2: var.name = None for key in channels: updates.append((s, channels[key])) f = theano.function([], mode=mode, updates=updates, on_unused_input='ignore', name='f') for output in f.maker.fgraph.outputs: mode.record.handle_line(var_descriptor(output) + '\n') disturb_mem.disturb_mem() f() mode.record.f.flush() if not replay: return log.getvalue()
def test_pydotprint_long_name(): """This is a REALLY PARTIAL TEST. It prints a graph where there are variable and apply nodes whose long names are different, but not the shortened names. We should not merge those nodes in the dot graph. """ # Skip test if pydot is not available. if not theano.printing.pydot_imported: raise SkipTest('pydot not available') x = tensor.dvector() mode = theano.compile.mode.get_default_mode().excluding("fusion") f = theano.function([x], [x * 2, x + x], mode=mode) f([1, 2, 3, 4]) s = StringIO() new_handler = logging.StreamHandler(s) new_handler.setLevel(logging.DEBUG) orig_handler = theano.logging_default_handler theano.printing.pydotprint(f, max_label_size=5, print_output_file=False, assert_nb_all_strings=6)
def test_pydotprint_variables(): """ This is a REALLY PARTIAL TEST. I did them to help debug stuff. It make sure the code run. """ # Skip test if pydot is not available. if not theano.printing.pydot_imported: raise SkipTest('pydot not available') x = tensor.dvector() s = StringIO() new_handler = logging.StreamHandler(s) new_handler.setLevel(logging.DEBUG) orig_handler = theano.logging_default_handler theano.theano_logger.removeHandler(orig_handler) theano.theano_logger.addHandler(new_handler) try: theano.printing.pydotprint(x * 2) theano.printing.pydotprint_variables(x * 2) finally: theano.theano_logger.addHandler(orig_handler) theano.theano_logger.removeHandler(new_handler)
def test_dnn_tag(): """ Test that if cudnn isn't avail we crash and that if it is avail, we use it. """ x = T.ftensor4() old = theano.config.on_opt_error theano.config.on_opt_error = "raise" sio = StringIO() handler = logging.StreamHandler(sio) logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler) # Silence original handler when intentionnally generating warning messages logging.getLogger('theano').removeHandler(theano.logging_default_handler) raised = False try: f = theano.function([x], max_pool_2d(x, ds=(2, 2), ignore_border=True), mode=mode_with_gpu.including("cudnn")) except (AssertionError, RuntimeError): assert not cuda.dnn.dnn_available() raised = True finally: theano.config.on_opt_error = old logging.getLogger('theano.compile.tests.test_dnn').removeHandler( handler) logging.getLogger('theano').addHandler(theano.logging_default_handler) if not raised: assert cuda.dnn.dnn_available() assert any([ isinstance(n.op, cuda.dnn.GpuDnnPool) for n in f.maker.fgraph.toposort() ])
def c_src_kernel_Ccontiguous(self, node, nodename): sio = StringIO() # print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_Ccontiguous (unsigned int numEls" % (self.scalar_op.__class__.__name__, nodename) # declare inputs for ipos, i in enumerate(node.inputs): print >> sio, "\t,", "const float * i%i_data" % ipos # declare outputs for ipos, i in enumerate(node.outputs): print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;" print >> sio, " const int numThreads = blockDim.x * gridDim.x;" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable for ipos, i in enumerate(node.inputs): if _logical_scalar(i): print >> sio, " const float ii_i%i_value = i%i_data[0];" % (ipos, ipos) # loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # perform the scalar operation on the input and output references # TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code( Apply(self.scalar_op, [scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs], [scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs]) , nodename + '_scalar_' #, ['i%i_data[i]'%ipos for ipos, i in enumerate(node.inputs)] , get_str_list_logical_scalar(node, data_str='i%i_data[i]') , ['o%i_data[i]'%ipos for ipos, i in enumerate(node.outputs)] , sub=dict(fail='return;')) # TODO: set a failure code somehow!!! print >> sio, " ", task_code print >> sio, " }" print >> sio, "}" # print sio.getvalue() return sio.getvalue()
def profile_main(): # This is the main function for profiling # We've renamed our original main() above to real_main() import cProfile import pstats from theano.compat.six import StringIO prof = cProfile.Profile() prof = prof.runctx("real_main()", globals(), locals()) stream = StringIO() stats = pstats.Stats(prof) stats.sort_stats("time") # Or cumulative stats.print_stats(80) # 80 = how many to print
def test_pickle_aliased_memory(): M = Module() M.x = (T.dmatrix()) M.y = (T.dmatrix()) a = T.dmatrix() M.f = Method([a], a + M.x + M.y) M.g = Method([a], a * M.x * M.y) mode = get_mode() m = M.make(x=numpy.zeros((4, 5)), y=numpy.ones((2, 3)), mode=mode) m.y = m.x[:] #m's x and y memory is aliased.... m.x[0, 0] = 3.14 assert m.y[0, 0] == 3.14 import logging from theano.compat.six import StringIO sio = StringIO() handler = logging.StreamHandler(sio) logging.getLogger('theano.compile.function_module').addHandler(handler) # Silence original handler when intentionnally generating warning messages logging.getLogger('theano').removeHandler(theano.logging_default_handler) try: m.f.pickle_aliased_memory_strategy = 'warn' m.g.pickle_aliased_memory_strategy = 'warn' m_dup = cPickle.loads(cPickle.dumps(m, protocol=-1)) assert sio.getvalue().startswith('aliased relat') finally: logging.getLogger('theano.compile.function_module').removeHandler( handler) logging.getLogger('theano').addHandler(theano.logging_default_handler) try: m.f.pickle_aliased_memory_strategy = 'raise' m.g.pickle_aliased_memory_strategy = 'raise' m_dup = cPickle.loads(cPickle.dumps(m, protocol=-1)) except AliasedMemoryError, e: return
def test_pickle_aliased_memory(): M = Module() M.x = (T.dmatrix()) M.y = (T.dmatrix()) a = T.dmatrix() M.f = Method([a], a + M.x + M.y) M.g = Method([a], a * M.x * M.y) mode = get_mode() m = M.make(x=numpy.zeros((4,5)), y=numpy.ones((2,3)), mode=mode) m.y = m.x[:] #m's x and y memory is aliased.... m.x[0,0] = 3.14 assert m.y[0,0] == 3.14 import logging from theano.compat.six import StringIO sio = StringIO() handler = logging.StreamHandler(sio) logging.getLogger('theano.compile.function_module').addHandler(handler) # Silence original handler when intentionnally generating warning messages logging.getLogger('theano').removeHandler(theano.logging_default_handler) try: m.f.pickle_aliased_memory_strategy = 'warn' m.g.pickle_aliased_memory_strategy = 'warn' m_dup = cPickle.loads(cPickle.dumps(m, protocol=-1)) assert sio.getvalue().startswith('aliased relat') finally: logging.getLogger('theano.compile.function_module').removeHandler(handler) logging.getLogger('theano').addHandler(theano.logging_default_handler) try: m.f.pickle_aliased_memory_strategy = 'raise' m.g.pickle_aliased_memory_strategy = 'raise' m_dup = cPickle.loads(cPickle.dumps(m, protocol=-1)) except AliasedMemoryError, e: return
def get_parse_error(code): """ Checks code for ambiguous tabs or other basic parsing issues. :param code: a string containing a file's worth of Python code :returns: a string containing a description of the first parse error encountered, or None if the code is ok """ # note that this uses non-public elements from stdlib's tabnanny, because tabnanny # is (very frustratingly) written only to be used as a script, but using it that way # in this context requires writing temporarily files, running subprocesses, blah blah blah code_buffer = StringIO(code) try: tabnanny.process_tokens(tokenize.generate_tokens(code_buffer.readline)) except tokenize.TokenError, err: return "Could not parse code: %s" % err
def test_dnn_tag(): """ We test that if cudnn isn't avail we crash and that if it is avail, we use it. """ x = T.ftensor4() old = theano.config.on_opt_error theano.config.on_opt_error = "raise" sio = StringIO() handler = logging.StreamHandler(sio) logging.getLogger('theano.compile.tests.test_dnn').addHandler(handler) # Silence original handler when intentionnally generating warning messages logging.getLogger('theano').removeHandler(theano.logging_default_handler) raised = False try: f = theano.function([x], max_pool_2d(x, ds=(2, 2)), mode=mode_with_gpu.including("cudnn")) except RuntimeError, e: assert not cuda.dnn.dnn_available() raised = True
def c_code(self, node, nodename, inp, out, sub): typecode_x = pygpu.gpuarray.dtype_to_typecode(node.inputs[0].dtype) typecode_b = pygpu.gpuarray.dtype_to_typecode(node.inputs[1].dtype) typecode_y_idx = pygpu.gpuarray.dtype_to_typecode(node.inputs[2].dtype) itemsize_x = numpy.dtype(node.inputs[0].dtype).itemsize itemsize_b = numpy.dtype(node.inputs[1].dtype).itemsize itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize itemsize_nll = numpy.dtype(node.outputs[0].dtype).itemsize itemsize_sm = numpy.dtype(node.outputs[1].dtype).itemsize itemsize_am = numpy.dtype(node.outputs[2].dtype).itemsize x, b, y_idx = inp nll, sm, am = out dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_nll = node.outputs[0].dtype dtype_sm = node.outputs[1].dtype dtype_am = node.outputs[2].dtype classname = self.__class__.__name__ fail = sub['fail'] sio = StringIO() print >> sio, """ if (PyGpuArray_NDIM(%(y_idx)s) != 1) { PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor"); %(fail)s; } if (PyGpuArray_NDIM(%(x)s) != 2) { PyErr_SetString(PyExc_ValueError, "x not 2d tensor"); %(fail)s; } if (PyGpuArray_NDIM(%(b)s) != 1) { PyErr_SetString(PyExc_ValueError, "b not 1d tensor"); %(fail)s; } if (PyGpuArray_DIMS(%(x)s)[0] != PyGpuArray_DIMS(%(y_idx)s)[0]) { PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments"); %(fail)s; } if (PyGpuArray_DIMS(%(x)s)[1] != PyGpuArray_DIMS(%(b)s)[0]) { PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments"); %(fail)s; } if ((NULL == %(nll)s) //initial condition || (PyGpuArray_DIMS(%(nll)s)[0] != PyGpuArray_DIMS(%(y_idx)s)[0])) { Py_XDECREF(%(nll)s); %(nll)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s), %(typecode_x)s, GA_C_ORDER, pygpu_default_context(), Py_None); if (!%(nll)s) { %(fail)s } } if ((NULL == %(sm)s) || (PyGpuArray_DIMS(%(sm)s)[0] != PyGpuArray_DIMS(%(x)s)[0]) || (PyGpuArray_DIMS(%(sm)s)[1] != PyGpuArray_DIMS(%(x)s)[1])) { Py_XDECREF(%(sm)s); %(sm)s = pygpu_empty(2, PyGpuArray_DIMS(%(x)s), %(typecode_b)s, GA_C_ORDER, pygpu_default_context(), Py_None); if(!%(sm)s) { PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output"); // no need to decref cnda_nll, the cleanup code should do it up %(fail)s; } } if ((NULL == %(am)s) || (PyGpuArray_DIMS(%(am)s)[0] != PyGpuArray_DIMS(%(y_idx)s)[0])) { Py_XDECREF(%(am)s); %(am)s = pygpu_empty(1, PyGpuArray_DIMS(%(y_idx)s), %(typecode_y_idx)s, GA_C_ORDER, pygpu_default_context(), Py_None); if(!%(am)s) { PyErr_SetString(PyExc_MemoryError, "failed to alloc am output"); // no need to decref nll and sm, // the cleanup code should do it up %(fail)s; } } { int n_blocks = PyGpuArray_DIMS(%(x)s)[0] < 256 ? PyGpuArray_DIMS(%(x)s)[0] : 256; //TODO: launch more threads per row and do parallel sum and max reductions int n_threads = 1; int n_shared_bytes = 0; //n_threads * sizeof(dtype); k_xent_sm_1hot_bias_%(nodename)s<<<n_blocks, n_threads, n_shared_bytes>>>( PyGpuArray_DIMS(%(x)s)[0], PyGpuArray_DIMS(%(x)s)[1], (npy_%(dtype_x)s*)(((char *)cuda_get_ptr(%(x)s->ga.data)) + %(x)s->ga.offset), PyGpuArray_STRIDES(%(x)s)[0] / %(itemsize_x)s, PyGpuArray_STRIDES(%(x)s)[1] / %(itemsize_x)s, (npy_%(dtype_b)s*)(((char *)cuda_get_ptr(%(b)s->ga.data)) + %(b)s->ga.offset), PyGpuArray_STRIDES(%(b)s)[0] / %(itemsize_b)s, (npy_%(dtype_y_idx)s*)(((char *)cuda_get_ptr(%(y_idx)s->ga.data)) + %(y_idx)s->ga.offset), PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s, (npy_%(dtype_nll)s*)(((char *)cuda_get_ptr(%(nll)s->ga.data)) + %(nll)s->ga.offset), PyGpuArray_STRIDES(%(nll)s)[0] / %(itemsize_nll)s, (npy_%(dtype_sm)s*)(((char *)cuda_get_ptr(%(sm)s->ga.data)) + %(sm)s->ga.offset), PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s, PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s, (npy_%(dtype_am)s*)(((char *)cuda_get_ptr(%(am)s->ga.data)) + %(am)s->ga.offset), PyGpuArray_STRIDES(%(am)s)[0] / %(itemsize_am)s); cudaError_t err = cudaGetLastError(); if (cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n" "The kernel was launched with %%d threads," " %%d blocks and %%d shared memory\\n", cudaGetErrorString(err), n_threads, n_blocks, n_shared_bytes); // no need to decref output vars the cleanup code will do it %(fail)s; } } """ % locals() return sio.getvalue()
def c_code(self, node, name, inputs, outputs, sub): #z_out = alpha * dot(x,y) + beta * z_in #inplace version, set set z_out = z_in #not inplace version, we copy z_in to z_out. z_in, a, x, y, b = inputs z_out, = outputs inplace = int(self.inplace) fail = sub['fail'] sio = StringIO() print >> sio, """ #define REAL float float %(name)s_a = (PyArray_TYPE(%(a)s) == NPY_FLOAT) ? (REAL)(((float*)%(a)s->data)[0]) : (REAL)(((double*)%(a)s->data)[0]); float %(name)s_b = (PyArray_TYPE(%(b)s) == NPY_FLOAT) ? (REAL)(((float*)%(b)s->data)[0]) : (REAL)(((double*)%(b)s->data)[0]); #undef REAL if (%(inplace)s && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0) && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0) && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1) || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1))) { // The input has an appropriate layout, we work inplace Py_XDECREF(%(z_out)s); %(z_out)s = %(z_in)s; Py_INCREF(%(z_out)s); } else if (%(z_out)s && (%(z_out)s->nd == 2) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[1] == CudaNdarray_HOST_DIMS(%(z_in)s)[1]) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0) && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1) || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1))) { // The existing output has an appropriate layout, // copy the input data into it, then work inplace if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) { %(fail)s; } } else { // Copy the input, use the copy as output Py_XDECREF(%(z_out)s); %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s); if (!%(z_out)s) { %(fail)s; } } if (CudaNdarray_gemm(%(name)s_a, %(x)s, %(y)s, %(name)s_b, %(z_out)s)) { %(fail)s; } """ return sio.getvalue() % locals()
def c_code(self, node, nodename, inp, out, sub): x, b, y_idx = inp nll, sm, am = out classname = self.__class__.__name__ fail = sub['fail'] sio = StringIO() print >> sio, """ if (CudaNdarray_NDIM(%(y_idx)s) != 1) { PyErr_SetString(PyExc_ValueError, "y_idx not 1d tensor"); %(fail)s; } if (CudaNdarray_NDIM(%(x)s) != 2) { PyErr_SetString(PyExc_ValueError, "x not 2d tensor"); %(fail)s; } if (CudaNdarray_NDIM(%(b)s) != 1) { PyErr_SetString(PyExc_ValueError, "b not 1d tensor"); %(fail)s; } if (CudaNdarray_HOST_DIMS(%(x)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0]) { PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,y_idx arguments"); %(fail)s; } if (CudaNdarray_HOST_DIMS(%(x)s)[1] != CudaNdarray_HOST_DIMS(%(b)s)[0]) { PyErr_SetString(PyExc_ValueError, "dimension mismatch in x,b arguments"); %(fail)s; } if ((NULL == %(nll)s) //initial condition || (CudaNdarray_HOST_DIMS(%(nll)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])) { Py_XDECREF(%(nll)s); %(nll)s = (CudaNdarray*)CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s)); if(!%(nll)s) { %(fail)s; } } if ((NULL == %(sm)s) || (CudaNdarray_HOST_DIMS(%(sm)s)[0] != CudaNdarray_HOST_DIMS(%(x)s)[0]) || (CudaNdarray_HOST_DIMS(%(sm)s)[1] != CudaNdarray_HOST_DIMS(%(x)s)[1])) { Py_XDECREF(%(sm)s); %(sm)s = (CudaNdarray*) CudaNdarray_NewDims(2, CudaNdarray_HOST_DIMS(%(x)s)); if(!%(sm)s) { PyErr_SetString(PyExc_MemoryError, "failed to alloc sm output"); // no need to decref cnda_nll, the cleanup code should do it up %(fail)s; } } if ((NULL == %(am)s) || (CudaNdarray_HOST_DIMS(%(am)s)[0] != CudaNdarray_HOST_DIMS(%(y_idx)s)[0])) { Py_XDECREF(%(am)s); %(am)s = (CudaNdarray*) CudaNdarray_NewDims(1, CudaNdarray_HOST_DIMS(%(y_idx)s)); if(!%(am)s) { PyErr_SetString(PyExc_MemoryError, "failed to alloc am output"); // no need to decref nll and sm, // the cleanup code should do it up %(fail)s; } } { int n_blocks = std::min(CudaNdarray_HOST_DIMS(%(x)s)[0], NUM_VECTOR_OP_BLOCKS); //TODO: launch more threads per row and do parallel sum and max reductions int n_threads = 1; int n_shared_bytes = 0; //n_threads * sizeof(float); k_xent_sm_1hot_bias<<<n_blocks, n_threads, n_shared_bytes>>>( CudaNdarray_HOST_DIMS(%(x)s)[0], CudaNdarray_HOST_DIMS(%(x)s)[1], CudaNdarray_DEV_DATA(%(x)s), CudaNdarray_HOST_STRIDES(%(x)s)[0], CudaNdarray_HOST_STRIDES(%(x)s)[1], CudaNdarray_DEV_DATA(%(b)s), CudaNdarray_HOST_STRIDES(%(b)s)[0], CudaNdarray_DEV_DATA(%(y_idx)s), CudaNdarray_HOST_STRIDES(%(y_idx)s)[0], CudaNdarray_DEV_DATA(%(nll)s), CudaNdarray_HOST_STRIDES(%(nll)s)[0], CudaNdarray_DEV_DATA(%(sm)s), CudaNdarray_HOST_STRIDES(%(sm)s)[0], CudaNdarray_HOST_STRIDES(%(sm)s)[1], CudaNdarray_DEV_DATA(%(am)s), CudaNdarray_HOST_STRIDES(%(am)s)[0]); CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if (cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %(classname)s %(nodename)s: %%s.\\n" "The kernel was launched with %%d threads," " %%d blocks and %%d shared memory\\n", cudaGetErrorString(err), n_threads, n_blocks, n_shared_bytes); // no need to decref output vars the cleanup code will do it %(fail)s; } } """ % locals() return sio.getvalue()
def debugprint(obj, depth=-1, print_type=False, file=None, ids='CHAR', stop_on_name=False): """Print a computation graph as text to stdout or a file. :type obj: Variable, Apply, or Function instance :param obj: symbolic thing to print :type depth: integer :param depth: print graph to this depth (-1 for unlimited) :type print_type: boolean :param print_type: whether to print the type of printed objects :type file: None, 'str', or file-like object :param file: print to this file ('str' means to return a string) :type ids: str :param ids: How do we print the identifier of the variable id - print the python id value int - print integer character CHAR - print capital character "" - don't print an identifier :param stop_on_name: When True, if a node in the graph has a name, we don't print anything below it. :returns: string if `file` == 'str', else file arg Each line printed represents a Variable in the graph. The indentation of lines corresponds to its depth in the symbolic graph. The first part of the text identifies whether it is an input (if a name or type is printed) or the output of some Apply (in which case the Op is printed). The second part of the text is an identifier of the Variable. If print_type is True, we add a part containing the type of the Variable If a Variable is encountered multiple times in the depth-first search, it is only printed recursively the first time. Later, just the Variable identifier is printed. If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's identifier, to indicate which output a line corresponds to. """ if file == 'str': _file = StringIO() elif file is None: _file = sys.stdout else: _file = file done = dict() results_to_print = [] order = [] if isinstance(obj, (list, tuple)): lobj = obj else: lobj = [obj] for obj in lobj: if isinstance(obj, gof.Variable): results_to_print.append(obj) elif isinstance(obj, gof.Apply): results_to_print.extend(obj.outputs) elif isinstance(obj, Function): results_to_print.extend(obj.maker.fgraph.outputs) order = obj.maker.fgraph.toposort() elif isinstance(obj, gof.FunctionGraph): results_to_print.extend(obj.outputs) order = obj.toposort() elif isinstance(obj, (int, long, float, numpy.ndarray)): print obj elif isinstance(obj, (theano.In, theano.Out)): results_to_print.append(obj.variable) else: raise TypeError("debugprint cannot print an object of this type", obj) scan_ops = [] for r in results_to_print: # Add the parent scan op to the list as well if (hasattr(r.owner, 'op') and isinstance(r.owner.op, theano.scan_module.scan_op.Scan)): scan_ops.append(r) debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file, order=order, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name) if len(scan_ops) > 0: print >> file, "" new_prefix = ' >' new_prefix_child = ' >' print >> file, "Inner graphs of the scan ops:" for s in scan_ops: print >> file, "" debugmode.debugprint(s, depth=depth, done=done, print_type=print_type, file=_file, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name) if hasattr(s.owner.op, 'fn'): # If the op was compiled, print the optimized version. outputs = s.owner.op.fn.maker.fgraph.outputs else: outputs = s.owner.op.output for idx, i in enumerate(outputs): if hasattr(i, 'owner') and hasattr(i.owner, 'op'): if isinstance(i.owner.op, theano.scan_module.scan_op.Scan): scan_ops.append(i) debugmode.debugprint(r=i, prefix=new_prefix, depth=depth, done=done, print_type=print_type, file=file, ids=ids, stop_on_name=stop_on_name, prefix_child=new_prefix_child, scan_ops=scan_ops) if file is _file: return file elif file == 'str': return _file.getvalue() else: _file.flush()
def c_extract(self, name, sub, check_input=True, check_broadcast=True): sio = StringIO() fail = sub['fail'] nd = self.ndim print >> sio, """ assert(py_%(name)s->ob_refcnt >= 2); // There should be at least one ref from the container object, // and one ref from the local scope. if (CudaNdarray_Check(py_%(name)s)) { //fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); %(name)s = (CudaNdarray*)py_%(name)s; //std::cerr << "c_extract " << %(name)s << '\\n'; """ % locals() if (check_input): print >> sio, """ if (%(name)s->nd != %(nd)s) { PyErr_Format(PyExc_RuntimeError, "c_extract: Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", %(name)s->nd); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract " << %(name)s << " nd check passed\\n"; """ % locals() for i, b in enumerate(self.broadcastable): if b and check_broadcast: print >> sio, """ if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1) { PyErr_Format(PyExc_RuntimeError, "c_extract: Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(%(name)s)[%(i)s], %(i)s); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract " << %(name)s << "dim check %(i)s passed\\n"; //std::cerr << "c_extract " << %(name)s << "checking bcast %(i)s <" << %(name)s->str<< ">\\n"; //std::cerr << "c_extract " << %(name)s->str[%(i)s] << "\\n"; if (CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s]) { //std::cerr << "c_extract bad stride detected...\\n"; PyErr_Format(PyExc_RuntimeError, "c_extract: Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s], %(i)s); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract " << %(name)s << "bcast check %(i)s passed\\n"; """ % locals() print >> sio, """ assert(%(name)s); Py_INCREF(py_%(name)s); } else if (py_%(name)s == Py_None) { PyErr_SetString(PyExc_TypeError, "expected a CudaNdarray, not None"); %(name)s = NULL; %(fail)s; } else { //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray"); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract done " << %(name)s << '\\n'; """ % locals() else: print >> sio, """ assert(%(name)s); Py_INCREF(py_%(name)s); } """ % locals() #print sio.getvalue() return sio.getvalue()
def debugprint(obj, depth=-1, print_type=False, file=None, ids='CHAR', stop_on_name=False): """Print a computation graph as text to stdout or a file. :type obj: Variable, Apply, or Function instance :param obj: symbolic thing to print :type depth: integer :param depth: print graph to this depth (-1 for unlimited) :type print_type: boolean :param print_type: whether to print the type of printed objects :type file: None, 'str', or file-like object :param file: print to this file ('str' means to return a string) :type ids: str :param ids: How do we print the identifier of the variable id - print the python id value int - print integer character CHAR - print capital character "" - don't print an identifier :param stop_on_name: When True, if a node in the graph has a name, we don't print anything below it. :returns: string if `file` == 'str', else file arg Each line printed represents a Variable in the graph. The indentation of lines corresponds to its depth in the symbolic graph. The first part of the text identifies whether it is an input (if a name or type is printed) or the output of some Apply (in which case the Op is printed). The second part of the text is an identifier of the Variable. If print_type is True, we add a part containing the type of the Variable If a Variable is encountered multiple times in the depth-first search, it is only printed recursively the first time. Later, just the Variable identifier is printed. If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's identifier, to indicate which output a line corresponds to. """ if file == 'str': _file = StringIO() elif file is None: _file = sys.stdout else: _file = file done = dict() results_to_print = [] order = [] if isinstance(obj, gof.Variable): results_to_print.append(obj) elif isinstance(obj, gof.Apply): results_to_print.extend(obj.outputs) elif isinstance(obj, Function): results_to_print.extend(obj.maker.fgraph.outputs) order = obj.maker.fgraph.toposort() elif isinstance(obj, (list, tuple)): results_to_print.extend(obj) elif isinstance(obj, gof.FunctionGraph): results_to_print.extend(obj.outputs) order = obj.toposort() elif isinstance(obj, (int, long, float, numpy.ndarray)): print obj else: raise TypeError("debugprint cannot print an object of this type", obj) for r in results_to_print: debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file, order=order, ids=ids, stop_on_name=stop_on_name) if file is _file: return file elif file == 'str': return _file.getvalue() else: _file.flush()
def test_debugprint(): A = tensor.matrix(name='A') B = tensor.matrix(name='B') C = A + B C.name = 'C' D = tensor.matrix(name='D') E = tensor.matrix(name='E') F = D + E G = C + F # just test that it work debugprint(G) # test ids=int s = StringIO() debugprint(G, file=s, ids='int') s = s.getvalue() # The additional white space are needed! reference = '\n'.join([ "Elemwise{add,no_inplace} [@0] '' ", " |Elemwise{add,no_inplace} [@1] 'C' ", " | |A [@2]", " | |B [@3]", " |Elemwise{add,no_inplace} [@4] '' ", " |D [@5]", " |E [@6]", ]) + '\n' if s != reference: print('--' + s + '--') print('--' + reference + '--') assert s == reference # test ids=CHAR s = StringIO() debugprint(G, file=s, ids='CHAR') s = s.getvalue() # The additional white space are needed! reference = "\n".join([ "Elemwise{add,no_inplace} [@A] '' ", " |Elemwise{add,no_inplace} [@B] 'C' ", " | |A [@C]", " | |B [@D]", " |Elemwise{add,no_inplace} [@E] '' ", " |D [@F]", " |E [@G]", ]) + '\n' if s != reference: print('--' + s + '--') print('--' + reference + '--') assert s == reference # test ids=CHAR, stop_on_name=True s = StringIO() debugprint(G, file=s, ids='CHAR', stop_on_name=True) s = s.getvalue() # The additional white space are needed! reference = '\n'.join([ "Elemwise{add,no_inplace} [@A] '' ", " |Elemwise{add,no_inplace} [@B] 'C' ", " |Elemwise{add,no_inplace} [@C] '' ", " |D [@D]", " |E [@E]", ]) + '\n' if s != reference: print('--' + s + '--') print('--' + reference + '--') assert s == reference # test ids= s = StringIO() debugprint(G, file=s, ids='') s = s.getvalue() # The additional white space are needed! reference = '\n'.join([ "Elemwise{add,no_inplace} '' ", " |Elemwise{add,no_inplace} 'C' ", " | |A ", " | |B ", " |Elemwise{add,no_inplace} '' ", " |D ", " |E ", ]) + '\n' if s != reference: print('--' + s + '--') print('--' + reference + '--') assert s == reference
def c_code(self, node, nodename, inputs, outputs, sub): d = dict(sub) nd = node.outputs[0].type.ndim d.update(locals()) sio = StringIO() nin = len(inputs) nout = len(outputs) fail = sub['fail'] opname = str(self.scalar_op) initial_dims = ','.join('1' for i in xrange(nd)) if 1 or self.scalar_op == scalar.pow: print >> sio, """ //std::cerr << "C_CODE %(opname)s START\\n"; //standard elemwise size checks """ % locals() if nd > 0: print >> sio, """ int dims[%(nd)s] = {%(initial_dims)s}; """ % locals() else: print >> sio, """ int *dims = NULL; """ #check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: assert emitted_inames[iname] is node.inputs[id] continue # with python 2.4 (at least), if a broadcastable pattern is made of # numpy.bool_ instead of bool, calling int() once is not enough. broadcasts = map(int, map(int, node.inputs[id].broadcastable)) broadcasts = ', '.join(map(str, broadcasts)) nd = node.inputs[id].ndim if nd > 0: print >> sio, """ int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; """ % locals() else: print >> sio, """ int *broadcasts_%(iname)s = NULL; """ % locals() emitted_inames[iname] = node.inputs[id] #check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: continue print >> sio, """ //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; if (%(nd)s != %(iname)s->nd) { PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd); %(fail)s; } for (int i = 0; i< %(nd)s; ++i) { dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i]; if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i])) { //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. Input" " %(id)d (indices start at 0) has shape[%%i] == %%i" ", but the output's size on that axis is %%i.", i, CudaNdarray_HOST_DIMS(%(iname)s)[i], dims[i] ); %(fail)s; } } """ % locals() emitted_inames[iname] = True #check that all outputs have valid dimensions for idx, oname in enumerate(outputs): if idx not in self.inplace_pattern.keys(): print >> sio, """ for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; } } if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s)) { Py_XDECREF(%(oname)s); %(oname)s = NULL; } if (NULL == %(oname)s) { %(oname)s = (CudaNdarray*)CudaNdarray_New(); if (!%(oname)s) { //error string already set %(fail)s; } if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims)) { //error string already set Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() else: input_idx = self.inplace_pattern[idx] iname = inputs[input_idx] print >> sio, """ Py_XDECREF(%(oname)s); %(oname)s = %(iname)s; Py_INCREF(%(oname)s); for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { PyErr_Format(PyExc_ValueError, "GpuElemwise. Output dimension mis-match. Output" " %(idx)d (indices start at 0), working inplace" " on input %(input_idx)s, has shape[%%i] == %%i" ", but the output's size on that axis is %%i.", i, CudaNdarray_HOST_DIMS(%(oname)s)[i], dims[i] ); Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() print >> sio, """ { //new block so that failure gotos don't skip over variable initialization //std::cerr << "calling callkernel\\n"; if (callkernel_%(nodename)s(1, 0, dims """ % locals() for iname in inputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s) """ % locals() for oname in outputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s) """ % locals() print >> sio, """ )) { // error """ for oname in outputs: print >> sio, """ Py_DECREF(%(oname)s); %(oname)s = NULL; """ % locals() print >> sio, """ %(fail)s; } else // no error { } } //std::cerr << "C_CODE %(opname)s END\\n"; """ % locals() #print sio.getvalue() return sio.getvalue()
def c_src_kernel(self, node, nodename, nd): sio = StringIO() #print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename, nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(node.inputs): s = ", ".join(["const float * i%i_data" % ipos] + ["int i%i_str_%i" % (ipos, d) for d in xrange(nd)]) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(node.outputs): s = ", ".join(["float * o%i_data" % ipos] + ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)]) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;" print >> sio, " const int numThreads = blockDim.x * gridDim.x;" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable for ipos, i in enumerate(node.inputs): if _logical_scalar(i): print >> sio, " const float ii_i%i_value = i%i_data[0];" % ( ipos, ipos) #loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # calculate the data pointers for all arguments print >> sio, " int ii = i;" for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " const float * ii_i%i_data = i%i_data;" % ( ipos, ipos) for ipos, i in enumerate(node.outputs): print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos) for d in xrange(nd - 1, -1, -1): if d > 0: print >> sio, " int pos%i = ii %% dim%i;" % (d, d) print >> sio, " ii = ii / dim%i;" % d else: print >> sio, " int pos%i = ii;" % d for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % ( ipos, d, ipos, d) for ipos, i in enumerate(node.outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % ( ipos, d, ipos, d) # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code( Apply(self.scalar_op, [ scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs ], [ scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs ]), nodename + '_scalar_', get_str_list_logical_scalar(node), ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)], sub=dict(fail='return;')) # TODO: set a failure code somehow!!! print >> sio, " ", task_code print >> sio, " }" #indent = " "*(4*d+7) #for ipos, i in enumerate(node.inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" #print sio.getvalue() return sio.getvalue()
def c_src_callkernel(self, node, nodename): # # This function serves three main goals: # # The first is stride unpacking: # it accepts input and output arguments as # float * , int* # pairs, and it constructs a kernel function call where inputs and arguments are named # like # float *, int, int, int ... # # The second is to recognize when any dimensions can be collapsed as # being contiguous. That mean that we can merge that dimensions with another # one for all inputs/outputs and have the same retusuls (confusing... read code) # # The thrid is to make a special case for scalar element. We allow the collapsing of them. # In the ccontiguous and not contiguous case, we use registers to lower the number of memory access. #TODO: make a special case for broadcasting, to store the data in shared memory. nd = node.outputs[0].type.ndim nb_inputs = len(node.inputs) nb_outputs = len(node.outputs) d = dict() #input_params and output_params go into the function declaration/definition input_params = ", ".join( "const float * i%i_data, const int * i%i_str" % (ipos, ipos) for ipos in xrange(len(node.inputs))) output_params = ", ".join("float * o%i_data, const int * o%i_str" % (ipos, ipos) for ipos in xrange(len(node.outputs))) #input_args and output_args go into the recursive call. input_args = ", ".join("i%i_data, i%i_str" % (ipos, ipos) for ipos in xrange(len(node.inputs))) output_args = ", ".join("o%i_data, o%i_str" % (ipos, ipos) for ipos in xrange(len(node.outputs))) prod_dims = '*'.join(["dims[%i]" % di for di in xrange(nd)] + ['1']) scalar_op = self.scalar_op.__class__.__name__ sio = StringIO() print >> sio, """ static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[]) { //can we collapse dims[i] and dims[i-1] for(int i=nd-1;i>0;i--){ if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd collapse[i]=1; }else collapse[i]=0; } } """ % locals() print >> sio, """ static int callkernel_%(nodename)s(unsigned int numEls, const int d, const int * dims, %(input_params)s, %(output_params)s) { numEls = %(prod_dims)s; """ % locals() if self.verbose: print >> sio, """ std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; """ % locals() print >> sio, 'std::cerr << ' + " << ' ' << ".join( ['" "'] + list("dims[%i]" % di for di in xrange(nd)) + ["'\\n';"]) if self.verbose > 1: for ipos in xrange(len(node.inputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join( ["i%s_data" % ipos] + list("i%s_str[%i]" % (ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' for ipos in xrange(len(node.outputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join( ["o%s_data" % ipos] + list("o%s_str[%i]" % (ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' # collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # do the dimensions and the strides if nd > 0: print >> sio, "int local_dims[%(nd)s];" % locals() else: print >> sio, "int *local_dims=NULL;" if nb_inputs > 0 and nd > 0: print >> sio, """ int local_str[%(nb_inputs)s][%(nd)s]; int local_ostr[%(nb_outputs)s][%(nd)s]; """ % locals() else: print >> sio, """ int local_str[1][1]; int local_ostr[1][1]; """ print >> sio, """ int nd_collapse = %(nd)s; for(int i=0;i<%(nd)s;i++){//init new dim local_dims[i]=dims[i]; } """ % locals() for ipos in xrange(len(node.inputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_str[%(ipos)s][i]=i%(ipos)s_str[i]; } """ % locals() for ipos in xrange(len(node.outputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; } """ % locals() if self.verbose > 2: print >> sio, 'std::cerr <<"before broadcast collapse\\n";' print >> sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals( ) print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(node.inputs)): print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<'%locals() + \ ' << " " << '.join(["local_str[%s][%s]"% (ipos, x) for x in xrange(nd)])+'<<"\\n";' for ipos in xrange(len(node.outputs)): print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals() + \ ' << " " << '.join(["local_ostr[%s][%s]"% (ipos, x) for x in xrange(nd)])+'<<"\\n";' print >> sio, """ for(int id=0;id<nd_collapse;id++){ bool all_broadcast=true; for(int input_id=0;input_id<%(nb_inputs)s;input_id++){ if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false; } for(int input_id=0;input_id<%(nb_outputs)s;input_id++){ if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false; } if(all_broadcast){ for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array local_dims[j-1]=local_dims[j]; for(int input_id=0;input_id<%(nb_inputs)s;input_id++){ for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array local_str[input_id][j-1]=local_str[input_id][j]; } } for(int output_id=0;output_id<%(nb_outputs)s;output_id++){ for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array local_ostr[output_id][j-1]=local_ostr[output_id][j]; } } nd_collapse--; id--; } } """ % locals() if self.verbose > 2: print >> sio, 'std::cerr <<"after broadcast collapse\\n";' print >> sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals( ) print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(node.inputs)): print >> sio, 'std::cerr << " local_str %(ipos)s: " <<' % locals( ) + ' << " " << '.join( ["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)]) + '<<"\\n";' for ipos in xrange(len(node.outputs)): print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<' % locals( ) + ' << " " << '.join( ["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)]) + '<<"\\n";' # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) # this is a good idea because we make less index calculation in the gpu. if nd > 0: print >> sio, "int nd_collapse_[%(nd)s] = {" % locals() + ','.join( ['1' for x in xrange(nd)]) + "};" else: print >> sio, "int *nd_collapse_ = NULL;" for ipos in xrange(len(node.inputs)): if not _logical_scalar(node.inputs[ipos]): if nd > 0: print >> sio, """ int nd_collapse_%(ipos)s[%(nd)s] = {""" % locals( ) + ','.join(['1' for x in xrange(nd)]) + "};" else: print >> sio, """ int *nd_collapse_%(ipos)s = NULL;""" % locals() print >> sio, """ can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); for(int i=0;i<nd_collapse;i++){ if(nd_collapse_%(ipos)s[i]==0) nd_collapse_[i]=0; } """ % locals() if self.verbose > 1: print >> sio, """ std::cerr<< "nd_collapse_%(ipos)s "<< """ % locals() print >> sio, ' << " " << '.join([ "nd_collapse_%s[" % ipos + str(i) + "]" for i in xrange(nd) ]) print >> sio, '<< "\\n";' # update the local stride. for ipos in xrange(len(node.inputs)): print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j]; } } """ % locals() for ipos in xrange(len(node.outputs)): print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j]; } } """ % locals() # update the local dims. print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_dims[i-1]*=local_dims[i];//set new dims for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array local_dims[j-1]=local_dims[j]; } } """ % locals() #update the new number of dim print >> sio, """ for(int i=1, end=nd_collapse;i<end;i++){ if(nd_collapse_[i]==1)nd_collapse--; } if(nd_collapse == 1 """ % locals() l = [ "local_str[%s][nd_collapse-1]==1 " % ipos for ipos in xrange(len(node.inputs)) if not _logical_scalar(node.inputs[ipos]) ] l += [ "local_ostr[%s][nd_collapse-1]==1 " % ipos for ipos in xrange(len(node.outputs)) if not _logical_scalar(node.outputs[ipos]) ] if len(l) > 0: print >> sio, " && ", " && ".join(l) print >> sio, """){nd_collapse=0;} """ if self.verbose: print >> sio, 'std::cerr <<"after can_collapse\\n";' print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals( ) if self.verbose > 1: for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals( ) print >> sio, 'std::cerr << "\\n";' for ipos in xrange(len(node.inputs)): print >> sio, 'std::cerr << " local_str %(ipos)s: " <<' % locals( ) + ' << " " << '.join( ["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)]) + '<<"\\n";' for ipos in xrange(len(node.outputs)): print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<' % locals( ) + ' << " " << '.join( ["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)]) + '<<"\\n";' def launch_Ccontiguous(nodename, scalar_op, sync=True): kernel_call_args = ["numEls"] for ipos in xrange(len(node.inputs)): kernel_call_args.append("i%i_data" % ipos) for ipos in xrange(len(node.outputs)): kernel_call_args.append("o%i_data" % ipos) kernel_call_args = ", ".join(kernel_call_args) verb = "" if self.verbose: verb = 'std::cerr << " Running ccontiguous version\\n";' print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); //std::cerr << "calling callkernel returned\\n"; """ % locals() if sync: print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)"); return -1; } %(verb)s return 0; """ % locals() else: print >> sio, " return 0; " % locals() def launch_General(nodename, scalar_op, force_nd, sync=True): # kernel_call_args are used to invoke the cuda kernel local = "local_" kernel_call_args = ["numEls"] kernel_call_args.extend(local + "dims[%i]" % di for di in xrange(force_nd)) for ipos in xrange(len(node.inputs)): kernel_call_args += ["i%i_data" % ipos] + list( local + "str[%i][%i]" % (ipos, di) for di in xrange(force_nd)) #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) for ipos in xrange(len(node.outputs)): kernel_call_args += ["o%i_data" % ipos] + list( local + "ostr[%i][%i]" % (ipos, di) for di in xrange(force_nd)) #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) if self.verbose: print >> sio, """ std::cerr << " Running general version with %(force_nd)s dims\\n"; """ % locals() print >> sio, "std::cerr << " + ' << " " << '.join( kernel_call_args) + ' << "\\n";' #std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; kernel_call_args = ", ".join(kernel_call_args) print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); """ % locals() if sync: print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)"); return -1; } return 0; """ % locals() else: print >> sio, " return 0; " % locals() print >> sio, "if(numEls==0) return 0;" print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {" % locals( ) print >> sio, "case 0: {" launch_Ccontiguous(nodename, scalar_op, self.sync) print >> sio, " } break;" for i in xrange(1, nd + 1): print >> sio, "case " + str(i) + ": {" launch_General(nodename, scalar_op, i, self.sync) print >> sio, " } break;" print >> sio, "}" #end case print >> sio, "return -2;" # should not get to this point print >> sio, "}" #end fct #N.B. cudaGetLastError is called by c_code return sio.getvalue()
def c_src_kernel_tiling_less_registers(self, node, nodename): """ The kernel applies to problems with <= 5 dimensions """ nd = node.outputs[0].type.ndim n_in = len(node.inputs) n_out = len(node.outputs) sio = StringIO() if nd not in (2, ): return sio.getvalue() # print some leading comments to make the code easier to read for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename, 'tiling%i_less_registers' % nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(node.inputs): s = ", ".join(["const float * i%i_data_0" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(node.outputs): s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. use_shared_stride = False use_shared_limits = False def decl_limits(nd): if use_shared_limits: print >> sio, "__shared__ float * limits[%(nd)s];" % locals() def stride(io, p, d): if use_shared_stride: return "s%s_str[%i][%i]" % (io, p, d) else: return "%s%i_str_%i" % (io, p, d) def limits(d): if use_shared_limits: return "limits[%i]" % d else: return "limits%i" % d def decl_shared_stride(nin, nout, nd): if not use_shared_stride: return print >> sio, """ __shared__ int si_str[%(nin)s][%(nd)s]; __shared__ int so_str[%(nout)s][%(nd)s]; if ((threadIdx.x == 0) && (threadIdx.y == 0)) { """ % locals() for i in xrange(nin): for d in xrange(nd): print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals( ) for i in xrange(n_out): for d in xrange(nd): print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals( ) print >> sio, "} __syncthreads();" def calc_limit(d): s = stride('o', 0, d) lname = limits(d) if use_shared_limits: print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" if d == 0: print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals( ) else: dm1 = d - 1 print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals( ) print >> sio, "} __syncthreads();" else: if d == 0: print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals( ) else: dm1 = d - 1 print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals( ) def decl_ptrs(d, offset): dm1 = d - 1 assert dm1 >= 0 for i in xrange(n_in): s = stride('i', i, d) print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals( ) for i in xrange(n_out): s = stride('o', i, d) print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals( ) def inc_ptrs(d, amt): for i in xrange(n_in): s = stride('i', i, d) print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals( ) for i in xrange(n_out): s = stride('o', i, d) print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals( ) def while_limit(d): lname = limits(d) print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals() def end_while(d): print >> sio, "}" def task_code(d): print >> sio, self.scalar_op.c_code( Apply(self.scalar_op, [ scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs ], [ scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs ]), nodename + '_scalar_', [ 'i%i_data_%i[0]' % (ipos, d) for ipos, i in enumerate(node.inputs) ], [ 'o%i_data_%i[0]' % (ipos, d) for ipos, i in enumerate(node.outputs) ], sub=dict(fail='return;')) #TODO: set a failure code somehow!!! if nd == 4: decl_shared_stride(n_in, n_out, nd) decl_limits(nd) calc_limit(0) inc_ptrs(0, 'blockIdx.x') while_limit(0) if 1: calc_limit(1) decl_ptrs(1, 'blockIdx.y') while_limit(1) if 1: calc_limit(2) decl_ptrs(2, 'threadIdx.y') while_limit(2) if 1: calc_limit(3) decl_ptrs(3, 'threadIdx.x') while_limit(3) if 1: task_code(3) inc_ptrs(3, 'blockDim.x') end_while(3) inc_ptrs(2, 'blockDim.y') end_while(2) inc_ptrs(1, 'gridDim.y') end_while(1) inc_ptrs(0, 'gridDim.x') end_while(0) print >> sio, "}" print sio.getvalue() return sio.getvalue()
def c_code(self, node, name, inputs, outputs, sub): #z_out = alpha * dot(x,y) + beta * z_in #inplace version, set set z_out = z_in #not inplace version, we copy z_in to z_out. z_in, a, x, y = inputs z_out, = outputs inplace = int(self.inplace) fail = sub['fail'] sio = StringIO() print >> sio, """ float %(name)s_alpha = ((dtype_%(a)s*)(%(a)s->data))[0]; if (%(inplace)s && (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] >= 0) && (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] >= 0) && ((CudaNdarray_HOST_DIMS(%(z_in)s)[0] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_in)s)[0] == 1) || (CudaNdarray_HOST_DIMS(%(z_in)s)[1] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_in)s)[1] == 1))) { // The input has an appropriate layout, we work inplace Py_XDECREF(%(z_out)s); %(z_out)s = %(z_in)s; Py_INCREF(%(z_out)s); } else if (%(z_out)s && (%(z_out)s->nd == 2) && (CudaNdarray_HOST_DIMS(%(z_out)s)[0] == CudaNdarray_HOST_DIMS(%(z_in)s)[0]) && (CudaNdarray_HOST_DIMS(%(z_out)s)[1] == CudaNdarray_HOST_DIMS(%(z_in)s)[1]) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] >= 0) && (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] >= 0) && ((CudaNdarray_HOST_DIMS(%(z_out)s)[0] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_out)s)[0] == 1) || (CudaNdarray_HOST_DIMS(%(z_out)s)[1] <= 1) || (CudaNdarray_HOST_STRIDES(%(z_out)s)[1] == 1))) { // The existing output has an appropriate layout, // copy the input data into it, then work inplace if (CudaNdarray_CopyFromCudaNdarray(%(z_out)s, %(z_in)s)) { %(fail)s; } } else { // Copy the input, use the copy as output Py_XDECREF(%(z_out)s); %(z_out)s = (CudaNdarray*)CudaNdarray_Copy(%(z_in)s); if (!%(z_out)s) { %(fail)s; } } if (CudaNdarray_sger(%(name)s_alpha, %(x)s, %(y)s, %(z_out)s)) { %(fail)s; } """ return sio.getvalue() % locals()
def c_src_kernel_tiling(self, node, nodename): """ The kernel applies to problems with <= 5 dimensions """ #The kernel is intended to be structured roughly like this: """ static __global__ void kernel() { for (int v = blockIdx.y; v < dim0; v += gridDim.x) { for (int w = blockIdx.y; w < dim1; w += gridDim.y) { for (int x = threadIdx.x; x < dim2; x += blockDim.x) { for (int y = threadIdx.y; y < dim3; y += blockDim.y) { for (int z = threadIdx.z; z < dim4; z += blockDim.z) { out[v * out_stride[0] + ...] = f(in1[...], in2[...]) } } } } } } """ nd = node.outputs[0].type.ndim sio = StringIO() #print 'C_SRC_KERNEL', sio.getvalue() if nd in (4, ): # print some leading comments to make the code easier to read for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename, 'tiling%i' % nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) #declare inputs for ipos, i in enumerate(node.inputs): s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #declare outputs for ipos, i in enumerate(node.outputs): s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable print >> sio, " __shared__ float value0[%i];" % len(node.inputs) print >> sio, " __shared__ int shared_dims[%(nd)s];" % locals() #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" for ipos, i in enumerate(node.inputs): if _logical_scalar(i): print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos) for ipos in xrange(nd): print >> sio, " shared_dims[%i] = dim%i;" % (ipos, ipos) print >> sio, " }" print >> sio, " __syncthreads();" if (nd == 4): print >> sio, """ for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x) { for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y) { //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x) for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y) { //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y) for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x) { """ else: raise NotImplementedError() for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " const float * ii_i%i_data = i%i_data;" % ( ipos, ipos) for ipos, i in enumerate(node.outputs): print >> sio, " float * ii_o%i_data = o%i_data;" % ( ipos, ipos) for d in xrange(nd): for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % ( ipos, d, ipos, d) for ipos, i in enumerate(node.outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % ( ipos, d, ipos, d) # perform the scalar operation on the input and output references #TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code( Apply(self.scalar_op, [ scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs ], [ scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs ]), nodename + '_scalar_', get_str_list_logical_scalar(node, value_str='value0[%i]'), [ 'ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs) ], sub=dict(fail='return;')) #TODO: set a failure code somehow!!! print >> sio, " ", task_code print >> sio, " }" * nd #TODO: insert runtime stride checks that select the best loop order either here, or in # the host code that launched the kernel (host code probably better spot) #indent = " "*(4*d+7) #for ipos, i in enumerate(node.inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" print sio.getvalue() return sio.getvalue()
def c_src_kernel_tiling(self, node, nodename): """ The kernel applies to problems with <= 5 dimensions """ # The kernel is intended to be structured roughly like this: """ static __global__ void kernel() { for (int v = blockIdx.y; v < dim0; v += gridDim.x) { for (int w = blockIdx.y; w < dim1; w += gridDim.y) { for (int x = threadIdx.x; x < dim2; x += blockDim.x) { for (int y = threadIdx.y; y < dim3; y += blockDim.y) { for (int z = threadIdx.z; z < dim4; z += blockDim.z) { out[v * out_stride[0] + ...] = f(in1[...], in2[...]) } } } } } } """ nd = node.outputs[0].type.ndim sio = StringIO() # print 'C_SRC_KERNEL', sio.getvalue() if nd in (4,): # print some leading comments to make the code easier to read for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename, 'tiling%i'%nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) # declare inputs for ipos, i in enumerate(node.inputs): s = ", ".join(["const float * i%i_data" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s # declare outputs for ipos, i in enumerate(node.outputs): s = ", ".join(["float * o%i_data" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable print >> sio, " __shared__ float value0[%i];" % len(node.inputs) print >> sio, " __shared__ int shared_dims[%(nd)s];" % locals() #print >> sio, " __shared__ int shared_i_str[%(n_in)s][%(nd)s]" print >> sio, " if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" for ipos, i in enumerate(node.inputs): if _logical_scalar(i): print >> sio, " value0[%i] = i%i_data[0];" % (ipos, ipos) for ipos in xrange(nd): print >> sio, " shared_dims[%i] = dim%i;" % (ipos, ipos) print >> sio, " }" print >> sio, " __syncthreads();" if (nd == 4): print >> sio, """ for (int pos0 = blockIdx.x; pos0 < shared_dims[0]; pos0 += gridDim.x) { for (int pos1 = blockIdx.y; pos1 < shared_dims[1]; pos1 += gridDim.y) { //for (int pos2 = threadIdx.x; pos2 < shared_dims[2]; pos2 += blockDim.x) for (int pos2 = threadIdx.y; pos2 < shared_dims[2]; pos2 += blockDim.y) { //for (int pos3 = threadIdx.y; pos3 < shared_dims[3]; pos3 += blockDim.y) for (int pos3 = threadIdx.x; pos3 < shared_dims[3]; pos3 += blockDim.x) { """ else: raise NotImplementedError() for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos) for ipos, i in enumerate(node.outputs): print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos) for d in xrange(nd): for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d) for ipos, i in enumerate(node.outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d) # perform the scalar operation on the input and output references # TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code( Apply(self.scalar_op, [scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs], [scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs]) , nodename + '_scalar_' , get_str_list_logical_scalar(node, value_str='value0[%i]') , ['ii_o%i_data[0]'%ipos for ipos, i in enumerate(node.outputs)] , sub=dict(fail='return;')) # TODO: set a failure code somehow!!! print >> sio, " ", task_code print >> sio, " }" * nd # TODO: insert runtime stride checks that select the best loop order either here, or in # the host code that launched the kernel (host code probably better spot) #indent = " "*(4*d+7) # for ipos, i in enumerate(node.inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" print sio.getvalue() return sio.getvalue()
def c_extract(self, name, sub, check_input=True): sio = StringIO() fail = sub['fail'] nd = self.ndim print >> sio, """ assert(py_%(name)s->ob_refcnt >= 2); // There should be at least one ref from the container object, // and one ref from the local scope. if (CudaNdarray_Check(py_%(name)s)) { //fprintf(stderr, "c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); %(name)s = (CudaNdarray*)py_%(name)s; //std::cerr << "c_extract " << %(name)s << '\\n'; """ % locals() if(check_input): print >> sio, """ if (%(name)s->nd != %(nd)s) { PyErr_Format(PyExc_RuntimeError, "c_extract: Some CudaNdarray has rank %%i, it was supposed to have rank %(nd)s", %(name)s->nd); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract " << %(name)s << " nd check passed\\n"; """ % locals() for i, b in enumerate(self.broadcastable): if b: print >> sio, """ if (CudaNdarray_HOST_DIMS(%(name)s)[%(i)s] != 1) { PyErr_Format(PyExc_RuntimeError, "c_extract: Some CudaNdarray has dim %%i on broadcastable dimension %%i", CudaNdarray_HOST_DIMS(%(name)s)[%(i)s], %(i)s); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract " << %(name)s << "dim check %(i)s passed\\n"; //std::cerr << "c_extract " << %(name)s << "checking bcast %(i)s <" << %(name)s->str<< ">\\n"; //std::cerr << "c_extract " << %(name)s->str[%(i)s] << "\\n"; if (CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s]) { //std::cerr << "c_extract bad stride detected...\\n"; PyErr_Format(PyExc_RuntimeError, "c_extract: Some CudaNdarray has a nonzero stride %%i on a broadcastable dimension %%i", CudaNdarray_HOST_STRIDES(%(name)s)[%(i)s], %(i)s); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract " << %(name)s << "bcast check %(i)s passed\\n"; """ % locals() print >> sio, """ assert(%(name)s); Py_INCREF(py_%(name)s); } else if (py_%(name)s == Py_None) { PyErr_SetString(PyExc_TypeError, "expected a CudaNdarray, not None"); %(name)s = NULL; %(fail)s; } else { //fprintf(stderr, "FAILING c_extract CNDA object w refcnt %%p %%i\\n", py_%(name)s, (py_%(name)s->ob_refcnt)); PyErr_SetString(PyExc_TypeError, "Argument not a CudaNdarray"); %(name)s = NULL; %(fail)s; } //std::cerr << "c_extract done " << %(name)s << '\\n'; """ % locals() else: print >> sio, """ assert(%(name)s); Py_INCREF(py_%(name)s); } """ % locals() #print sio.getvalue() return sio.getvalue()
def c_src_kernel_tiling_less_registers(self, node, nodename): """ The kernel applies to problems with <= 5 dimensions """ nd = node.outputs[0].type.ndim n_in = len(node.inputs) n_out = len(node.outputs) sio = StringIO() if nd not in (2,): return sio.getvalue() # print some leading comments to make the code easier to read for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename, 'tiling%i_less_registers'%nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) # declare inputs for ipos, i in enumerate(node.inputs): s = ", ".join(["const float * i%i_data_0" % ipos] + list("int i%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s # declare outputs for ipos, i in enumerate(node.outputs): s = ", ".join(["float * o%i_data_0" % ipos] + list("int o%i_str_%i" % (ipos, d) for d in xrange(nd))) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" # TODO: Setting these to true makes the function fail SOMETIMES. I don't know why yet. use_shared_stride = False use_shared_limits = False def decl_limits(nd): if use_shared_limits: print >> sio, "__shared__ float * limits[%(nd)s];" % locals() def stride(io, p, d): if use_shared_stride: return "s%s_str[%i][%i]" % (io, p, d) else: return "%s%i_str_%i" % (io, p, d) def limits(d): if use_shared_limits: return "limits[%i]" % d else: return "limits%i" % d def decl_shared_stride(nin, nout, nd): if not use_shared_stride: return print >> sio, """ __shared__ int si_str[%(nin)s][%(nd)s]; __shared__ int so_str[%(nout)s][%(nd)s]; if ((threadIdx.x == 0) && (threadIdx.y == 0)) { """ % locals() for i in xrange(nin): for d in xrange(nd): print >> sio, "si_str[%(i)s][%(d)s] = i%(i)s_str_%(d)s;" % locals() for i in xrange(n_out): for d in xrange(nd): print >> sio, "so_str[%(i)s][%(d)s] = o%(i)s_str_%(d)s;" % locals() print >> sio, "} __syncthreads();" def calc_limit(d): s = stride('o', 0, d) lname = limits(d) if use_shared_limits: print >> sio, "if ((threadIdx.x == 0) && (threadIdx.y == 0)) {" if d == 0: print >> sio, "%(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals() else: dm1 = d - 1 print >> sio, "%(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals() print >> sio, "} __syncthreads();" else: if d == 0: print >> sio, "const float * %(lname)s = o0_data_0 + dim%(d)s * %(s)s;" % locals() else: dm1 = d - 1 print >> sio, "const float * %(lname)s = o0_data_%(dm1)s + dim%(d)s * %(s)s;" % locals() def decl_ptrs(d, offset): dm1 = d - 1 assert dm1 >= 0 for i in xrange(n_in): s = stride('i', i, d) print >> sio, "const float * i%(i)s_data_%(d)s = i%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals() for i in xrange(n_out): s = stride('o', i, d) print >> sio, "float * o%(i)s_data_%(d)s = o%(i)s_data_%(dm1)s + %(offset)s * %(s)s;" % locals() def inc_ptrs(d, amt): for i in xrange(n_in): s = stride('i', i, d) print >> sio, "i%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals() for i in xrange(n_out): s = stride('o', i, d) print >> sio, "o%(i)s_data_%(d)s += %(amt)s * %(s)s;" % locals() def while_limit(d): lname = limits(d) print >> sio, "while (o0_data_%(d)s < %(lname)s) { " % locals() def end_while(d): print >> sio, "}" def task_code(d): print >> sio, self.scalar_op.c_code( Apply(self.scalar_op, [scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs], [scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs]) , nodename + '_scalar_' , ['i%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.inputs)] , ['o%i_data_%i[0]'%(ipos, d) for ipos, i in enumerate(node.outputs)] , sub=dict(fail='return;')) # TODO: set a failure code somehow!!! if nd == 4: decl_shared_stride(n_in, n_out, nd) decl_limits(nd) calc_limit(0) inc_ptrs(0, 'blockIdx.x') while_limit(0) if 1: calc_limit(1) decl_ptrs(1, 'blockIdx.y') while_limit(1) if 1: calc_limit(2) decl_ptrs(2, 'threadIdx.y') while_limit(2) if 1: calc_limit(3) decl_ptrs(3, 'threadIdx.x') while_limit(3) if 1: task_code(3) inc_ptrs(3, 'blockDim.x') end_while(3) inc_ptrs(2, 'blockDim.y') end_while(2) inc_ptrs(1, 'gridDim.y') end_while(1) inc_ptrs(0, 'gridDim.x') end_while(0) print >> sio, "}" print sio.getvalue() return sio.getvalue()
def __str__(self): sio = StringIO() _config_print(self.__class__, sio) return sio.getvalue()
def test_debugprint(): A = tensor.matrix(name='A') B = tensor.matrix(name='B') C = A + B C.name = 'C' D = tensor.matrix(name='D') E = tensor.matrix(name='E') F = D + E G = C + F # just test that it work debugprint(G) # test ids=int s = StringIO() debugprint(G, file=s, ids='int') s = s.getvalue() # The additional white space are needed! reference = """Elemwise{add,no_inplace} [@0] '' |Elemwise{add,no_inplace} [@1] 'C' | |A [@2] | |B [@3] |Elemwise{add,no_inplace} [@4] '' |D [@5] |E [@6] """ if s != reference: print '--' + s + '--' print '--' + reference + '--' assert s == reference # test ids=CHAR s = StringIO() debugprint(G, file=s, ids='CHAR') s = s.getvalue() # The additional white space are needed! reference = """Elemwise{add,no_inplace} [@A] '' |Elemwise{add,no_inplace} [@B] 'C' | |A [@C] | |B [@D] |Elemwise{add,no_inplace} [@E] '' |D [@F] |E [@G] """ if s != reference: print '--' + s + '--' print '--' + reference + '--' assert s == reference # test ids=CHAR, stop_on_name=True s = StringIO() debugprint(G, file=s, ids='CHAR', stop_on_name=True) s = s.getvalue() # The additional white space are needed! reference = """Elemwise{add,no_inplace} [@A] '' |Elemwise{add,no_inplace} [@B] 'C' |Elemwise{add,no_inplace} [@C] '' |D [@D] |E [@E] """ if s != reference: print '--' + s + '--' print '--' + reference + '--' assert s == reference # test ids= s = StringIO() debugprint(G, file=s, ids='') s = s.getvalue() # The additional white space are needed! reference = """Elemwise{add,no_inplace} '' |Elemwise{add,no_inplace} 'C' | |A | |B |Elemwise{add,no_inplace} '' |D |E """ if s != reference: print '--' + s + '--' print '--' + reference + '--' assert s == reference
def __str__(self): sio = StringIO() self.print_summary(sio) return sio.getvalue()
def c_src_callkernel(self, node, nodename): # # This function serves three main goals: # # The first is stride unpacking: # it accepts input and output arguments as # float * , int* # pairs, and it constructs a kernel function call where inputs and arguments are named # like # float *, int, int, int ... # # The second is to recognize when any dimensions can be collapsed as # being contiguous. That mean that we can merge that dimensions with another # one for all inputs/outputs and have the same retusuls (confusing... read code) # # The thrid is to make a special case for scalar element. We allow the collapsing of them. # In the ccontiguous and not contiguous case, we use registers to lower the number of memory access. # TODO: make a special case for broadcasting, to store the data in shared memory. nd = node.outputs[0].type.ndim nb_inputs = len(node.inputs) nb_outputs = len(node.outputs) d = dict() # input_params and output_params go into the function declaration/definition input_params = ", ".join("const float * i%i_data, const int * i%i_str"%(ipos, ipos) for ipos in xrange(len(node.inputs))) output_params = ", ".join("float * o%i_data, const int * o%i_str"%(ipos, ipos) for ipos in xrange(len(node.outputs))) # input_args and output_args go into the recursive call. input_args = ", ".join("i%i_data, i%i_str"%(ipos, ipos) for ipos in xrange(len(node.inputs))) output_args = ", ".join("o%i_data, o%i_str"%(ipos, ipos) for ipos in xrange(len(node.outputs))) prod_dims = '*'.join(["dims[%i]"%di for di in xrange(nd)]+['1']) scalar_op = self.scalar_op.__class__.__name__ sio = StringIO() print >> sio, """ static void can_collapse_%(nodename)s(int nd, const int * dims, const int * strides, int collapse[]) { //can we collapse dims[i] and dims[i-1] for(int i=nd-1;i>0;i--){ if(strides[i]*dims[i]==strides[i-1]){//the dims nd-1 are not strided again dimension nd collapse[i]=1; }else collapse[i]=0; } } """ % locals() print >> sio, """ static int callkernel_%(nodename)s(unsigned int numEls, const int d, const int * dims, %(input_params)s, %(output_params)s) { numEls = %(prod_dims)s; """ % locals() if self.verbose: print >> sio, """ std::cerr << "calling kernel_%(scalar_op)s_%(nodename)s w numEls" << numEls << " dims"<< d << "\\n"; """ % locals() print >> sio, 'std::cerr << ' + " << ' ' << ".join(['" "']+list("dims[%i]"%di for di in xrange(nd)) + ["'\\n';"]) if self.verbose > 1: for ipos in xrange(len(node.inputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join(["i%s_data"%ipos] + list("i%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' for ipos in xrange(len(node.outputs)): print >> sio, """ std::cerr << " %(ipos)s data strides" << """ % locals() + " << ' ' << ".join(["o%s_data"%ipos] + list("o%s_str[%i]"%(ipos, di) for di in xrange(nd))) + ''' << "\\n"; ''' # collapse dimension that are broadcast in all inputs. # need to be done before contiguous collapse as it will break it. # do the dimensions and the strides if nd > 0: print >> sio, "int local_dims[%(nd)s];" % locals() else: print >> sio, "int *local_dims=NULL;" if nb_inputs > 0 and nd > 0: print >> sio, """ int local_str[%(nb_inputs)s][%(nd)s]; int local_ostr[%(nb_outputs)s][%(nd)s]; """ % locals() else: print >> sio, """ int local_str[1][1]; int local_ostr[1][1]; """ print >> sio, """ int nd_collapse = %(nd)s; for(int i=0;i<%(nd)s;i++){//init new dim local_dims[i]=dims[i]; } """ % locals() for ipos in xrange(len(node.inputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_str[%(ipos)s][i]=i%(ipos)s_str[i]; } """ % locals() for ipos in xrange(len(node.outputs)): print >> sio, """ for(int i=0;i<%(nd)s;i++){//init new strides local_ostr[%(ipos)s][i]=o%(ipos)s_str[i]; } """ % locals() if self.verbose > 2: print >>sio, 'std::cerr <<"before broadcast collapse\\n";' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; ' % locals() print >> sio, 'std::cerr << "\\n";' if nd > 0: for ipos in xrange(len(node.inputs)): print >> sio, 'std::cerr << " local_str inputs %(ipos)s: " <<'%locals() + \ ' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";' for ipos in xrange(len(node.outputs)): print >> sio, 'std::cerr << " local_ostr inputs %(ipos)s: " <<'%locals() + \ ' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";' print >> sio, """ for(int id=0;id<nd_collapse;id++){ bool all_broadcast=true; for(int input_id=0;input_id<%(nb_inputs)s;input_id++){ if(local_str[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false; } for(int input_id=0;input_id<%(nb_outputs)s;input_id++){ if(local_ostr[input_id][id]!=0 || local_dims[id]!=1) all_broadcast= false; } if(all_broadcast){ for(int j=id+1;j<nd_collapse;j++)//remove dims i from the array local_dims[j-1]=local_dims[j]; for(int input_id=0;input_id<%(nb_inputs)s;input_id++){ for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array local_str[input_id][j-1]=local_str[input_id][j]; } } for(int output_id=0;output_id<%(nb_outputs)s;output_id++){ for(int j=id+1;j<nd_collapse;j++){//remove dims i from the array local_ostr[output_id][j-1]=local_ostr[output_id][j]; } } nd_collapse--; id--; } } """%locals() if self.verbose > 2: print >>sio, 'std::cerr <<"after broadcast collapse\\n";' print >>sio, 'std::cerr<< "nd_collapse "<< nd_collapse << "\\n"; ' print >> sio, 'std::cerr << "local_dims";' for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals() print >> sio, 'std::cerr << "\\n";' if nd > 0: for ipos in xrange(len(node.inputs)): print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";' for ipos in xrange(len(node.outputs)): print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]" % (ipos, x) for x in xrange(nd)])+'<<"\\n";' # collapse contiguous dimensions (ignoring scalars, generic version(collapse any dimensions, right, left, middle)) # this is a good idea because we make less index calculation in the gpu. if nd > 0: print >> sio, "int nd_collapse_[%(nd)s] = {"%locals() + ','.join(['1' for x in xrange(nd)]) + "};" else: print >> sio, "int *nd_collapse_ = NULL;" for ipos in xrange(len(node.inputs)): if not _logical_scalar(node.inputs[ipos]): if nd > 0: print >> sio, """ int nd_collapse_%(ipos)s[%(nd)s] = {"""%locals() + ','.join(['1' for x in xrange(nd)]) + "};" else: print >> sio, """ int *nd_collapse_%(ipos)s = NULL;"""%locals() print >> sio, """ can_collapse_%(nodename)s(nd_collapse, local_dims, local_str[%(ipos)s], nd_collapse_%(ipos)s); for(int i=0;i<nd_collapse;i++){ if(nd_collapse_%(ipos)s[i]==0) nd_collapse_[i]=0; } """ % locals() if self.verbose > 1: print >>sio, """ std::cerr<< "nd_collapse_%(ipos)s "<< """%locals() print >>sio, ' << " " << '.join(["nd_collapse_%s[" % ipos + str(i)+"]" for i in xrange(nd)]) print >>sio, '<< "\\n";' # update the local stride. for ipos in xrange(len(node.inputs)): print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_str[%(ipos)s][i-1]=local_str[%(ipos)s][i];//set new strides for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array local_str[%(ipos)s][j-1]=local_str[%(ipos)s][j]; } } """%locals() for ipos in xrange(len(node.outputs)): print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_ostr[%(ipos)s][i-1]=local_ostr[%(ipos)s][i];//set new strides for(int j=i+1;j<nd_collapse;j++)//remove stride i from the array local_ostr[%(ipos)s][j-1]=local_ostr[%(ipos)s][j]; } } """%locals() # update the local dims. print >> sio, """ for(int i=nd_collapse-1;i>0;i--){ if(nd_collapse_[i]==1){ local_dims[i-1]*=local_dims[i];//set new dims for(int j=i+1;j<nd_collapse;j++)//remove dims i from the array local_dims[j-1]=local_dims[j]; } } """%locals() # update the new number of dim print >> sio, """ for(int i=1, end=nd_collapse;i<end;i++){ if(nd_collapse_[i]==1)nd_collapse--; } if(nd_collapse == 1 """%locals() l = ["local_str[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.inputs)) if not _logical_scalar(node.inputs[ipos])] l += ["local_ostr[%s][nd_collapse-1]==1 "%ipos for ipos in xrange(len(node.outputs)) if not _logical_scalar(node.outputs[ipos])] if len(l) > 0: print >> sio, " && ", " && ".join(l) print >> sio, """){nd_collapse=0;} """ if self.verbose: print >> sio, 'std::cerr <<"after can_collapse\\n";' print >> sio, """std::cerr << "nd_collapse " << nd_collapse << "\\n"; """ % locals() if self.verbose > 1: for d in xrange(nd): print >> sio, 'std::cerr << " " << local_dims[%(d)s]; '%locals() print >> sio, 'std::cerr << "\\n";' if nd > 0: for ipos in xrange(len(node.inputs)): print >> sio, 'std::cerr << " local_str %(ipos)s: " <<'%locals()+' << " " << '.join(["local_str[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";' for ipos in xrange(len(node.outputs)): print >> sio, 'std::cerr << " local_ostr %(ipos)s: " <<'%locals()+' << " " << '.join(["local_ostr[%s][%s]"%(ipos, x) for x in xrange(nd)])+'<<"\\n";' def launch_Ccontiguous(nodename, scalar_op, sync=True): kernel_call_args = ["numEls"] for ipos in xrange(len(node.inputs)): kernel_call_args.append("i%i_data"%ipos) for ipos in xrange(len(node.outputs)): kernel_call_args.append("o%i_data"%ipos) kernel_call_args = ", ".join(kernel_call_args) verb = "" if self.verbose: verb = 'std::cerr << " Running ccontiguous version\\n";' print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); //std::cerr << "calling callkernel returned\\n"; """ % locals() if sync: print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)"); return -1; } %(verb)s return 0; """ % locals() else: print >> sio, " return 0; " % locals() def launch_General(nodename, scalar_op, force_nd, sync=True): # kernel_call_args are used to invoke the cuda kernel local = "local_" kernel_call_args = ["numEls"] kernel_call_args.extend(local+"dims[%i]"%di for di in xrange(force_nd)) for ipos in xrange(len(node.inputs)): kernel_call_args += ["i%i_data"%ipos] + list(local+"str[%i][%i]"%(ipos, di) for di in xrange(force_nd)) #strides = ", ".join("i%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, i%i_data" % (strides, ipos)) for ipos in xrange(len(node.outputs)): kernel_call_args += ["o%i_data"%ipos] + list(local+"ostr[%i][%i]"%(ipos, di) for di in xrange(force_nd)) #strides = ", ".join("o%i_str[%i]"%(ipos, di) for di in xrange(force_nd)) #kernel_call_args.append( "%s, o%i_data" % (strides, ipos)) if self.verbose: print >> sio, """ std::cerr << " Running general version with %(force_nd)s dims\\n"; """%locals() print >> sio, "std::cerr << " + ' << " " << '.join(kernel_call_args)+' << "\\n";' # std::cerr << numEls << dims[0] << i0_data, i0_str[0] << o0_data, o0_str[0]\n; kernel_call_args = ", ".join(kernel_call_args) print >> sio, """ //first use at least a full warp int threads_per_block = std::min(numEls, (unsigned int)32); //WARP SIZE //next start adding multiprocessors int n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (unsigned int)30); // UP TO NUMBER OF MULTIPROCESSORS // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (unsigned int)NUM_VECTOR_OP_THREADS_PER_BLOCK); kernel_%(scalar_op)s_%(nodename)s_%(force_nd)s<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s); """ % locals() if sync: print >> sio, """ CNDA_THREAD_SYNC; cudaError_t err = cudaGetLastError(); if( cudaSuccess != err) { PyErr_Format(PyExc_RuntimeError, "Cuda error: %%s: %%s.\\n n_blocks=%%i threads_per_block=%%i\\n Call: %%s\\n", "GpuElemwise %(nodename)s %(scalar_op)s", cudaGetErrorString(err), n_blocks, threads_per_block, "kernel_%(scalar_op)s_%(nodename)s_Ccontiguous<<<n_blocks, threads_per_block>>>(%(kernel_call_args)s)"); return -1; } return 0; """ % locals() else: print >> sio, " return 0; " % locals() print >> sio, "if(numEls==0) return 0;" print >> sio, "switch (nd_collapse==0?0:min(%(nd)s,nd_collapse)) {"%locals() print >> sio, "case 0: {" launch_Ccontiguous(nodename, scalar_op, self.sync) print >> sio, " } break;" for i in xrange(1, nd+1): print >> sio, "case "+str(i)+": {" launch_General(nodename, scalar_op, i, self.sync) print >> sio, " } break;" print >> sio, "}" # end case print >> sio, "return -2;" # should not get to this point print >> sio, "}" # end fct # N.B. cudaGetLastError is called by c_code return sio.getvalue()
def debugprint(obj, depth=-1, print_type=False, file=None, ids='CHAR', stop_on_name=False): """Print a computation graph as text to stdout or a file. :type obj: Variable, Apply, or Function instance :param obj: symbolic thing to print :type depth: integer :param depth: print graph to this depth (-1 for unlimited) :type print_type: boolean :param print_type: whether to print the type of printed objects :type file: None, 'str', or file-like object :param file: print to this file ('str' means to return a string) :type ids: str :param ids: How do we print the identifier of the variable id - print the python id value int - print integer character CHAR - print capital character "" - don't print an identifier :param stop_on_name: When True, if a node in the graph has a name, we don't print anything below it. :returns: string if `file` == 'str', else file arg Each line printed represents a Variable in the graph. The indentation of lines corresponds to its depth in the symbolic graph. The first part of the text identifies whether it is an input (if a name or type is printed) or the output of some Apply (in which case the Op is printed). The second part of the text is an identifier of the Variable. If print_type is True, we add a part containing the type of the Variable If a Variable is encountered multiple times in the depth-first search, it is only printed recursively the first time. Later, just the Variable identifier is printed. If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's identifier, to indicate which output a line corresponds to. """ if file == 'str': _file = StringIO() elif file is None: _file = sys.stdout else: _file = file done = dict() results_to_print = [] order = [] if isinstance(obj, (list, tuple)): lobj = obj else: lobj = [obj] for obj in lobj: if isinstance(obj, gof.Variable): results_to_print.append(obj) elif isinstance(obj, gof.Apply): results_to_print.extend(obj.outputs) elif isinstance(obj, Function): results_to_print.extend(obj.maker.fgraph.outputs) order = obj.maker.fgraph.toposort() elif isinstance(obj, gof.FunctionGraph): results_to_print.extend(obj.outputs) order = obj.toposort() elif isinstance(obj, (int, long, float, numpy.ndarray)): print obj else: raise TypeError("debugprint cannot print an object of this type", obj) scan_ops = [] for r in results_to_print: #Add the parent scan op to the list as well if hasattr(r.owner, 'op') and isinstance(r.owner.op, theano.scan_module.scan_op.Scan): scan_ops.append(r) debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file, order=order, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name) if len(scan_ops) > 0: print >> file, "" new_prefix = ' >' new_prefix_child = ' >' print >> file, "Inner graphs of the scan ops:" for s in scan_ops: print >> file, "" debugmode.debugprint(s, depth=depth, done=done, print_type=print_type, file=_file, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name) for idx, i in enumerate(s.owner.op.outputs): if hasattr(i, 'owner') and hasattr(i.owner, 'op'): if isinstance(i.owner.op, theano.scan_module.scan_op.Scan): scan_ops.append(i) debugmode.debugprint(r=i, prefix=new_prefix, depth=depth, done=done, print_type=print_type, file=file, ids=ids, stop_on_name=stop_on_name, prefix_child=new_prefix_child, scan_ops=scan_ops) if file is _file: return file elif file == 'str': return _file.getvalue() else: _file.flush()
def c_src_kernel(self, node, nodename, nd): sio = StringIO() # print 'C_SRC_KERNEL', sio.getvalue() for ipos, i in enumerate(node.inputs): print >> sio, "// Input ", ipos, str(i.type) for ipos, i in enumerate(node.outputs): print >> sio, "// Output ", ipos, str(i.type) print >> sio, "static __global__ void kernel_%s_%s_%s(unsigned int numEls" % ( self.scalar_op.__class__.__name__, nodename, nd) if (nd): print >> sio, "\t,", ", ".join("const int dim%i" % i for i in xrange(nd)) # declare inputs for ipos, i in enumerate(node.inputs): s = ", ".join(["const float * i%i_data" % ipos] + ["int i%i_str_%i" % (ipos, d) for d in xrange(nd)]) print >> sio, "\t,", s # declare outputs for ipos, i in enumerate(node.outputs): s = ", ".join(["float * o%i_data" % ipos] + ["int o%i_str_%i" % (ipos, d) for d in xrange(nd)]) print >> sio, "\t,", s #print >> sio, "\t,", ", ".join("int o%i_str_%i" % (ipos, d) for d in xrange(nd)) #print >> sio, "\t,", "float * o%i_data" % ipos print >> sio, "\t)\n{" print >> sio, " const int idx = blockIdx.x * blockDim.x + threadIdx.x;" print >> sio, " const int numThreads = blockDim.x * gridDim.x;" # For each input that is a scalar which has been broadcasted to a tensor, # load it into a local variable for ipos, i in enumerate(node.inputs): if _logical_scalar(i): print >> sio, " const float ii_i%i_value = i%i_data[0];" % (ipos, ipos) # loop over the elements to be treated by this kernel call print >> sio, " for (int i = idx; i < numEls; i += numThreads) {" # calculate the data pointers for all arguments print >> sio, " int ii = i;" for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " const float * ii_i%i_data = i%i_data;" % (ipos, ipos) for ipos, i in enumerate(node.outputs): print >> sio, " float * ii_o%i_data = o%i_data;" % (ipos, ipos) for d in xrange(nd-1, -1, -1): if d > 0: print >> sio, " int pos%i = ii %% dim%i;" % (d, d) print >> sio, " ii = ii / dim%i;" % d else: print >> sio, " int pos%i = ii;" % d for ipos, i in enumerate(node.inputs): if not _logical_scalar(i): print >> sio, " ii_i%i_data += pos%i * i%i_str_%i;" % (ipos, d, ipos, d) for ipos, i in enumerate(node.outputs): print >> sio, " ii_o%i_data += pos%i * o%i_str_%i;" % (ipos, d, ipos, d) # perform the scalar operation on the input and output references # TODO: What if the scalar_op needs support_code?? task_code = self.scalar_op.c_code( Apply(self.scalar_op, [scalar.Scalar(dtype=input.type.dtype).make_variable() for input in node.inputs], [scalar.Scalar(dtype=output.type.dtype).make_variable() for output in node.outputs]), nodename + '_scalar_', get_str_list_logical_scalar(node), ['ii_o%i_data[0]' % ipos for ipos, i in enumerate(node.outputs)], sub=dict(fail='return;')) # TODO: set a failure code somehow!!! print >> sio, " ", task_code print >> sio, " }" #indent = " "*(4*d+7) # for ipos, i in enumerate(node.inputs): #print >> sio, indent, "const float * i%i" % ipos, '= i%i_data', '' print >> sio, "}" # print sio.getvalue() return sio.getvalue()
def c_code(self, node, nodename, inputs, outputs, sub): d = dict(sub) nd = node.outputs[0].type.ndim d.update(locals()) sio = StringIO() nin = len(inputs) nout = len(outputs) fail = sub['fail'] opname = str(self.scalar_op) initial_dims = ','.join('1' for i in xrange(nd)) if 1 or self.scalar_op == scalar.pow: print >> sio, """ //std::cerr << "C_CODE %(opname)s START\\n"; //standard elemwise size checks """ % locals() if nd > 0: print >> sio, """ int dims[%(nd)s] = {%(initial_dims)s}; """ % locals() else: print >> sio, """ int *dims = NULL; """ # check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: assert emitted_inames[iname] is node.inputs[id] continue # with python 2.4 (at least), if a broadcastable pattern is made of # numpy.bool_ instead of bool, calling int() once is not enough. broadcasts = map(int, map(int, node.inputs[id].broadcastable)) broadcasts = ', '.join(map(str, broadcasts)) nd = node.inputs[id].ndim if nd > 0: print >> sio, """ int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; """ % locals() else: print >> sio, """ int *broadcasts_%(iname)s = NULL; """ % locals() emitted_inames[iname] = node.inputs[id] # check that all inputs have valid dimensions emitted_inames = {} for id, iname in enumerate(inputs): if iname in emitted_inames: continue print >> sio, """ //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; if (%(nd)s != %(iname)s->nd) { PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", %(iname)s->nd); %(fail)s; } for (int i = 0; i< %(nd)s; ++i) { dims[i] = (dims[i] == 1) ? CudaNdarray_HOST_DIMS(%(iname)s)[i] : dims[i]; if ((!(broadcasts_%(iname)s[i] && CudaNdarray_HOST_DIMS(%(iname)s)[i] == 1)) && (dims[i] != CudaNdarray_HOST_DIMS(%(iname)s)[i])) { //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. Input" " %(id)d (indices start at 0) has shape[%%i] == %%i" ", but the output's size on that axis is %%i.", i, CudaNdarray_HOST_DIMS(%(iname)s)[i], dims[i] ); %(fail)s; } } """ % locals() emitted_inames[iname] = True # check that all outputs have valid dimensions for idx, oname in enumerate(outputs): if idx not in self.inplace_pattern.keys(): print >> sio, """ for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; } } if (%(oname)s && !CudaNdarray_is_c_contiguous(%(oname)s)) { Py_XDECREF(%(oname)s); %(oname)s = NULL; } if (NULL == %(oname)s) { %(oname)s = (CudaNdarray*)CudaNdarray_New(); if (!%(oname)s) { //error string already set %(fail)s; } if (CudaNdarray_alloc_contiguous(%(oname)s, %(nd)s, dims)) { //error string already set Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() else: input_idx = self.inplace_pattern[idx] iname = inputs[input_idx] print >> sio, """ Py_XDECREF(%(oname)s); %(oname)s = %(iname)s; Py_INCREF(%(oname)s); for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != CudaNdarray_HOST_DIMS(%(oname)s)[i]) { PyErr_Format(PyExc_ValueError, "GpuElemwise. Output dimension mis-match. Output" " %(idx)d (indices start at 0), working inplace" " on input %(input_idx)s, has shape[%%i] == %%i" ", but the output's size on that axis is %%i.", i, CudaNdarray_HOST_DIMS(%(oname)s)[i], dims[i] ); Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << %(oname)s->nd << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() print >> sio, """ { //new block so that failure gotos don't skip over variable initialization //std::cerr << "calling callkernel\\n"; if (callkernel_%(nodename)s(1, 0, dims """ % locals() for iname in inputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(iname)s), CudaNdarray_HOST_STRIDES(%(iname)s) """ % locals() for oname in outputs: print >> sio, """ , CudaNdarray_DEV_DATA(%(oname)s), CudaNdarray_HOST_STRIDES(%(oname)s) """ % locals() print >> sio, """ )) { // error """ for oname in outputs: print >> sio, """ Py_DECREF(%(oname)s); %(oname)s = NULL; """ % locals() print >> sio, """ %(fail)s; } else // no error { } } //std::cerr << "C_CODE %(opname)s END\\n"; """ % locals() # print sio.getvalue() return sio.getvalue()
def debugprint(obj, depth=-1, print_type=False, file=None, ids='CHAR', stop_on_name=False, done=None): """Print a computation graph as text to stdout or a file. :type obj: Variable, Apply, or Function instance :param obj: symbolic thing to print :type depth: integer :param depth: print graph to this depth (-1 for unlimited) :type print_type: boolean :param print_type: whether to print the type of printed objects :type file: None, 'str', or file-like object :param file: print to this file ('str' means to return a string) :type ids: str :param ids: How do we print the identifier of the variable id - print the python id value int - print integer character CHAR - print capital character "" - don't print an identifier :param stop_on_name: When True, if a node in the graph has a name, we don't print anything below it. :type done: None or dict :param done: A dict where we store the ids of printed node. Useful to have multiple call to debugprint share the same ids. :returns: string if `file` == 'str', else file arg Each line printed represents a Variable in the graph. The indentation of lines corresponds to its depth in the symbolic graph. The first part of the text identifies whether it is an input (if a name or type is printed) or the output of some Apply (in which case the Op is printed). The second part of the text is an identifier of the Variable. If print_type is True, we add a part containing the type of the Variable If a Variable is encountered multiple times in the depth-first search, it is only printed recursively the first time. Later, just the Variable identifier is printed. If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's identifier, to indicate which output a line corresponds to. """ if not isinstance(depth, int): raise Exception("depth parameter must be an int") if file == 'str': _file = StringIO() elif file is None: _file = sys.stdout else: _file = file if done is None: done = dict() results_to_print = [] profile_list = [] order = [] if isinstance(obj, (list, tuple)): lobj = obj else: lobj = [obj] for obj in lobj: if isinstance(obj, gof.Variable): results_to_print.append(obj) profile_list.append(None) elif isinstance(obj, gof.Apply): results_to_print.extend(obj.outputs) profile_list.extend([None for item in obj.outputs]) elif isinstance(obj, Function): results_to_print.extend(obj.maker.fgraph.outputs) profile_list.extend( [obj.profile for item in obj.maker.fgraph.outputs]) order = obj.maker.fgraph.toposort() elif isinstance(obj, gof.FunctionGraph): results_to_print.extend(obj.outputs) profile_list.extend([None for item in obj.outputs]) order = obj.toposort() elif isinstance(obj, (int, long, float, np.ndarray)): print(obj) elif isinstance(obj, (theano.In, theano.Out)): results_to_print.append(obj.variable) profile_list.append(None) else: raise TypeError("debugprint cannot print an object of this type", obj) scan_ops = [] for r, p in zip(results_to_print, profile_list): # Add the parent scan op to the list as well if (hasattr(r.owner, 'op') and isinstance(r.owner.op, theano.scan_module.scan_op.Scan)): scan_ops.append(r) if p is not None: print(""" Timing Info ----------- --> <time> <% time> - <total time> <% total time>' <time> computation time for this node <% time> fraction of total computation time for this node <total time> time for this node + total times for this node's ancestors <% total time> total time for this node over total computation time N.B.: * Times include the node time and the function overhead. * <total time> and <% total time> may over-count computation times if inputs to a node share a common ancestor and should be viewed as a loose upper bound. Their intended use is to help rule out potential nodes to remove when optimizing a graph because their <total time> is very low. """, file=_file) debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file, order=order, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name, profile=p) if len(scan_ops) > 0: print("", file=_file) new_prefix = ' >' new_prefix_child = ' >' print("Inner graphs of the scan ops:", file=_file) for s in scan_ops: print("", file=_file) debugmode.debugprint(s, depth=depth, done=done, print_type=print_type, file=_file, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name) if hasattr(s.owner.op, 'fn'): # If the op was compiled, print the optimized version. outputs = s.owner.op.fn.maker.fgraph.outputs else: outputs = s.owner.op.outputs for idx, i in enumerate(outputs): if hasattr(i, 'owner') and hasattr(i.owner, 'op'): if isinstance(i.owner.op, theano.scan_module.scan_op.Scan): scan_ops.append(i) debugmode.debugprint(r=i, prefix=new_prefix, depth=depth, done=done, print_type=print_type, file=_file, ids=ids, stop_on_name=stop_on_name, prefix_child=new_prefix_child, scan_ops=scan_ops) if file is _file: return file elif file == 'str': return _file.getvalue() else: _file.flush()
def debugprint(obj, depth=-1, print_type=False, file=None, ids='CHAR', stop_on_name=False): """Print a computation graph as text to stdout or a file. :type obj: Variable, Apply, or Function instance :param obj: symbolic thing to print :type depth: integer :param depth: print graph to this depth (-1 for unlimited) :type print_type: boolean :param print_type: whether to print the type of printed objects :type file: None, 'str', or file-like object :param file: print to this file ('str' means to return a string) :type ids: str :param ids: How do we print the identifier of the variable id - print the python id value int - print integer character CHAR - print capital character "" - don't print an identifier :param stop_on_name: When True, if a node in the graph has a name, we don't print anything below it. :returns: string if `file` == 'str', else file arg Each line printed represents a Variable in the graph. The indentation of lines corresponds to its depth in the symbolic graph. The first part of the text identifies whether it is an input (if a name or type is printed) or the output of some Apply (in which case the Op is printed). The second part of the text is an identifier of the Variable. If print_type is True, we add a part containing the type of the Variable If a Variable is encountered multiple times in the depth-first search, it is only printed recursively the first time. Later, just the Variable identifier is printed. If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's identifier, to indicate which output a line corresponds to. """ if file == 'str': _file = StringIO() elif file is None: _file = sys.stdout else: _file = file done = dict() results_to_print = [] order = [] if isinstance(obj, gof.Variable): results_to_print.append(obj) elif isinstance(obj, gof.Apply): results_to_print.extend(obj.outputs) elif isinstance(obj, Function): results_to_print.extend(obj.maker.fgraph.outputs) order = obj.maker.fgraph.toposort() elif isinstance(obj, (list, tuple)): results_to_print.extend(obj) elif isinstance(obj, gof.FunctionGraph): results_to_print.extend(obj.outputs) order = obj.toposort() else: raise TypeError("debugprint cannot print an object of this type", obj) for r in results_to_print: debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file, order=order, ids=ids, stop_on_name=stop_on_name) if file is _file: return file elif file == 'str': return _file.getvalue() else: _file.flush()
def debugprint(obj, depth=-1, print_type=False, file=None, ids='CHAR', stop_on_name=False, done=None): """Print a computation graph as text to stdout or a file. :type obj: Variable, Apply, or Function instance :param obj: symbolic thing to print :type depth: integer :param depth: print graph to this depth (-1 for unlimited) :type print_type: boolean :param print_type: whether to print the type of printed objects :type file: None, 'str', or file-like object :param file: print to this file ('str' means to return a string) :type ids: str :param ids: How do we print the identifier of the variable id - print the python id value int - print integer character CHAR - print capital character "" - don't print an identifier :param stop_on_name: When True, if a node in the graph has a name, we don't print anything below it. :type done: None or dict :param done: A dict where we store the ids of printed node. Useful to have multiple call to debugprint share the same ids. :returns: string if `file` == 'str', else file arg Each line printed represents a Variable in the graph. The indentation of lines corresponds to its depth in the symbolic graph. The first part of the text identifies whether it is an input (if a name or type is printed) or the output of some Apply (in which case the Op is printed). The second part of the text is an identifier of the Variable. If print_type is True, we add a part containing the type of the Variable If a Variable is encountered multiple times in the depth-first search, it is only printed recursively the first time. Later, just the Variable identifier is printed. If an Apply has multiple outputs, then a '.N' suffix will be appended to the Apply's identifier, to indicate which output a line corresponds to. """ if not isinstance(depth, int): raise Exception("depth parameter must be an int") if file == 'str': _file = StringIO() elif file is None: _file = sys.stdout else: _file = file if done is None: done = dict() results_to_print = [] profile_list = [] order = [] if isinstance(obj, (list, tuple)): lobj = obj else: lobj = [obj] for obj in lobj: if isinstance(obj, gof.Variable): results_to_print.append(obj) profile_list.append(None) elif isinstance(obj, gof.Apply): results_to_print.extend(obj.outputs) profile_list.extend([None for item in obj.outputs]) elif isinstance(obj, Function): results_to_print.extend(obj.maker.fgraph.outputs) profile_list.extend( [obj.profile for item in obj.maker.fgraph.outputs]) order = obj.maker.fgraph.toposort() elif isinstance(obj, gof.FunctionGraph): results_to_print.extend(obj.outputs) profile_list.extend([None for item in obj.outputs]) order = obj.toposort() elif isinstance(obj, (int, long, float, np.ndarray)): print obj elif isinstance(obj, (theano.In, theano.Out)): results_to_print.append(obj.variable) profile_list.append(None) else: raise TypeError("debugprint cannot print an object of this type", obj) scan_ops = [] for r, p in zip(results_to_print, profile_list): # Add the parent scan op to the list as well if (hasattr(r.owner, 'op') and isinstance(r.owner.op, theano.scan_module.scan_op.Scan)): scan_ops.append(r) if p is not None: print >> _file, """ Timing Info ----------- --> <time> <% time> - <total time> <% total time>' <time> computation time for this node <% time> fraction of total computation time for this node <total time> time for this node + total times for this node's ancestors <% total time> total time for this node over total computation time N.B.: * Times include the node time and the function overhead. * <total time> and <% total time> may over-count computation times if inputs to a node share a common ancestor and should be viewed as a loose upper bound. Their intended use is to help rule out potential nodes to remove when optimizing a graph because their <total time> is very low. """ debugmode.debugprint(r, depth=depth, done=done, print_type=print_type, file=_file, order=order, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name, profile=p) if len(scan_ops) > 0: print >> _file, "" new_prefix = ' >' new_prefix_child = ' >' print >> _file, "Inner graphs of the scan ops:" for s in scan_ops: print >> _file, "" debugmode.debugprint(s, depth=depth, done=done, print_type=print_type, file=_file, ids=ids, scan_ops=scan_ops, stop_on_name=stop_on_name) if hasattr(s.owner.op, 'fn'): # If the op was compiled, print the optimized version. outputs = s.owner.op.fn.maker.fgraph.outputs else: outputs = s.owner.op.outputs for idx, i in enumerate(outputs): if hasattr(i, 'owner') and hasattr(i.owner, 'op'): if isinstance(i.owner.op, theano.scan_module.scan_op.Scan): scan_ops.append(i) debugmode.debugprint(r=i, prefix=new_prefix, depth=depth, done=done, print_type=print_type, file=_file, ids=ids, stop_on_name=stop_on_name, prefix_child=new_prefix_child, scan_ops=scan_ops) if file is _file: return file elif file == 'str': return _file.getvalue() else: _file.flush()