# Names pulled in from sibling modules of this package (relative imports).
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
                       GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedSubtensor1,
                        GpuAdvancedIncSubtensor1,
                        GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge

# Module logger, obtained under the name "theano.gpuarray.opt".
# NOTE(review): `logging`, `EquilibriumDB`, `SequenceDB`, `LocalGroupDB` and
# `optdb` are not imported in the visible text; presumably they are imported
# earlier in the file -- confirm.
_logger = logging.getLogger("theano.gpuarray.opt")

# Two EquilibriumDB instances; both are registered into gpu_seqopt below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# SequenceDB that the two databases above are registered into.
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

# Each database is registered under a name, followed by a numeric argument
# (1, then 2 -- presumably the ordering position inside the sequence; confirm)
# and the tags 'fast_compile', 'fast_run' and 'gpuarray'.
# NOTE(review): the first name is spelled 'optimiziations'. It is a runtime
# key, so it is kept exactly as written.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# NOTE(review): the call below is cut off in the visible text; its remaining
# arguments are not shown and it is left exactly as found.
optdb.register('gpuarray_opt', gpu_seqopt,
import stat
import sys

import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano.configparser import config, AddConfigVar, StrParam, BoolParam

import nvcc_compiler

# ignore_newtrees is to speed the optimization as this is the pattern
# we use for optimization. Otherwise, we can iterate 100s of time on
# the graph and apply only a few optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_seqopt = SequenceDB()


def register_opt(*tags, **kwargs):
    """Return a decorator that registers a local optimizer in `gpu_optimizer`.

    Parameters
    ----------
    *tags : str
        Extra tags passed to ``gpu_optimizer.register`` after the fixed
        tags ``'fast_run'``, ``'fast_compile'`` and ``'gpu'``.
    **kwargs
        Only ``name`` is read: the name to register the optimizer under.
        When it is absent or falsy, the optimizer's ``__name__`` is used.

    Returns
    -------
    callable
        A decorator that registers the optimizer it receives and returns
        that optimizer unchanged.

    Raises
    ------
    RuntimeError
        If any element of `tags` is not a ``str``.
    """
    if any(not isinstance(t, str) for t in tags):
        raise RuntimeError("Bad call to register_opt."
                           " All tags must be strings.", tags)

    def f(local_opt):
        # pop() with a default: keyword arguments that do not include
        # 'name' fall back to the optimizer's own name instead of raising
        # KeyError.
        name = kwargs.pop('name', None) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'fast_compile',
                               'gpu', *tags)
        return local_opt

    # Hand the decorator back to the caller; without this the call
    # evaluates to None and cannot be applied with `@register_opt(...)`.
    return f
block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1) pycuda_fct(inputs[0][0], inputs[1][0], z[0], numpy.intc(inputs[1][0].size), block=block, grid=grid) thunk.inputs = inputs thunk.outputs = outputs thunk.lazy = False return thunk pycuda_optimizer = EquilibriumDB() gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run") @local_optimizer([GpuElemwise]) def local_pycuda_gpu_elemwise(node): """ GpuElemwise -> PycudaElemwiseSourceModuleOp """ if isinstance(node.op, GpuElemwise): if (not any([any(i.type.broadcastable) for i in node.inputs]) and all([i.ndim <= 2 for i in node.inputs])): new_op = PycudaElemwiseSourceModuleOp( node.op.scalar_op, node.op.inplace_pattern)(*node.inputs) return [new_op]