Example #1
0
import stat
import sys

import theano
from theano.compat import get_unbound_function
from theano.compile import optdb
from theano.gof import EquilibriumDB, SequenceDB
from theano.gof.cmodule import get_lib_extension
from theano.gof.compilelock import get_lock, release_lock
from theano.configparser import config, AddConfigVar, StrParam, BoolParam
import nvcc_compiler

# Module-level optimizer databases: one EquilibriumDB and one SequenceDB.
#
# ignore_newtrees is passed explicitly to speed up the optimization, as
# this is the pattern we use for optimization. Otherwise, we can iterate
# 100s of times on the graph and apply only a few optimizations each time.
gpu_optimizer = EquilibriumDB(ignore_newtrees=False)
gpu_seqopt = SequenceDB()


def register_opt(*tags, **kwargs):
    if any([not isinstance(t, str) for t in tags]):
        raise RuntimeError(
            "Bad call to register_opt."
            " All tags must be strings.", tags)

    def f(local_opt):
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'fast_compile',
                               'gpu', *tags)
        return local_opt
Example #2
0
    GpuSoftmaxWithBias,
    GpuSoftmax,
)
from .elemwise import GpuElemwise, GpuDimShuffle, GpuCAReduceCuda, GpuCAReduceCPY
from .subtensor import (
    GpuIncSubtensor,
    GpuSubtensor,
    GpuAdvancedSubtensor1,
    GpuAdvancedIncSubtensor1,
    GpuAdvancedIncSubtensor1_dev20,
)
from .opt_util import alpha_merge, output_merge

# Logger for this module.
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")

# Two equilibrium databases, plus the sequence database they are
# registered in below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

gpu_seqopt = SequenceDB()

# Don't register this right now
# (the group database is only created and named here).
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

# Register both equilibrium databases in the sequence database.  The
# numeric argument (1, then 2) is presumably the ordering position --
# confirm against SequenceDB.register.
# NOTE: the name "gpuarray_local_optimiziations" is misspelled, but it is
# a runtime key and is therefore kept unchanged.
gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1, "fast_compile", "fast_run", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_compile", "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The numeric argument is one less than the "add_destroy_handler" entry
# of optdb.__position__ (49.5 is used when that entry is absent); the
# only tag given is "gpuarray".
optdb.register("gpuarray_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpuarray")

Example #3
0
                        HostFromGpu, GpuFromHost,
                        GpuSplit, GpuContiguous,
                        gpu_alloc, GpuAlloc, GpuReshape,
                        GpuEye, gpu_join, GpuJoin)
from .blas import gpu_dot22, GpuGemv, GpuGemm, GpuGer
from .conv import GpuConv
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx,
                   GpuSoftmaxWithBias, GpuSoftmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
                       GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor,
                        GpuAdvancedIncSubtensor1,
                        GpuAdvancedIncSubtensor1_dev20)

# Two equilibrium databases, plus the sequence database they are
# registered in below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

gpu_seqopt = SequenceDB()

# Register both equilibrium databases in the sequence database.  The
# numeric argument (1, then 2) is presumably the ordering position --
# confirm against SequenceDB.register.  Only the first registration
# carries the 'inplace' tag.
# NOTE: the name 'gpuarray_local_optimiziations' is misspelled, but it is
# a runtime key and is therefore kept unchanged.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The numeric argument is one less than the 'add_destroy_handler' entry
# of optdb.__position__ (49.5 is used when that entry is absent); the
# only tag given is 'gpuarray'.
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')

Example #4
0
                                               gpu_alloc,
                                               gpu_shape,
                                               GpuAlloc,
                                               GpuShape,
                                               GpuReshape,
                                               GpuEye)
from theano.sandbox.gpuarray.blas import gpu_dot22, GpuGemv, GpuGemm
from theano.sandbox.gpuarray.nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                                          GpuCrossentropySoftmax1HotWithBiasDx)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant

# Two equilibrium databases, plus the sequence database they are
# registered in below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

gpu_seqopt = SequenceDB()

# Register both equilibrium databases in the sequence database.  The
# numeric argument (1, then 2) is presumably the ordering position --
# confirm against SequenceDB.register.  Only the first registration
# carries the 'inplace' tag, and neither carries 'fast_compile'.
# NOTE: the name 'gpuarray_local_optimiziations' is misspelled, but it is
# a runtime key and is therefore kept unchanged.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The numeric argument is one less than the 'add_destroy_handler' entry
# of optdb.__position__ (49.5 is used when that entry is absent); the
# only tag given is 'gpuarray'.
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
Example #5
0
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            pycuda_fct(inputs[0][0],
                       inputs[1][0],
                       z[0],
                       numpy.intc(inputs[1][0].size),
                       block=block,
                       grid=grid)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk


# Equilibrium database for the PyCUDA optimizations, registered in
# gpu_seqopt under the name "pycuda_optimizer" with the "fast_run" tag.
# The 1.5 argument is presumably the ordering position -- confirm
# against the register method of gpu_seqopt.
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    The substitution is only proposed when the node's op is a
    GpuElemwise, no input has a broadcastable dimension, and every
    input has at most two dimensions.  In that case a one-element list
    with the result of applying the new op to the node's inputs is
    returned; in every other case the function returns None.
    """
    if not isinstance(node.op, GpuElemwise):
        return None

    has_broadcast = any([any(inp.type.broadcastable) for inp in node.inputs])
    if has_broadcast:
        return None

    at_most_2d = all([inp.ndim <= 2 for inp in node.inputs])
    if not at_most_2d:
        return None

    pycuda_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                             node.op.inplace_pattern)
    return [pycuda_op(*node.inputs)]
Example #6
0
from .blas import (gpu_dot22, GpuGemv, GpuGemm, GpuGer, GpuGemmBatch,
                   gpugemm_no_inplace, gpugemmbatch_no_inplace)
from .blocksparse import GpuSparseBlockGemv, GpuSparseBlockOuter
from .nnet import (GpuCrossentropySoftmaxArgmax1HotWithBias,
                   GpuCrossentropySoftmax1HotWithBiasDx, GpuSoftmaxWithBias,
                   GpuSoftmax)
from .elemwise import (GpuElemwise, GpuDimShuffle, GpuCAReduceCuda,
                       GpuCAReduceCPY)
from .subtensor import (GpuIncSubtensor, GpuSubtensor, GpuAdvancedSubtensor1,
                        GpuAdvancedIncSubtensor1,
                        GpuAdvancedIncSubtensor1_dev20)
from .opt_util import alpha_merge, output_merge

# Logger for this module.
_logger = logging.getLogger("theano.gpuarray.opt")

# Two equilibrium databases, plus the sequence database they are
# registered in below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

gpu_seqopt = SequenceDB()

# Don't register this right now
# (the group database is only created and named here).
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

# Register both equilibrium databases in the sequence database.  The
# numeric argument (1, then 2) is presumably the ordering position --
# confirm against SequenceDB.register.
# NOTE: the name 'gpuarray_local_optimiziations' is misspelled, but it is
# a runtime key and is therefore kept unchanged.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
optdb.register('gpuarray_opt', gpu_seqopt,
Example #7
0
from theano.compile import optdb
from theano.gof import (local_optimizer, EquilibriumDB, SequenceDB, ProxyDB,
                        Optimizer, toolbox, DestroyHandler, InconsistencyError,
                        EquilibriumOptimizer)

from theano.gof.python25 import all, any
from theano.sandbox.gpuarray.type import GpuArrayType

from theano.sandbox.gpuarray.basic_ops import (host_from_gpu, gpu_from_host,
                                               gpu_alloc, GpuReshape)
from theano.sandbox.gpuarray.elemwise import (GpuElemwise, _is_scalar,
                                              GpuDimShuffle, GpuCAReduce)
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm

# Two equilibrium databases, plus the sequence database they are
# registered in below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

gpu_seqopt = SequenceDB()

# Register both equilibrium databases in the sequence database.  The
# numeric argument (1, then 2) is presumably the ordering position --
# confirm against SequenceDB.register.  Only the first registration
# carries the 'inplace' tag, and neither carries 'fast_compile'.
# NOTE: the name 'gpuarray_local_optimiziations' is misspelled, but it is
# a runtime key and is therefore kept unchanged.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_run',
                    'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The numeric argument is one less than the 'add_destroy_handler' entry
# of optdb.__position__ (49.5 is used when that entry is absent); the
# only tag given is 'gpuarray'.
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')

Example #8
0
File: opt.py Project: jlowin/Theano
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx,
    GpuSoftmaxWithBias,
    GpuSoftmax,
)
from theano.sandbox.gpuarray.elemwise import GpuElemwise, _is_scalar, GpuDimShuffle, GpuCAReduceCuda
from theano.sandbox.gpuarray.subtensor import (
    GpuIncSubtensor,
    GpuSubtensor,
    GpuAdvancedIncSubtensor1,
    GpuAdvancedIncSubtensor1_dev20,
)
from theano.sandbox.gpuarray.type import GpuArrayConstant

# Two equilibrium databases, plus the sequence database they are
# registered in below.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

gpu_seqopt = SequenceDB()

# Register both equilibrium databases in the sequence database.  The
# numeric argument (1, then 2) is presumably the ordering position --
# confirm against SequenceDB.register.  Only the first registration
# carries the "inplace" tag, and neither carries "fast_compile".
# NOTE: the name "gpuarray_local_optimiziations" is misspelled, but it is
# a runtime key and is therefore kept unchanged.
gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1, "fast_run", "inplace", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The numeric argument is one less than the "add_destroy_handler" entry
# of optdb.__position__ (49.5 is used when that entry is absent); the
# only tag given is "gpuarray".
optdb.register("gpuarray_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpuarray")


def register_opt(*tags, **kwargs):
    def f(local_opt):
        name = (kwargs and kwargs.pop("name")) or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt
Example #9
0
            ", ".join([var.type.dtype_specs()[1]+" *"+name for var,name in zip(inputs,in_name) + zip(out_node.outputs,out_name)]),
            c_code,
            "pycuda_elemwise_kernel_%s"%str(self.scalar_op),
            preamble="""#include<Python.h>
#include <numpy/arrayobject.h>""")
        return out_node

    def perform(self, node, inputs, out):
        """
        Call ``self.pycuda_fct`` on the inputs and the output buffer.

        ``out`` must contain exactly one output storage cell.  If that
        cell is empty, or what it holds has a shape different from the
        first input's shape, a new buffer of the first input's shape is
        created with ``CudaNdarray.zeros`` and stored in the cell.
        ``self.pycuda_fct`` is then called with the inputs followed by
        the contents of the cell.  ``node`` is not used.
        """
        # TODO assert all input have the same shape
        (storage,) = out

        # Reuse the existing buffer only when it is present and already
        # has the right shape.
        needs_alloc = storage[0] is None
        if not needs_alloc:
            needs_alloc = storage[0].shape != inputs[0].shape
        if needs_alloc:
            storage[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)

        call_args = inputs + storage
        self.pycuda_fct(*call_args)

# Equilibrium database for the PyCUDA optimizations, registered in
# gpu_seqopt under the name "pycuda_optimizer" with the "fast_run" tag.
# The 1.5 argument is presumably the ordering position -- confirm
# against the register method of gpu_seqopt.
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")

@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    Returns a one-element list with the result of applying a
    PycudaElemwiseSourceModuleOp to the node's inputs when the node's
    op is a GpuElemwise, none of the inputs has a broadcastable
    dimension, and all inputs have at most two dimensions.  Returns
    None in every other case.
    """
    if not isinstance(node.op, GpuElemwise):
        return None

    any_broadcastable = any([any(var.type.broadcastable)
                             for var in node.inputs])
    if any_broadcastable:
        return None

    small_enough = all([var.ndim <= 2 for var in node.inputs])
    if not small_enough:
        return None

    replacement_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                                  node.op.inplace_pattern)
    return [replacement_op(*node.inputs)]

# Add the local optimizer defined above to the pycuda_optimizer database
# under its own function name; no tags are passed.
pycuda_optimizer.register("local_pycuda_gpu_elemwise", local_pycuda_gpu_elemwise)

@local_optimizer([])
Example #10
0
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                       numpy.intc(inputs[1][0].size), block=block,
                       grid=grid)
        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk


# Equilibrium database for the PyCUDA optimizations, registered in
# gpu_seqopt under the name "pycuda_optimizer" with the "fast_run" tag.
# The 1.5 argument is presumably the ordering position -- confirm
# against the register method of gpu_seqopt.
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([GpuElemwise])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    A replacement is produced only for nodes whose op is a GpuElemwise,
    whose inputs have no broadcastable dimension, and whose inputs all
    have at most two dimensions.  The replacement is returned as a
    one-element list; otherwise None is returned.
    """
    if not isinstance(node.op, GpuElemwise):
        return None

    broadcast_found = any([any(arg.type.broadcastable)
                           for arg in node.inputs])
    if broadcast_found:
        return None

    all_low_rank = all([arg.ndim <= 2 for arg in node.inputs])
    if not all_low_rank:
        return None

    source_module_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                                    node.op.inplace_pattern)
    return [source_module_op(*node.inputs)]
Example #11
0
                grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
                block = (512, 1, 1)
            else:
                grid = (1, 1)
                block = (inputs[0][0].shape[0], inputs[0][0].shape[1], 1)
            out = pycuda_fct(inputs[0][0], inputs[1][0], z[0],
                             numpy.intc(inputs[1][0].size), block=block,
                             grid=grid)
        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk


# Equilibrium database for the PyCUDA optimizations, registered in
# gpu_seqopt under the name "pycuda_optimizer" with the "fast_run" tag.
# The 1.5 argument is presumably the ordering position -- confirm
# against the register method of gpu_seqopt.
pycuda_optimizer = EquilibriumDB()
gpu_seqopt.register("pycuda_optimizer", pycuda_optimizer, 1.5, "fast_run")


@local_optimizer([])
def local_pycuda_gpu_elemwise(node):
    """
    GpuElemwise -> PycudaElemwiseSourceModuleOp

    When the node's op is a GpuElemwise, no input has a broadcastable
    dimension, and every input has at most two dimensions, return a
    one-element list with the result of applying a
    PycudaElemwiseSourceModuleOp (built from the op's scalar_op and
    inplace_pattern) to the node's inputs.  Otherwise return None.
    """
    if not isinstance(node.op, GpuElemwise):
        return None

    uses_broadcast = any([any(operand.type.broadcastable)
                          for operand in node.inputs])
    if uses_broadcast:
        return None

    within_2d = all([operand.ndim <= 2 for operand in node.inputs])
    if not within_2d:
        return None

    elemwise_op = PycudaElemwiseSourceModuleOp(node.op.scalar_op,
                                               node.op.inplace_pattern)
    return [elemwise_op(*node.inputs)]