Esempio n. 1
0
    def __init__(self, pool_shape, inplace, BCHW_grad_output):
        """
        Store the op configuration and, for a non-inplace op, register the
        substitution by its inplace counterpart (once per pool shape).

        :param pool_shape: sequence of two positive entries; stored as a tuple
        :param inplace: if true, destroy_map is set to {0: [0]}
        :param BCHW_grad_output: may only be true when `inplace` is true
        """
        pool_shape = tuple(pool_shape)
        super(PoolHWBCOpGrad, self).__init__()
        n_entries = len(pool_shape)
        assert n_entries == 2, n_entries
        for extent in pool_shape:
            assert extent > 0, extent
        if BCHW_grad_output:
            assert inplace
        self.pool_shape = pool_shape
        self.inplace = inplace
        self.BCHW_grad_output = BCHW_grad_output

        if inplace:
            self.destroy_map = {0: [0]}
            return

        # Non-inplace op: register the inplace substitution for this
        # pool_shape. The list kept on optdb remembers which shapes were
        # already handled so that each shape is registered only once.
        registry_attr = 'PoolHWBCOpGradInplaceOpt_registered'
        if not hasattr(optdb, registry_attr):
            setattr(optdb, registry_attr, [])
        seen_shapes = getattr(optdb, registry_attr)
        if pool_shape in seen_shapes:
            return

        inplace_twin = PoolHWBCOpGrad(self.pool_shape,
                                      inplace=True,
                                      BCHW_grad_output=False)
        substitution = OpSub(self, inplace_twin)
        seen_shapes.append(pool_shape)
        topo_opt = theano.gof.TopoOptimizer(
            substitution,
            failure_callback=gof.TopoOptimizer.warn_inplace)
        optdb.register('PoolHWBCOpGradInplaceOpt' + str(pool_shape),
                       topo_opt, 50.0, 'fast_run', 'inplace', 'gpuarray')
Esempio n. 2
0
 def f(local_opt):
     """
     Register `local_opt` in `optdb`, wrapped in a TopoOptimizer, and return it.

     The registration name is the 'name' entry of the enclosing `kwargs`
     when one is given (and truthy), otherwise `local_opt.__name__`.
     """
     # pop() with a default: keyword arguments without a 'name' entry used to
     # raise KeyError here instead of falling back to local_opt.__name__.
     name = (kwargs and kwargs.pop('name', None)) or local_opt.__name__
     inplace_opt = TopoOptimizer(
         local_opt, failure_callback=TopoOptimizer.warn_inplace)
     optdb.register(
         name, inplace_opt, 60, 'fast_run', 'inplace', 'gpu', *tags)
     return local_opt
Esempio n. 3
0
 def f(local_opt):
     """
     Register `local_opt` in `optdb`, wrapped in a TopoOptimizer, and return it.

     The registration name is the 'name' entry of the enclosing `kwargs`
     when one is given (and truthy), otherwise `local_opt.__name__`.
     """
     # pop() with a default: keyword arguments without a 'name' entry used to
     # raise KeyError here instead of falling back to local_opt.__name__.
     name = (kwargs and kwargs.pop('name', None)) or local_opt.__name__
     inplace_opt = TopoOptimizer(
         local_opt, failure_callback=TopoOptimizer.warn_inplace)
     optdb.register(
         name, inplace_opt, 60, 'fast_run', 'inplace', 'gpuarray', *tags)
     return local_opt
Esempio n. 4
0
 def f(local_opt):
     """
     Register `local_opt` in `optdb`, wrapped in a TopoOptimizer, and return it.

     The registration name is the "name" entry of the enclosing `kwargs`
     when one is given (and truthy), otherwise `local_opt.__name__`.
     """
     # pop() with a default: keyword arguments without a "name" entry used to
     # raise KeyError here instead of falling back to local_opt.__name__.
     name = (kwargs and kwargs.pop("name", None)) or local_opt.__name__
     inplace_opt = TopoOptimizer(local_opt,
                                 failure_callback=TopoOptimizer.warn_inplace)
     optdb.register(
         name,
         inplace_opt,
         60,
         "fast_run",
         "inplace",
         "gpuarray",
         *tags
     )
     return local_opt
Esempio n. 5
0
def register_func(recurrent_transform):
    """
    Return the non-inplace LSTMCustomOp for `recurrent_transform`.

    On the first call for a given (name, id) key this creates the forward and
    gradient ops (non-inplace and inplace variants), caches the non-inplace
    ones in `function_ops` / `grad_ops` and registers the inplace
    substitutions in optdb. Later calls return the cached forward op.

    :type recurrent_transform: RecurrentTransform.RecurrentTransformBase
    """
    fn = recurrent_transform.name
    transform_id = id(recurrent_transform)
    key = (fn, transform_id)
    if key in function_ops:
        return function_ops[key]

    def _create_and_register(op_cls, cache, attr_template):
        # Build both variants, cache the non-inplace one, then register the
        # substitution unless optdb already carries the marker attribute
        # (hack to avoid registering twice).
        plain = op_cls(fun_name=fn,
                       inplace=False,
                       recurrent_transform=recurrent_transform)
        destructive = op_cls(fun_name=fn,
                             inplace=True,
                             recurrent_transform=recurrent_transform)
        cache[key] = plain
        marker = attr_template % (fn, transform_id)
        if hasattr(optdb, marker):
            return
        substitution = OpSub(plain, destructive)
        optdb.register(marker, theano.gof.TopoOptimizer(substitution), 50.0,
                       'fast_run', 'inplace', 'gpuarray')
        setattr(optdb, marker, True)

    # forward op
    _create_and_register(LSTMCustomOp, function_ops,
                         'LSTMCustomMOpInplaceOpt_%s_%i')
    # the same for grad
    _create_and_register(LSTMCustomOpGrad, grad_ops,
                         'LSTMCustomMOpGradInplaceOpt_%s_%i')

    return function_ops[key]
Esempio n. 6
0
  def __init__(self, pool_shape, inplace, BCHW_grad_output):
    """
    Validate and store the configuration. A non-inplace instance also
    registers, once per pool shape, an optimizer replacing it by the
    inplace variant.

    :param pool_shape: sequence of two positive entries; stored as a tuple
    :param inplace: if true, destroy_map is set to {0: [0]}
    :param BCHW_grad_output: may only be true when `inplace` is true
    """
    pool_shape = tuple(pool_shape)
    super(PoolHWBCOpGrad, self).__init__()
    assert len(pool_shape) == 2, len(pool_shape)
    for idx in (0, 1):
      assert pool_shape[idx] > 0, pool_shape[idx]
    if BCHW_grad_output:
      assert inplace
    self.pool_shape = pool_shape
    self.inplace = inplace
    self.BCHW_grad_output = BCHW_grad_output

    if inplace:
      self.destroy_map = {0: [0]}
      return

    # register optimization for this pool_shape (at most once per shape;
    # the shapes already handled are remembered in a list stored on optdb)
    registry_attr = 'PoolHWBCOpGradInplaceOpt_registered'
    if not hasattr(optdb, registry_attr):
      setattr(optdb, registry_attr, [])
    known_shapes = getattr(optdb, registry_attr)
    if pool_shape not in known_shapes:
      inplace_variant = PoolHWBCOpGrad(self.pool_shape, inplace=True, BCHW_grad_output=False)
      substitution = OpSub(self, inplace_variant)
      known_shapes.append(pool_shape)
      opt_name = 'PoolHWBCOpGradInplaceOpt' + str(pool_shape)
      topo_opt = theano.gof.TopoOptimizer(substitution, failure_callback=gof.TopoOptimizer.warn_inplace)
      optdb.register(opt_name, topo_opt, 50.0, 'fast_run', 'inplace', 'gpuarray')
Esempio n. 7
0
def register_func(recurrent_transform):
  """
  Return the non-inplace LSTMCustomOp for `recurrent_transform`, creating and
  caching it (together with its grad op) and registering the inplace
  substitutions in optdb on first use.

  :type recurrent_transform: RecurrentTransform.RecurrentTransformBase
  """
  fn = recurrent_transform.name
  key = (fn, id(recurrent_transform))
  if key in function_ops:
    return function_ops[key]

  # forward op: non-inplace variant is cached, inplace variant is the
  # substitution target
  fwd_plain = LSTMCustomOp(fun_name=fn, inplace=False, recurrent_transform=recurrent_transform)
  fwd_inplace = LSTMCustomOp(fun_name=fn, inplace=True, recurrent_transform=recurrent_transform)
  function_ops[key] = fwd_plain

  # marker attribute on optdb: hack to avoid registering twice
  fwd_marker = 'LSTMCustomMOpInplaceOpt_%s_%i' % (fn, id(recurrent_transform))
  if not hasattr(optdb, fwd_marker):
    fwd_opt = theano.gof.TopoOptimizer(OpSub(fwd_plain, fwd_inplace))
    optdb.register(fwd_marker, fwd_opt, 50.0, 'fast_run', 'inplace', 'gpuarray')
    setattr(optdb, fwd_marker, True)

  # gradient op: same procedure
  grad_plain = LSTMCustomOpGrad(fun_name=fn, inplace=False, recurrent_transform=recurrent_transform)
  grad_inplace = LSTMCustomOpGrad(fun_name=fn, inplace=True, recurrent_transform=recurrent_transform)
  grad_ops[key] = grad_plain

  grad_marker = 'LSTMCustomMOpGradInplaceOpt_%s_%i' % (fn, id(recurrent_transform))
  if not hasattr(optdb, grad_marker):
    grad_opt = theano.gof.TopoOptimizer(OpSub(grad_plain, grad_inplace))
    optdb.register(grad_marker, grad_opt, 50.0, 'fast_run', 'inplace', 'gpuarray')
    setattr(optdb, grad_marker, True)

  return function_ops[key]
Esempio n. 8
0
            # we wont need this copy anymore
            output[0] = variable.copy()


@gof.local_optimizer([OpFromGraph])
def inline_ofg_expansion(node):
    """
    Expand the internal graph of an OpFromGraph node.

    Applies only when node.op is an OpFromGraph with is_inline == True;
    otherwise False is returned. The replacement is a clone of the op's
    local outputs in which each local input is substituted by the matching
    outer input of `node`.
    Doing so can improve optimization at the cost of compilation speed.
    """
    ofg = node.op
    if not (isinstance(ofg, OpFromGraph) and ofg.is_inline):
        return False
    inner_outputs = ofg.local_outputs
    substitutions = dict(izip(ofg.local_inputs, node.inputs))
    return theano.clone(inner_outputs, substitutions)


# We want to run this before the first merge optimizer
# and before the first scan optimizer.
# NOTE(review): the third argument (-0.01) is presumably the position inside
# optdb that achieves this ordering -- confirm against those passes.
optdb.register('inline_ofg_expansion', gof.opt.in2out(inline_ofg_expansion),
               -0.01, 'fast_compile', 'fast_run')

# Since OpFromGraph contains a Theano compiled function,
# we should let DebugMode know about it.
# NOTE(review): 'fn' looks like the name of the attribute holding that
# compiled function -- confirm against OpFromGraph.
ops_with_inner_function[OpFromGraph] = 'fn'
Esempio n. 9
0
                                                                inplace=False)
CuDNNConvHWBCOpGradValidInplaceInstance = CuDNNConvHWBCOpGrad("valid",
                                                              inplace=True)
CuDNNConvHWBCOpGradFullNoInplaceInstance = CuDNNConvHWBCOpGrad("full",
                                                               inplace=False)
CuDNNConvHWBCOpGradFullInplaceInstance = CuDNNConvHWBCOpGrad("full",
                                                             inplace=True)

# Substitution of the non-inplace "valid" grad op instance by the inplace one.
CuDNNConvHWBCOpGradValidInplaceOpt = OpSub(
    CuDNNConvHWBCOpGradValidNoInplaceInstance,
    CuDNNConvHWBCOpGradValidInplaceInstance)
# Hack to avoid registering twice: a marker attribute is set on optdb after
# the first registration and checked before registering.
if not hasattr(optdb, 'CuDNNConvHWBCOpGradValidInplaceOpt_registered'):
    optdb.register(
        'CuDNNConvHWBCOpGradValidInplaceOpt',
        theano.gof.TopoOptimizer(
            CuDNNConvHWBCOpGradValidInplaceOpt,
            failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run',
        'inplace', 'gpuarray')
    optdb.CuDNNConvHWBCOpGradValidInplaceOpt_registered = True

#TODO: maybe this optimization causes problems
#CuDNNConvHWBCOpGradFullInplaceOpt = OpSub(CuDNNConvHWBCOpGradFullNoInplaceInstance, CuDNNConvHWBCOpGradFullInplaceInstance)
##hack to avoid being called twice
#if not hasattr(optdb, 'CuDNNConvHWBCOpGradFullInplaceOpt_registered'):
#  optdb.register('CuDNNConvHWBCOpGradFullInplaceOpt',
#                 theano.gof.TopoOptimizer(CuDNNConvHWBCOpGradFullInplaceOpt, failure_callback=gof.TopoOptimizer.warn_inplace),
#                 50.0, 'fast_run', 'inplace', 'gpuarray')
#  optdb.CuDNNConvHWBCOpGradFullInplaceOpt_registered = True

#------------------------------------------------------
  def infer_shape(self, node, input_shapes):
    """Return a 1-tuple containing the first entry of `input_shapes`."""
    first_input_shape = input_shapes[0]
    return (first_input_shape,)

  #def c_code_cache_version(self):
  #  return 1, 0


# Module-level op instances. The first argument is -1e20 or 0.0, the second
# False or True; judging by the instance names the second argument is the
# inplace flag -- confirm against CropToBatchImageSizeOp.__init__.
CropToBatchImageSizeInstance = CropToBatchImageSizeOp(-1e20, False)
CropToBatchImageSizeInplaceInstance = CropToBatchImageSizeOp(-1e20, True)
CropToBatchImageSizeZeroInstance = CropToBatchImageSizeOp(0.0, False)
CropToBatchImageSizeZeroInplaceInstance = CropToBatchImageSizeOp(0.0, True)


# Substitution for the -1e20 instances: non-inplace -> inplace.
CropToBatchImageSizeGradInplaceOpt1 = OpSub(CropToBatchImageSizeInstance, CropToBatchImageSizeInplaceInstance)
# Hack to avoid registering twice: a marker attribute on optdb is checked
# before, and set after, the registration.
if not hasattr(optdb, 'CropToBatchImageSizeGradInplaceOpt1_registered'):
  optdb.register('CropToBatchImageSizeGradInplaceOpt1',
                 theano.gof.TopoOptimizer(CropToBatchImageSizeGradInplaceOpt1,
                                          failure_callback=gof.TopoOptimizer.warn_inplace),
                 50.0, 'fast_run', 'inplace', 'gpuarray')
  optdb.CropToBatchImageSizeGradInplaceOpt1_registered = True

# Substitution for the 0.0 instances: non-inplace -> inplace.
CropToBatchImageSizeGradInplaceOpt2 = OpSub(CropToBatchImageSizeZeroInstance, CropToBatchImageSizeZeroInplaceInstance)
# Same register-once hack as above, with its own marker attribute.
if not hasattr(optdb, 'CropToBatchImageSizeGradInplaceOpt2_registered'):
  optdb.register('CropToBatchImageSizeGradInplaceOpt2',
                 theano.gof.TopoOptimizer(CropToBatchImageSizeGradInplaceOpt2,
                                          failure_callback=gof.TopoOptimizer.warn_inplace),
                 50.0, 'fast_run', 'inplace', 'gpuarray')
  optdb.CropToBatchImageSizeGradInplaceOpt2_registered = True
Esempio n. 11
0
        grad_op = grad_op.__class__(**kwargs)
    else:
        old_grad_op_input0 = grad_op_v.owner.inputs[0]
        sum_inputs = [old_grad_op_input0] + sum_inputs
    assert len(sum_inputs) > 0
    if len(sum_inputs) == 1:
        new_grad_op_input0 = sum_inputs[0]
    else:
        new_grad_op_input0 = T.add(*sum_inputs)
    new_grad_op_inputs = [new_grad_op_input0] + grad_op_v.owner.inputs[1:]
    new_v = grad_op(*new_grad_op_inputs)
    return [new_v]


# Register the add-merge optimization as a TopoOptimizer under 'fast_run'.
# NOTE(review): 0.1 is presumably its position in optdb -- confirm.
optdb.register('add_merge_MultiBatchBeamGradAddOp',
               gof.TopoOptimizer(add_merge_MultiBatchBeamGradAddOp), 0.1,
               'fast_run')


@gof.local_optimizer([MultiBatchBeamGradAddOp], inplace=True)
def inplace_MultiBatchBeamGradAddOp(node):
    """
    Replace a non-inplace MultiBatchBeamGradAddOp by an inplace copy.

    The new op is built from the old op's __props__ with "inplace" forced to
    True. Returns False when node.op is not a MultiBatchBeamGradAddOp, is
    already inplace, or has zero_with_shape set.
    """
    op = node.op
    if not isinstance(op, MultiBatchBeamGradAddOp):
        return False
    if op.inplace or op.zero_with_shape:
        return False
    props = dict((prop, getattr(op, prop)) for prop in op.__props__)
    props["inplace"] = True
    inplace_op = op.__class__(**props)
    return [inplace_op(*node.inputs)]

Esempio n. 12
0
# Add local_conv2d_gradinputs_cpu to conv_groupopt, tagged for both
# 'fast_compile' and 'fast_run'.
# NOTE(review): 40 is presumably its position/priority -- confirm.
conv_groupopt.register('local_conv2d_gradinputs_cpu',
                       local_conv2d_gradinputs_cpu, 40,
                       'fast_compile', 'fast_run')


# Verify that no AbstractConv are present in the graph
@local_optimizer([AbstractConv2d,
                  AbstractConv2d_gradWeights,
                  AbstractConv2d_gradInputs])
def local_abstractconv_check(node):
    """
    Raise AssertionError if node.op is still one of the abstract conv ops.

    The classes are tested in the order AbstractConv2d,
    AbstractConv2d_gradWeights, AbstractConv2d_gradInputs; the first match
    selects the error message. Nothing is returned for any other op.
    """
    failure_messages = (
        (AbstractConv2d,
         'AbstractConv2d theano optimization failed. '
         'Did you exclude both "conv_dnn" and "conv_gemm" from '
         'the optimizer? Is cudnn available and does the GPU support it?'),
        (AbstractConv2d_gradWeights,
         'AbstractConv2d_gradWeights theano optimization failed. '
         'Did you exclude both "conv_dnn" and "conv_gemm" from '
         'the optimizer? Is cudnn available and does the GPU support it?'),
        (AbstractConv2d_gradInputs,
         'AbstractConv2d_gradInputs theano optimization failed. '
         'Did you exclude both "conv_dnn" and "conv_gemm" from '
         'the optimizer? Is cudnn available and does the GPU support it?'),
    )
    for abstract_cls, message in failure_messages:
        if isinstance(node.op, abstract_cls):
            raise AssertionError(message)

# Register local_abstractconv_check as an in2out pass for both
# 'fast_compile' and 'fast_run'.
# NOTE: the registration key is spelled 'AbstracConvCheck' (no second 't'),
# unlike the optimizer name "AbstractConvCheck". It is a runtime identifier,
# so check for users of the key before correcting the spelling.
optdb.register('AbstracConvCheck',
               opt.in2out(local_abstractconv_check,
                          name="AbstractConvCheck"),
               48.7, 'fast_compile', 'fast_run')
Esempio n. 13
0

@gof.local_optimizer([RandomFunction])
def random_make_inplace(node):
    """
    Replace a non-inplace RandomFunction by an inplace one.

    Returns the outputs of the new node, or False when node.op is not a
    RandomFunction or is already inplace.
    """
    op = node.op
    if not isinstance(op, RandomFunction):
        return False
    if op.inplace:
        return False
    # Take fn from op._props() rather than from op.fn, since op.fn
    # may not be picklable.
    fn, outtype, _old_inplace, ndim_added = op._props()
    inplace_op = RandomFunction(fn, outtype, inplace=True,
                                ndim_added=ndim_added)
    inplace_node = inplace_op.make_node(*node.inputs)
    return inplace_node.outputs

# Register random_make_inplace as an in2out pass (ignore_newtrees=True)
# under the 'fast_run' and 'inplace' tags.
# NOTE(review): 99 is presumably a late position in optdb -- confirm.
optdb.register('random_make_inplace', opt.in2out(random_make_inplace,
                                                 ignore_newtrees=True),
               99, 'fast_run', 'inplace')


class RandomStreamsBase(object):

    def binomial(self, size=None, n=1, p=0.5, ndim=None, dtype='int64',
                 prob=None):
        """
        Sample n times with probability of success p for each trial and
        return the number of successes.

        If the size argument is ambiguous on the number of dimensions,
        ndim may be a plain integer to supplement the missing information.

        """
Esempio n. 14
0
gpu_dot22 = GpuDot22()

from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out


@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
    """Apply gpugemv_inplace to the inputs of a gpugemv_no_inplace node."""
    if not (node.op == gpugemv_no_inplace):
        return None
    replacement = gpugemv_inplace(*node.inputs)
    return [replacement]


@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """Apply gpugemm_inplace to the inputs of a gpugemm_no_inplace node."""
    if not (node.op == gpugemm_no_inplace):
        return None
    replacement = gpugemm_inplace(*node.inputs)
    return [replacement]


@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
    """Apply gpuger_inplace to the inputs of a gpuger_no_inplace node."""
    if not (node.op == gpuger_no_inplace):
        return None
    replacement = gpuger_inplace(*node.inputs)
    return [replacement]

# Bundle the three inplace substitutions (gemv, gemm, ger) into one
# LocalOptGroup, wrapped as an in2out pass.
gpuablas_opt_inplace = in2out(LocalOptGroup(
        local_inplace_gpuagemv, local_inplace_gpuagemm, local_inplace_gpuager),
                              name='gpuablas_opt_inplace')
# NOTE(review): 70.0 is presumably the position in optdb -- confirm.
optdb.register('InplaceGpuaBlasOpt',
               gpuablas_opt_inplace,
               70.0, 'fast_run', 'inplace', 'gpuarray')
Esempio n. 15
0

@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
    """Apply a fresh Gemm16 (same relu setting as node.op) to `inputs`."""
    rebuilt_op = Gemm16(relu=node.op.relu)
    return [rebuilt_op(*inputs)]


@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
    """Apply a fresh Gemm16 (same relu setting as node.op) to `inputs`."""
    rebuilt_op = Gemm16(relu=node.op.relu)
    return [rebuilt_op(*inputs)]


@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """
    Replace a non-inplace Gemm16 (exact type match) by its inplace version.

    If input 0 comes from a GpuAllocEmpty and has more than one client, that
    allocation is re-applied to its own inputs and the result is used as
    input 0 of the new op. Returns None when the node does not qualify.
    """
    op = node.op
    if type(op) != Gemm16:
        return
    if op.inplace:
        return
    new_inputs = list(node.inputs)
    target = new_inputs[0]
    producer = target.owner
    shared_alloc = (producer and
                    isinstance(producer.op, GpuAllocEmpty) and
                    len(target.clients) > 1)
    if shared_alloc:
        new_inputs[0] = producer.op(*producer.inputs)
    inplace_op = Gemm16(relu=op.relu, inplace=True)
    return [inplace_op(*new_inputs)]

# Register local_gemm16_inplace as an in2out pass under the 'fast_run',
# 'inplace' and 'gpuarray' tags.
# NOTE(review): 70.0 is presumably the position in optdb -- confirm.
optdb.register('local_gemm16_inplace',
               tensor.opt.in2out(local_gemm16_inplace,
                                 name='local_gemm16_inplace'),
               70.0, 'fast_run', 'inplace', 'gpuarray')
Esempio n. 16
0
@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Apply an inplace GpuDnnConvGradW (same algo as node.op) to `inputs`."""
    inplace_op = GpuDnnConvGradW(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]


@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Apply an inplace GpuDnnConvGradI (same algo as node.op) to `inputs`."""
    inplace_op = GpuDnnConvGradI(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]


# Register the three cuDNN inplace substitutions (conv, grad-weights,
# grad-inputs) as a single in2out pass, tagged "fast_run", "inplace",
# "gpuarray" and "cudnn".
# NOTE(review): 70.0 is presumably the position in optdb -- confirm.
optdb.register(
    "local_dnna_conv_inplace",
    tensor.opt.in2out(
        local_dnn_conv_inplace, local_dnn_convgw_inplace, local_dnn_convgi_inplace, name="local_dnna_conv_inplace"
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)


@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Apply a fresh GpuDnnConv (same algo as node.op) to `inputs`."""
    rebuilt_op = GpuDnnConv(algo=node.op.algo)
    return [rebuilt_op(*inputs)]


@register_opt("cudnn")
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
Esempio n. 17
0
    }

    """ % locals()

  #!!! change this when changing the code!
  def c_code_cache_version(self):
    """Return the cache version tuple (1, 5); bump it whenever the code changes."""
    version = (1, 5)
    return version

# Module-level instances of the gradient op: non-inplace and inplace.
LSTMOpGradNoInplaceInstance = LSTMOpGrad(inplace=False)
LSTMOpGradInplaceInstance = LSTMOpGrad(inplace=True)

# Substitution of the non-inplace instance by the inplace one.
LSTMOpGradInplaceOpt = OpSub(LSTMOpGradNoInplaceInstance, LSTMOpGradInplaceInstance)

# Hack to avoid registering twice: a marker attribute on optdb is checked
# before, and set after, the registration.
if not hasattr(optdb, 'LSTMOpGradInplaceOpt_registered'):
  optdb.register('LSTMOpGradInplaceOpt', theano.gof.TopoOptimizer(LSTMOpGradInplaceOpt),
                 50.0, 'fast_run', 'inplace', 'gpuarray')
  optdb.LSTMOpGradInplaceOpt_registered = True


#------------------------

class LSTMOp(theano.sandbox.cuda.GpuOp):
  def __init__(self, inplace):
    self.inplace = inplace
    if inplace:
      #all outputs operate inplace on input 0 (which is Z)
      #but when the input is marked multiple times, we get an error
      #so we only mark that output 0 destroys input 0
      #anyway theano knows that input 0 will be destroyed, so it should be OK
      #TODO
      self.destroy_map = {0: [0]}
Esempio n. 18
0
    GpuCrossentropySoftmaxArgmax1HotWithBias,
    GpuCrossentropySoftmax1HotWithBiasDx,
    GpuSoftmax,
    GpuSoftmaxWithBias,
)
from theano.compile import optdb
from theano.tensor.blas import _is_real_vector, _is_real_matrix

# optdb.print_summary()  # shows what is currently registered

# Two equilibrium databases (local GPU optimizations, transfer cutting)
# chained in a sequence database, in that order (positions 1 and 2).
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()
gpu_seqopt = SequenceDB()
gpu_seqopt.register("gpu_local_optimizations", gpu_optimizer, 1, "fast_run", "inplace")
gpu_seqopt.register("gpu_cut_transfers", gpu_cut_copies, 2, "fast_run", "gpu")
# Register the sequence one position before "add_destroy_handler"
# (falling back to 49.5 - 1 when that entry has no recorded position).
optdb.register("gpu_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpu")
# This second pass is needed as the fusion can put all the non float32 code
# inside the elemwise. When there is no float64 op, this is working.
# It is placed 0.1 after "elemwise_fusion" (default position 71).
optdb.register("gpu_after_fusion", ProxyDB(gpu_seqopt), optdb.__position__.get("elemwise_fusion", 71) + 0.1, "gpu")


def register_opt(*tags, **kwargs):
    """
    Decorator factory: register a local optimizer in `gpu_optimizer`.

    :param tags: extra tags, appended after "fast_run" and "inplace"
    :param kwargs: only "name" is read; when given and truthy it is used as
        the registration name, otherwise the optimizer's __name__ is used
    :returns: a decorator that registers its argument and returns it unchanged
    """
    # Read the name once, with a default. The previous kwargs.pop("name")
    # inside the decorator raised KeyError for keyword arguments without a
    # "name" entry, and dropped the name if the decorator was applied twice.
    explicit_name = kwargs.get("name")

    def f(local_opt):
        name = explicit_name or local_opt.__name__
        gpu_optimizer.register(name, local_opt, "fast_run", "inplace", *tags)
        return local_opt

    return f


# register local_track_shape_i at this level too
Esempio n. 19
0
            ]), msg
        else:
            msg = "size must be a tuple of int or a Theano variable"
            assert isinstance(size, Variable) and size.ndim == 1, msg
        generator = theano.shared(False)  # makes a generic
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
                                          self.next_seed())
        self.state_updates.append(u.update)
        rval = u * std + avg
        if u.type.broadcastable != rval.type.broadcastable:
            raise NotImplementedError(
                'Increase the size to match the broadcasting pattern of `low`'
                'and `high` arguments')
        return rval


@local_optimizer([None])
def local_destructive(node):
    """
    Replace a non-destructive CURAND_Base op by its destructive version.

    Returns the outputs of the new node, or False when node.op is not a
    CURAND_Base or is already destructive.
    """
    op = node.op
    if not isinstance(op, CURAND_Base):
        return False
    if op.destructive:
        return False
    # op might be gpu version
    destructive_op = op.as_destructive()
    destructive_node = destructive_op.make_node(*node.inputs)
    return destructive_node.outputs


# Register local_destructive as an in2out pass (ignore_newtrees=True)
# under the 'fast_run' and 'inplace' tags.
# NOTE(review): 99 is presumably a late position in optdb -- confirm.
optdb.register('CURAND_destructive',
               opt.in2out(local_destructive, ignore_newtrees=True), 99,
               'fast_run', 'inplace')
Esempio n. 20
0
    any_inplace = False
    for info in kwargs["in_info"]:
      if info.get("want_inplace", -1) >= 0:
        any_inplace = True
        info["is_inplace"] = True
    if not any_inplace:
      return False
    new_op = node.op.__class__(**kwargs)
    from TheanoUtil import make_var_tuple
    new_v = make_var_tuple(new_op(*node.inputs))
    return new_v
  return False

# Register inplace_NativeOp as a TopoOptimizer; failures are reported
# through TopoOptimizer.warn_inplace.
# NOTE(review): 60 is presumably the position in optdb -- confirm.
optdb.register('inplace_NativeOp',
               gof.TopoOptimizer(inplace_NativeOp
                                 , failure_callback=gof.TopoOptimizer.warn_inplace
                                 ),
               60, 'fast_run', 'inplace')


@try_register_gpu_opt(NativeOp)
def local_gpu_NativeOp(node):
  if isinstance(node.op, NativeOp):
    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    from theano.sandbox.cuda import host_from_gpu, gpu_from_host, as_cuda_ndarray_variable
    args = node.inputs
    if any([(x.owner and x.owner.op == host_from_gpu) for x in args]):
      gpu_op = GpuNativeOp(**{key: getattr(node.op, key) for key in node.op.__props__})
      args = [x.owner.inputs[0] if (x.owner and x.owner.op == host_from_gpu) else x
              for x in args]
      from TheanoUtil import make_var_tuple
Esempio n. 21
0
  def add_requirements(self, fgraph):
    """Attach a toolbox.ReplaceValidate feature to `fgraph`."""
    validate_feature = toolbox.ReplaceValidate()
    fgraph.attach_feature(validate_feature)

  def apply(self, fgraph):
    """
    Remove (2, 3, 0, 1) GpuDimShuffle nodes that follow an inplace
    PoolHWBCOpGrad.

    For each such pair, the dimshuffle output is replaced by its input, and
    the PoolHWBCOpGrad output is replaced by the output of a new inplace
    PoolHWBCOpGrad built with BCHW_grad_output=True on the same inputs.
    """
    for node in fgraph.toposort():
      if not (type(node.op) == GpuDimShuffle):
        continue
      if not (node.op.new_order == (2, 3, 0, 1)):
        continue
      X = node.inputs[0]
      producer = X.owner
      # X may have no owner, hence the hasattr guard
      if not hasattr(producer, "op"):
        continue
      if not (type(producer.op) == PoolHWBCOpGrad and producer.op.inplace):
        continue
      # first drop the dimshuffle, then swap in the BCHW-output grad op
      fgraph.replace_validate(node.outputs[0], node.inputs[0])
      replace_op = PoolHWBCOpGrad(producer.op.pool_shape, inplace=True, BCHW_grad_output=True)
      fgraph.replace_validate(producer.outputs[0], replace_op(*producer.inputs))

# Single module-level optimizer instance.
RemoveConvGradDimshuffleOptimizer = RemoveConvGradDimshuffle()
# Register once only: a marker attribute on optdb is checked before, and set
# after, the registration.
# NOTE(review): 50.5 is presumably the position in optdb -- confirm.
if not hasattr(optdb, 'RemoveConvGradDimshuffleOptimizer_registered'):
  optdb.register('RemoveConvGradDimshuffle', RemoveConvGradDimshuffleOptimizer, 50.5, 'fast_run', 'inplace', 'gpuarray')
  optdb.RemoveConvGradDimshuffleOptimizer_registered = True

#---------------------------


#for the moment we implement only ignore_border = True and no padding
class PoolHWBCOp(theano.sandbox.cuda.GpuOp):
  __props__ = ("pool_shape",)

  def __init__(self, pool_shape):
    pool_shape = tuple(pool_shape)
    super(PoolHWBCOp, self).__init__()
    assert len(pool_shape) == 2, len(pool_shape)
    assert pool_shape[0] > 0, pool_shape[0]
    assert pool_shape[1] > 0, pool_shape[1]
Esempio n. 22
0
File: dnn.py Progetto: orhanf/Theano
def local_dnn_conv_inplace(node, inputs):
    return [GpuDnnConv(algo=node.op.algo, inplace=True)(*inputs)]


@inplace_allocempty(GpuDnnConvGradW, 2)
def local_dnn_convgw_inplace(node, inputs):
    """Apply an inplace GpuDnnConvGradW (same algo as node.op) to `inputs`."""
    inplace_op = GpuDnnConvGradW(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]


@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Apply an inplace GpuDnnConvGradI (same algo as node.op) to `inputs`."""
    inplace_op = GpuDnnConvGradI(algo=node.op.algo, inplace=True)
    return [inplace_op(*inputs)]

# Register the three cuDNN inplace substitutions (conv, grad-weights,
# grad-inputs) as a single in2out pass, tagged 'fast_run', 'inplace',
# 'gpuarray' and 'cudnn'.
# NOTE(review): 70.0 is presumably the position in optdb -- confirm.
optdb.register('local_dnna_conv_inplace',
               tensor.opt.in2out(local_dnn_conv_inplace,
                                 local_dnn_convgw_inplace,
                                 local_dnn_convgi_inplace,
                                 name="local_dnna_conv_inplace"),
               70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')


@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Apply a fresh GpuDnnConv (same algo as node.op) to `inputs`."""
    rebuilt_op = GpuDnnConv(algo=node.op.algo)
    return [rebuilt_op(*inputs)]


@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Apply a fresh GpuDnnConvGradW (same algo as node.op) to `inputs`."""
    rebuilt_op = GpuDnnConvGradW(algo=node.op.algo)
    return [rebuilt_op(*inputs)]
Esempio n. 23
0
        dout = T.as_tensor_variable(dout)
        return [dout]


@gof.local_optimizer([Contiguous], inplace=True)
def opt_remove_contiguous(node):
    """
    Drop a Contiguous op whose input is produced by T.Alloc, T.AllocEmpty or
    T.extra_ops.CpuContiguous; the input itself is returned as replacement.
    Returns False in every other case.
    """
    if not isinstance(node.op, Contiguous):
        return False
    (x,) = node.inputs
    producer = x.owner
    if not producer:
        return False
    producing_ops = (T.Alloc, T.AllocEmpty, T.extra_ops.CpuContiguous)
    if isinstance(producer.op, producing_ops):
        return [x]
    return False


# Register opt_remove_contiguous as a TopoOptimizer under 'fast_run'.
# NOTE(review): 10 is presumably the position in optdb -- confirm.
optdb.register('opt_remove_contiguous',
               gof.TopoOptimizer(opt_remove_contiguous), 10, 'fast_run')


# Theano will not do this optimization. So we register it now.
# See: https://github.com/Theano/Theano/issues/4400
@try_register_gpu_opt(Contiguous)
def local_gpu_Contiguous(node):
    """
    Move a Contiguous op applied to a host_from_gpu result onto the GPU.

    The replacement is host_from_gpu(gpu_contiguous(<gpu variable>)).
    Returns None when the node does not match.
    """
    if not isinstance(node.op, Contiguous):
        return None
    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    from theano.sandbox.cuda import host_from_gpu
    (x,) = node.inputs
    if not (x.owner and x.owner.op == host_from_gpu):
        return None
    from theano.sandbox.cuda.basic_ops import gpu_contiguous
    contiguous_on_gpu = gpu_contiguous(x.owner.inputs[0])
    return [host_from_gpu(contiguous_on_gpu)]

Esempio n. 24
0
    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
        nw_info = op.info.copy()
        nw_info['n_seqs'] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
        nw_outs = nwScan.make_node(*nw_outer).outputs
        return nw_outs
    else:
        return False

# Sequence database holding the scan optimizations.
scan_seqopt = theano.gof.SequenceDB()
# We run before blas opt at 1.7 and specialize 2.0
# but after stabilize at 1.5. Should we put it before stabilize?
optdb.register('scan_seqopt', scan_seqopt, 1.6, 'fast_run', 'scan')
# First entry of the scan sequence (position 5): an in2out pass over
# remove_constants_and_unused_inputs_scan with ignore_newtrees=True.
scan_seqopt.register('scanOp_remove_constants_and_unused_inputs',
                     opt.in2out(remove_constants_and_unused_inputs_scan,
                                ignore_newtrees=True),
                     5,
                     'fast_run',
                     'scan')


# This is a global opt for historical reason
# It should be possible to change it to a local opt.
class PushOutNonSeqScan(gof.Optimizer):

    def __init__(self):
        gof.Optimizer.__init__(self)
Esempio n. 25
0
            assert all([isinstance(i, int) or isinstance(i, Variable)
                for i in size]), msg
        else:
            msg = "size must be a tuple of int or a Theano variable"
            assert isinstance(size, Variable) and size.ndim == 1, msg
        generator = theano.shared(False)  # makes a generic
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size,
                self.next_seed())
        self.state_updates.append(u.update)
        rval = u * std + avg
        if u.type.broadcastable != rval.type.broadcastable:
            raise NotImplementedError(
                'Increase the size to match the broadcasting pattern of `low`'
                'and `high` arguments'
            )
        return  rval


@local_optimizer([CURAND_Base])
def local_destructive(node):
    """
    Swap a non-destructive CURAND_Base op for its destructive version.

    Returns the outputs of the new node, or False when node.op is not a
    CURAND_Base or is already destructive.
    """
    op = node.op
    if not (isinstance(op, CURAND_Base) and not op.destructive):
        return False
    # op might be gpu version
    destructive_op = op.as_destructive()
    return destructive_op.make_node(*node.inputs).outputs
# Register local_destructive as an in2out pass (ignore_newtrees=True)
# under the 'fast_run' and 'inplace' tags.
# NOTE(review): 99 is presumably a late position in optdb -- confirm.
optdb.register('CURAND_destructive',
        opt.in2out(local_destructive, ignore_newtrees=True), 99, 'fast_run',
                   'inplace')
Esempio n. 26
0
                        GpuAdvancedIncSubtensor1,
                        GpuAdvancedIncSubtensor1_dev20)

# Two equilibrium databases: local GPU optimizations and transfer cutting.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database running the two databases above at positions 1 and 2.
gpu_seqopt = SequenceDB()

# NOTE: the key below is spelled 'gpuarray_local_optimiziations' (sic). It is
# a runtime identifier, so check for users before correcting the spelling.
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# (only one optdb registration is visible here). It is placed one position
# before 'add_destroy_handler', falling back to 49.5 - 1.
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """
    Decorator factory: register a local optimizer in `gpu_optimizer`.

    :param tags: extra tags, appended after 'fast_run' and 'gpuarray'
    :param kwargs: only 'name' is read; when given and truthy it is used as
        the registration name, otherwise the optimizer's __name__ is used
    :returns: a decorator that registers its argument and returns it unchanged
    """
    # Read the name once, with a default. The previous kwargs.pop('name')
    # inside the decorator raised KeyError for keyword arguments without a
    # 'name' entry, and dropped the name if the decorator was applied twice.
    explicit_name = kwargs.get('name')

    def f(local_opt):
        name = explicit_name or local_opt.__name__
        gpu_optimizer.register(name, local_opt, 'fast_run', 'gpuarray', *tags)
        return local_opt
    return f

# Register local_track_shape_i in gpu_optimizer through register_opt,
# with the additional 'fast_compile' tag.
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)

# Registered directly, with 'unsafe' as its only tag.
gpu_optimizer.register('local_remove_all_assert',
                       theano.tensor.opt.local_remove_all_assert,
                       'unsafe')
Esempio n. 27
0
            a = LargeSparseTargets(what_to_output=2).make_node(*fnode.inputs)
            f, g = a.outputs

            z = fnode.outputs[0]
            fgraph.replace_validate(z, f, "replace by a cost+grad op")

            for gnode in gnodes:
                z = gnode.outputs[0]
                fgraph.replace_validate(z, g, "replace by a cost+grad op")


# Single module-level instance of the merge optimizer.
mergelst = MergeLargeSparseTargetOps()
#optdb['specialize'].register('merge_large_sparse_target_ops', mergelst, 'fast_run')

# Registered directly in optdb under "fast_run".
# NOTE(review): 48.5 is presumably the position in optdb -- confirm.
optdb.register("global_large_sparse_targets_merge", mergelst, 48.5, "fast_run")


# add CPU TO GPU merge
#@register_specialize
#@local_optimizer([LargeSparseTargets])
def local_large_sparse_targets_gpu(node):
    if not isinstance(node.op,
                      LargeSparseTargets) or theano.config.device == "cpu":
        return False

    if node.op.what_to_output == 0:
        return [GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)]
    elif node.op.what_to_output == 1:
        return [
            host_from_gpu(
Esempio n. 28
0
                       'fast_compile', 'fast_run')
# Legacy convolution: the three CPU optimizations (forward, grad-weights,
# grad-inputs) are added to conv_groupopt with the same third argument (40)
# and the same tags.
# NOTE(review): 40 is presumably the position/priority -- confirm.
conv_groupopt.register('local_conv2d_cpu', local_conv2d_cpu, 40,
                       'fast_compile', 'fast_run')
conv_groupopt.register('local_conv2d_gradweight_cpu',
                       local_conv2d_gradweight_cpu, 40, 'fast_compile',
                       'fast_run')
conv_groupopt.register('local_conv2d_gradinputs_cpu',
                       local_conv2d_gradinputs_cpu, 40, 'fast_compile',
                       'fast_run')


# Verify that no AbstractConv are present in the graph
@local_optimizer([
    AbstractConv2d,
    AbstractConv2d_gradWeights,
    AbstractConv2d_gradInputs,
])
def local_abstractconv_check(node):
    """
    Raise AssertionError if node.op is still one of the abstract conv ops.

    The message starts with the class name of node.op. Nothing is returned
    for any other op.
    """
    abstract_conv_ops = (AbstractConv2d, AbstractConv2d_gradWeights,
                         AbstractConv2d_gradInputs)
    if not isinstance(node.op, abstract_conv_ops):
        return None
    op_name = node.op.__class__.__name__
    message = (
        '%s Theano optimization failed: there is no implementation '
        'available supporting the requested options. Did you exclude '
        'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, '
        'is cuDNN available and does the GPU support it? If on CPU, '
        'do you have a BLAS library installed Theano can link against?' %
        op_name)
    raise AssertionError(message)


# Register local_abstractconv_check as an in2out pass for both
# 'fast_compile' and 'fast_run'.
# NOTE: the registration key is spelled 'AbstracConvCheck' (no second 't'),
# unlike the optimizer name "AbstractConvCheck". It is a runtime identifier,
# so check for users of the key before correcting the spelling.
optdb.register('AbstracConvCheck',
               opt.in2out(local_abstractconv_check, name="AbstractConvCheck"),
               48.7, 'fast_compile', 'fast_run')
Esempio n. 29
0
        """
        if isinstance(size, tuple):
            msg = "size must be a tuple of int or a Theano variable"
            assert all([isinstance(i, int) or isinstance(i, Variable) for i in size]), msg
        else:
            msg = "size must be a tuple of int or a Theano variable"
            assert isinstance(size, Variable) and size.ndim == 1, msg
        generator = theano.shared(False)  # makes a generic
        s_size = theano.tensor.as_tensor_variable(size)
        u = CURAND_Normal.new_auto_update(generator, ndim, dtype, s_size, self.next_seed())
        self.state_updates.append(u.update)
        rval = u * std + avg
        if u.type.broadcastable != rval.type.broadcastable:
            raise NotImplementedError(
                "Increase the size to match the broadcasting pattern of `low`" "and `high` arguments"
            )
        return rval


@local_optimizer([CURAND_Base])
def local_destructive(node):
    """Replace a non-destructive CURAND op by its destructive version.

    Returns the outputs of a new node built from ``op.as_destructive()`` and
    the original inputs, or False when ``node.op`` is not a CURAND_Base
    instance or is already destructive.
    """
    curand_op = node.op
    if not isinstance(curand_op, CURAND_Base) or curand_op.destructive:
        return False
    # op might be gpu version
    destructive_op = curand_op.as_destructive()
    replacement = destructive_op.make_node(*node.inputs)
    return replacement.outputs


# Register local_destructive (wrapped by in2out with ignore_newtrees=True) in
# optdb at position 99 with the "fast_run" and "inplace" tags.
optdb.register("CURAND_destructive", opt.in2out(local_destructive, ignore_newtrees=True), 99, "fast_run", "inplace")
Esempio n. 30
0
# Database registered below in gpu_seqopt as 'gpuarray_cut_transfers'.
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for 'add_destroy_handler'
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Return a decorator that registers a local optimizer in gpu_optimizer.

    The optimizer is registered under the ``name`` keyword argument when it
    is given and truthy, otherwise under the optimizer's ``__name__``.  It is
    tagged 'fast_run' and 'gpuarray', followed by the extra ``tags``.  The
    decorator returns the optimizer unchanged.
    """
    def decorator(local_opt):
        # 'name' is removed from kwargs on first use; a KeyError is raised if
        # keyword arguments were given but none of them is 'name'.
        opt_name = kwargs.pop('name') if kwargs else None
        if not opt_name:
            opt_name = local_opt.__name__
        gpu_optimizer.register(opt_name, local_opt, 'fast_run', 'gpuarray',
                               *tags)
        return local_opt
    return decorator


def register_inplace(*tags, **kwargs):
    def f(local_opt):
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
        optdb.register(
Esempio n. 31
0
from basic_ops import host_from_gpu, gpu_from_host, gpu_alloc
from elemwise import GpuElemwise, _is_scalar

# Databases registered below in gpu_seqopt as 'gpuarray_local_optimiziations'
# and 'gpuarray_cut_transfers' respectively.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for 'add_destroy_handler'
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')

def register_opt(*tags, **kwargs):
    """Build a decorator registering a local optimizer in gpu_optimizer.

    Registration uses the ``name`` keyword argument if it is given and
    truthy, else the optimizer's ``__name__``; the tags are 'fast_run',
    'gpuarray' and then any extra ``tags``.  The optimizer itself is
    returned unchanged by the decorator.
    """
    def _register(local_opt):
        # kwargs.pop raises KeyError when keywords were passed without 'name'.
        chosen = kwargs.pop('name') if kwargs else None
        if not chosen:
            chosen = local_opt.__name__
        all_tags = ('fast_run', 'gpuarray') + tags
        gpu_optimizer.register(chosen, local_opt, *all_tags)
        return local_opt
    return _register

# Register theano.tensor.opt.local_track_shape_i with no extra tags and no
# explicit name.
register_opt()(theano.tensor.opt.local_track_shape_i)

class InputToGpuOptimizer(Optimizer):
    "Transfer the input to the gpu to start the rolling wave."

    def add_requirements(self, fgraph):
Esempio n. 32
0
    op = node.op
    if (isinstance(op, IfElse) and not op.as_view and
            # For big graph, do not make inplace scalar to speed up
            # optimization.
        (len(node.fgraph.apply_nodes) < 500
         or not all([getattr(o.type, "ndim", -1) == 0
                     for o in node.outputs]))):
        return IfElse(n_outs=op.n_outs, as_view=True, gpu=op.gpu,
                      name=op.name)(*node.inputs, **dict(return_list=True))
    return False


# Register cond_make_inplace (wrapped by in2out with ignore_newtrees=True) in
# optdb at position 95 with the "fast_run" and "inplace" tags.
optdb.register(
    "cond_make_inplace",
    opt.in2out(cond_make_inplace, ignore_newtrees=True),
    95,
    "fast_run",
    "inplace",
)

# XXX: Optimizations commented pending further debugging (certain optimizations
# make computation less lazy than it should be currently).
#
# ifelse_equilibrium = gof.EquilibriumDB()
# ifelse_seqopt = gof.SequenceDB()
# ifelse_equilibrium.register('seq_ifelse', ifelse_seqopt, 'fast_run',
#                             'ifelse')
""" Comments:
I wrote these comments to explain how the optimization of the ifelse function
works (for future developers who need to parse this part of the code). Please
try to keep these comments in sync with whatever changes you make to the code.

# Two instances of the gradient op that differ only in the ``inplace`` flag.
BidirectionalTwoDLSTMOpGradNoInplaceInstance = BidirectionalTwoDLSTMOpGrad(
    inplace=False)
BidirectionalTwoDLSTMOpGradInplaceInstance = BidirectionalTwoDLSTMOpGrad(
    inplace=True)

# NOTE(review): presumably OpSub substitutes the first op by the second --
# confirm against OpSub.
BidirectionalTwoDLSTMOpInplaceOpt = OpSub(
    BidirectionalTwoDLSTMOpGradNoInplaceInstance,
    BidirectionalTwoDLSTMOpGradInplaceInstance)

#hack to avoid being called twice
# A marker attribute is stored on optdb after registering, so the registration
# (position 50.0; tags 'fast_run', 'inplace', 'gpuarray') is skipped when this
# code runs again.
if not hasattr(optdb, 'BidirectionalTwoDLSTMOpInplaceOpt_registered'):
    optdb.register(
        'BidirectionalTwoDLSTMOpInplaceOpt',
        theano.gof.TopoOptimizer(
            BidirectionalTwoDLSTMOpInplaceOpt,
            failure_callback=gof.TopoOptimizer.warn_inplace), 50.0, 'fast_run',
        'inplace', 'gpuarray')
    optdb.BidirectionalTwoDLSTMOpInplaceOpt_registered = True


class BidirectionalTwoDLSTMOp(theano.sandbox.cuda.GpuOp):
    __props__ = ()

    def __init__(self):
        super(BidirectionalTwoDLSTMOp, self).__init__()

    def make_node(self, X, W1, W2, V_h1, V_h2, V_v1, V_v2, b1, b2, sizes):
        var_names = [
            "X", "W1", "W2", "V_h1", "V_h2", "V_v1", "V_v2", "b1", "b2"
        ]
Esempio n. 34
0
    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
        nw_info = copy.deepcopy(op.info)
        nw_info['n_seqs'] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
        nw_outs = nwScan.make_node(*nw_outer).outputs
        return nw_outs
    else:
        return False

# Sequence database holding the scan optimizations.
scan_seqopt = theano.gof.SequenceDB()
# We run before blas opt at 1.7 and specialize 2.0
# but after stabilize at 1.5. Should we put it before stabilize?
optdb.register('scan_seqopt', scan_seqopt, 1.6, 'fast_run', 'scan')
# Registered at position 5 inside scan_seqopt, wrapped by in2out with
# ignore_newtrees=True.
scan_seqopt.register('scanOp_remove_constants_and_unused_inputs',
                     opt.in2out(remove_constants_and_unused_inputs_scan,
                                ignore_newtrees=True),
                     5,
                     'fast_run',
                     'scan')


# This is a global opt for historical reason
# It should be possible to change it to a local opt.
class PushOutNonSeqScan(gof.Optimizer):

    def __init__(self):
        gof.Optimizer.__init__(self)
Esempio n. 35
0
    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
        nw_info = op.info.copy()
        nw_info["n_seqs"] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
        nw_outs = nwScan.make_node(*nw_outer).outputs
        return nw_outs
    else:
        return False


# Sequence database holding the scan optimizations.
scan_seqopt = theano.gof.SequenceDB()
# We run before blas opt at 1.7 and specialize 2.0
# but after stabilize at 1.5. Should we put it before stabilize?
optdb.register("scan_seqopt", scan_seqopt, 1.6, "fast_run", "scan")
# Registered at position 5 inside scan_seqopt, wrapped by in2out with
# ignore_newtrees=True.
scan_seqopt.register(
    "scanOp_remove_constants_and_unused_inputs",
    opt.in2out(remove_constants_and_unused_inputs_scan, ignore_newtrees=True),
    5,
    "fast_run",
    "scan",
)


# This is a global opt for historical reason
# It should be possible to change it to a local opt.
class PushOutNonSeqScan(gof.Optimizer):
    def __init__(self):
        gof.Optimizer.__init__(self)
Esempio n. 36
0
    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
        nw_info = op.info.copy()
        nw_info['n_seqs'] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
        nw_outs = nwScan.make_node(*nw_outer).outputs
        return nw_outs
    else:
        return False

# Sequence database holding the scan optimizations.
scan_seqopt = theano.gof.SequenceDB()
# We run before blas opt at 1.7 and specialize 2.0
# but after stabilize at 1.5. Should we put it before stabilize?
optdb.register('scan_seqopt', scan_seqopt, 1.6, 'fast_run', 'scan')
# Registered at position 5 inside scan_seqopt, wrapped by in2out with
# ignore_newtrees=True.
scan_seqopt.register('scanOp_remove_constants_and_unused_inputs',
                     opt.in2out(remove_constants_and_unused_inputs_scan,
                                ignore_newtrees=True),
                     5,
                     'fast_run',
                     'scan')


# This is a global opt for historical reason
# It should be possible to change it to a local opt.
class PushOutNonSeqScan(gof.Optimizer):

    def __init__(self):
        gof.Optimizer.__init__(self)
Esempio n. 37
0
from theano.sandbox.gpuarray.subtensor import GpuSubtensor
from theano.sandbox.gpuarray.blas import GpuGemv, GpuGemm

# Databases registered below in gpu_seqopt as 'gpuarray_local_optimiziations'
# and 'gpuarray_cut_transfers' respectively.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2, 'fast_run',
                    'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for 'add_destroy_handler'
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Return a decorator that adds a local optimizer to gpu_optimizer.

    The entry name is the ``name`` keyword argument when present and truthy,
    otherwise the optimizer's ``__name__``.  'fast_run' and 'gpuarray' are
    always used as tags, followed by ``tags``.  The decorated optimizer is
    handed back untouched.
    """
    def wrapper(local_opt):
        # Popping 'name' raises KeyError if only other keywords were given.
        explicit_name = kwargs.pop('name') if kwargs else None
        entry_name = explicit_name if explicit_name else local_opt.__name__
        gpu_optimizer.register(entry_name, local_opt,
                               'fast_run', 'gpuarray', *tags)
        return local_opt

    return wrapper


# Register theano.tensor.opt.local_track_shape_i with no extra tags and no
# explicit name.
register_opt()(theano.tensor.opt.local_track_shape_i)

Esempio n. 38
0
    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
        nw_info = copy.deepcopy(op.info)
        nw_info["n_seqs"] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
        nw_outs = nwScan.make_node(*nw_outer).outputs
        return nw_outs
    else:
        return False


# Sequence database holding the scan optimizations.
scan_seqopt = theano.gof.SequenceDB()
# We run before blas opt at 1.7 and specialize 2.0
# but after stabilize at 1.5. Should we put it before stabilize?
optdb.register("scan_seqopt", scan_seqopt, 1.6, "fast_run", "scan")
# Registered at position 5 inside scan_seqopt, wrapped by in2out with
# ignore_newtrees=True.
scan_seqopt.register(
    "scanOp_remove_constants_and_unused_inputs",
    opt.in2out(remove_constants_and_unused_inputs_scan, ignore_newtrees=True),
    5,
    "fast_run",
    "scan",
)


# This is a global opt for historical reason
# It should be possible to change it to a local opt.
class PushOutNonSeqScan(gof.Optimizer):
    def __init__(self):
        gof.Optimizer.__init__(self)
Esempio n. 39
0
                                                                        0, 1):
                X = node.inputs[0]
                if hasattr(X.owner, "op") and type(
                        X.owner.op) == PoolHWBCOpGrad and X.owner.op.inplace:
                    fgraph.replace_validate(node.outputs[0], node.inputs[0])
                    replace_op = PoolHWBCOpGrad(X.owner.op.pool_shape,
                                                inplace=True,
                                                BCHW_grad_output=True)
                    fgraph.replace_validate(X.owner.outputs[0],
                                            replace_op(*X.owner.inputs))


# Module-level optimizer instance; it is registered in optdb only once.  A
# marker attribute is set on optdb after registering so the registration
# (position 50.5; tags 'fast_run', 'inplace', 'gpuarray') is skipped when this
# code runs again.
RemoveConvGradDimshuffleOptimizer = RemoveConvGradDimshuffle()
if not hasattr(optdb, 'RemoveConvGradDimshuffleOptimizer_registered'):
    optdb.register('RemoveConvGradDimshuffle',
                   RemoveConvGradDimshuffleOptimizer, 50.5, 'fast_run',
                   'inplace', 'gpuarray')
    optdb.RemoveConvGradDimshuffleOptimizer_registered = True

#---------------------------


#for the moment we implement only ignore_border = True and no padding
class PoolHWBCOp(theano.sandbox.cuda.GpuOp):
    __props__ = ("pool_shape", )

    def __init__(self, pool_shape):
        pool_shape = tuple(pool_shape)
        super(PoolHWBCOp, self).__init__()
        assert len(pool_shape) == 2, len(pool_shape)
        assert pool_shape[0] > 0, pool_shape[0]
Esempio n. 40
0
                                               GpuAdvancedIncSubtensor1_dev20)
from theano.sandbox.gpuarray.type import GpuArrayConstant

# Databases registered below in gpu_seqopt as 'gpuarray_local_optimiziations'
# and 'gpuarray_cut_transfers' respectively.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for 'add_destroy_handler'
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Create a decorator that registers a local optimizer in gpu_optimizer.

    The ``name`` keyword argument, when given and truthy, is the entry name;
    otherwise the optimizer's ``__name__`` is used.  Tags are 'fast_run',
    'gpuarray' and then the extra ``tags``.  The optimizer is returned as is.
    """
    def decorator(local_opt):
        # 'name' is consumed from kwargs here (KeyError if kwargs is non-empty
        # but has no 'name' key).
        if kwargs:
            opt_name = kwargs.pop('name')
        else:
            opt_name = None
        opt_name = opt_name or local_opt.__name__
        gpu_optimizer.register(opt_name, local_opt, 'fast_run', 'gpuarray',
                               *tags)
        return local_opt
    return decorator

# Register theano.tensor.opt.local_track_shape_i with the extra
# 'fast_compile' tag and no explicit name.
register_opt('fast_compile')(theano.tensor.opt.local_track_shape_i)


def safe_to_gpu(x):
    if isinstance(x.type, tensor.TensorType):
Esempio n. 41
0
def inline_ofg_expansion(node):
    """
    Expand the internal graph of an OpFromGraph node in place of the node.

    The expansion only happens when ``node.op`` is an OpFromGraph whose
    ``is_inline`` flag is true; otherwise False is returned.  Inlining can
    improve optimization at the cost of compilation speed.

    Returns a clone of the op's local outputs in which every local input is
    replaced by the corresponding input of ``node``.
    """
    ofg = node.op
    if not isinstance(ofg, OpFromGraph):
        return False
    if not ofg.is_inline:
        return False
    inner_outputs = ofg.local_outputs
    # Map each inner (local) input to the outer input feeding this node.
    substitutions = dict(zip(ofg.local_inputs, node.inputs))
    return theano.clone(inner_outputs, substitutions)


# We want to run this before the first merge optimizer
# and before the first scan optimizer.
# Hence the negative position (-0.01); tags are "fast_compile" and "fast_run".
optdb.register(
    "inline_ofg_expansion",
    gof.opt.in2out(inline_ofg_expansion),
    -0.01,
    "fast_compile",
    "fast_run",
)

# Since OpFromGraph contains a Theano compiled function,
# we should let DebugMode know about it
# ("fn" is the value recorded for OpFromGraph in ops_with_inner_function).
ops_with_inner_function[OpFromGraph] = "fn"
Esempio n. 42
0
])
def local_abstractconv_check(node):
    """Raise if ``node`` still applies an abstract 2D or 3D convolution op.

    Any other node is left untouched and None is returned.

    Raises
    ------
    LocalMetaOptimizerSkipAssertionError
        When ``node.op`` is an instance of one of the AbstractConv2d* or
        AbstractConv3d* classes checked below.  The message starts with the
        class name of the offending op.
    """
    abstract_ops = (
        AbstractConv2d,
        AbstractConv2d_gradWeights,
        AbstractConv2d_gradInputs,
        AbstractConv3d,
        AbstractConv3d_gradWeights,
        AbstractConv3d_gradInputs,
    )
    if not isinstance(node.op, abstract_ops):
        return None
    op_name = node.op.__class__.__name__
    message = (
        "%s Theano optimization failed: there is no implementation "
        "available supporting the requested options. Did you exclude "
        'both "conv_dnn" and "conv_gemm" from the optimizer? If on GPU, '
        "is cuDNN available and does the GPU support it? If on CPU, "
        "do you have a BLAS library installed Theano can link against? "
        "On the CPU we do not support float16.")
    raise LocalMetaOptimizerSkipAssertionError(message % op_name)


# Wrap the check in an in2out optimizer named "AbstractConvCheck" and register
# it in optdb at position 48.7 with the "fast_compile" and "fast_run" tags.
optdb.register(
    "AbstractConvCheck",
    in2out(local_abstractconv_check, name="AbstractConvCheck"),
    48.7,
    "fast_compile",
    "fast_run",
)
Esempio n. 43
0
@inplace_allocempty(GpuDnnConvGradI, 2)
def local_dnn_convgi_inplace(node, inputs):
    """Apply an inplace GpuDnnConvGradI to ``inputs``.

    The new op keeps the ``algo`` and ``num_groups`` of ``node.op`` and sets
    ``inplace=True``.  Returns a one-element list with the op's result.
    """
    inplace_op = GpuDnnConvGradI(algo=node.op.algo,
                                 inplace=True,
                                 num_groups=node.op.num_groups)
    result = inplace_op(*inputs)
    return [result]


# Group the three cuDNN convolution inplace optimizers into one in2out
# optimizer and register it in optdb at position 70.0 with the "fast_run",
# "inplace", "gpuarray" and "cudnn" tags.
optdb.register(
    "local_dnna_conv_inplace",
    theano.tensor.opt.in2out(
        local_dnn_conv_inplace,
        local_dnn_convgw_inplace,
        local_dnn_convgi_inplace,
        name="local_dnna_conv_inplace",
    ),
    70.0,
    "fast_run",
    "inplace",
    "gpuarray",
    "cudnn",
)


@register_opt("cudnn")
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Apply a GpuDnnConv with the node's ``algo`` and ``num_groups``.

    Returns a one-element list holding the result of the op on ``inputs``.
    """
    conv_op = GpuDnnConv(algo=node.op.algo, num_groups=node.op.num_groups)
    merged = conv_op(*inputs)
    return [merged]
Esempio n. 44
0

@opt.register_opt()
@alpha_merge(Gemm16, alpha_in=1, beta_in=4)
def local_gemm16_alpha_merge(node, *inputs):
    """Apply a Gemm16 that keeps the node's ``relu`` setting to ``inputs``.

    Returns a one-element list holding the result.
    """
    gemm_op = Gemm16(relu=node.op.relu)
    merged = gemm_op(*inputs)
    return [merged]


@opt.register_opt()
@output_merge(Gemm16, alpha_in=1, beta_in=4, out_in=0)
def local_gemm16_output_merge(node, *inputs):
    """Apply a Gemm16 that keeps the node's ``relu`` setting to ``inputs``.

    Returns a one-element list holding the result.
    """
    gemm_op = Gemm16(relu=node.op.relu)
    merged = gemm_op(*inputs)
    return [merged]


@local_optimizer([Gemm16], inplace=True)
def local_gemm16_inplace(node):
    """Replace a non-inplace Gemm16 node by an inplace one.

    Returns None when the op is not exactly a Gemm16 or is already inplace;
    otherwise a one-element list with the output of the inplace op.
    """
    gemm_op = node.op
    if type(gemm_op) != Gemm16 or gemm_op.inplace:
        return None
    new_inputs = list(node.inputs)
    out_buf = new_inputs[0]
    # When input 0 comes from a GpuAllocEmpty that has more than one client,
    # apply the allocation op again so this node gets its own variable.
    alloc_is_shared = (out_buf.owner
                       and isinstance(out_buf.owner.op, GpuAllocEmpty)
                       and len(out_buf.clients) > 1)
    if alloc_is_shared:
        new_inputs[0] = out_buf.owner.op(*out_buf.owner.inputs)
    inplace_op = Gemm16(relu=gemm_op.relu, inplace=True)
    return [inplace_op(*new_inputs)]


# Register local_gemm16_inplace (wrapped by in2out) in optdb at position 70.0
# with the 'fast_run', 'inplace' and 'gpuarray' tags.
optdb.register(
    'local_gemm16_inplace',
    tensor.opt.in2out(local_gemm16_inplace, name='local_gemm16_inplace'), 70.0,
    'fast_run', 'inplace', 'gpuarray')
Esempio n. 45
0
@local_optimizer([GpuDnnConvGradI], inplace=True)
def local_dnn_convgi_inplace(node):
    """Replace a non-inplace GpuDnnConvGradI node by an inplace one.

    Returns None when the op is not exactly a GpuDnnConvGradI or is already
    inplace; otherwise a one-element list with the output of the inplace op.
    """
    grad_op = node.op
    if type(grad_op) != GpuDnnConvGradI or grad_op.inplace:
        return None
    new_inputs = list(node.inputs)
    out_buf = new_inputs[2]
    # When input 2 comes from a GpuAllocEmpty that has more than one client,
    # build a fresh allocation of the same dtype for this node.
    alloc_is_shared = (out_buf.owner and
                       isinstance(out_buf.owner.op, GpuAllocEmpty) and
                       len(out_buf.clients) > 1)
    if alloc_is_shared:
        fresh_alloc = GpuAllocEmpty(out_buf.owner.op.dtype)
        new_inputs[2] = fresh_alloc(*out_buf.owner.inputs)
    inplace_op = GpuDnnConvGradI(algo=grad_op.algo, inplace=True)
    return [inplace_op(*new_inputs)]

# Group the three cuDNN convolution inplace optimizers into one in2out
# optimizer and register it in optdb at position 70.0 with the 'fast_run',
# 'inplace', 'gpuarray' and 'cudnn' tags.  Note that the in2out optimizer is
# named "local_dnn_conv_inplace" while the optdb entry is
# 'local_dnna_conv_inplace'.
optdb.register('local_dnna_conv_inplace',
               tensor.opt.in2out(local_dnn_conv_inplace,
                                 local_dnn_convgw_inplace,
                                 local_dnn_convgi_inplace,
                                 name="local_dnn_conv_inplace"),
               70.0, 'fast_run', 'inplace', 'gpuarray', 'cudnn')


@register_opt('cudnn')
@alpha_merge(GpuDnnConv, alpha_in=4, beta_in=5, nd=4)
def local_dnn_conv_alpha_merge(node, *inputs):
    """Apply a GpuDnnConv that keeps the node's ``algo`` to ``inputs``.

    Returns a one-element list holding the result.
    """
    conv_op = GpuDnnConv(algo=node.op.algo)
    merged = conv_op(*inputs)
    return [merged]


@register_opt('cudnn')
@alpha_merge(GpuDnnConvGradW, alpha_in=4, beta_in=5, nd=4)
def local_dnn_convw_alpha_merge(node, *inputs):
    """Apply a GpuDnnConvGradW that keeps the node's ``algo`` to ``inputs``.

    Returns a one-element list holding the result.
    """
    grad_w_op = GpuDnnConvGradW(algo=node.op.algo)
    merged = grad_w_op(*inputs)
    return [merged]
Esempio n. 46
0
    """ % locals()

  #!!! change this when changing the code!
  #def c_code_cache_version(self):
  #  return 2, 7

# Two instances of the gradient op that differ only in the ``inplace`` flag.
MultiDirectionalTwoDLSTMOpGradNoInplaceInstance = MultiDirectionalTwoDLSTMOpGrad(inplace=False)
MultiDirectionalTwoDLSTMOpGradInplaceInstance = MultiDirectionalTwoDLSTMOpGrad(inplace=True)

# NOTE(review): presumably OpSub substitutes the first op by the second --
# confirm against OpSub.
MultiDirectionalTwoDLSTMOpInplaceOpt = OpSub(MultiDirectionalTwoDLSTMOpGradNoInplaceInstance,
                                             MultiDirectionalTwoDLSTMOpGradInplaceInstance)

#hack to avoid being called twice
# A marker attribute is stored on optdb after registering, so the registration
# (position 50.0; tags 'fast_run', 'inplace', 'gpuarray') is skipped when this
# code runs again.
if not hasattr(optdb, 'MultiDirectionalTwoDLSTMOpInplaceOpt_registered'):
  optdb.register('MultiDirectionalTwoDLSTMOpInplaceOpt',
                 theano.gof.TopoOptimizer(MultiDirectionalTwoDLSTMOpInplaceOpt, failure_callback=gof.TopoOptimizer.warn_inplace),
                 50.0, 'fast_run', 'inplace', 'gpuarray')
  optdb.MultiDirectionalTwoDLSTMOpInplaceOpt_registered = True


class MultiDirectionalTwoDLSTMOp(theano.sandbox.cuda.GpuOp):
  __props__ = ()

  def __init__(self):
    super(MultiDirectionalTwoDLSTMOp, self).__init__()

  def make_node(self, X, W1, W2, W3, W4, V_h1, V_h2, V_h3, V_h4, V_v1, V_v2, V_v3, V_v4, b1, b2, b3, b4, sizes):
    var_names = ["X", "W1", "W2", "W3", "W4", "V_h1", "V_h2", "V_h3", "V_h4",
                 "V_v1", "V_v2", "V_v3", "V_v4", "b1", "b2", "b3", "b4"]
    lcl = locals()
    for var_name in var_names:
Esempio n. 47
0
    else:
        return tuple(rval)


@gof.local_optimizer([None])
def cond_make_inplace(node):
    """Replace an IfElse node that is not a view by an ``as_view`` IfElse.

    Returns the list of outputs of the new op applied to the node's inputs,
    or False when ``node.op`` is not an IfElse or already has ``as_view`` set.
    """
    ifelse_op = node.op
    if not isinstance(ifelse_op, IfElse) or ifelse_op.as_view:
        return False
    view_op = IfElse(n_outs=ifelse_op.n_outs,
                     as_view=True,
                     gpu=ifelse_op.gpu,
                     name=ifelse_op.name)
    call_options = dict(return_list=True)
    return view_op(*node.inputs, **call_options)


# Register cond_make_inplace (wrapped by in2out with ignore_newtrees=True) in
# optdb at position 95 with the 'fast_run' and 'inplace' tags.
optdb.register('cond_make_inplace', opt.in2out(cond_make_inplace,
    ignore_newtrees=True), 95, 'fast_run', 'inplace')

# XXX: Optimizations commented pending further debugging (certain optimizations
# make computation less lazy than it should be currently).
#
# ifelse_equilibrium = gof.EquilibriumDB()
# ifelse_seqopt = gof.SequenceDB()
# ifelse_equilibrium.register('seq_ifelse', ifelse_seqopt, 'fast_run',
#                             'ifelse')
''' Comments:
I wrote these comments to explain how the optimization of the ifelse function
works (for future developers who need to parse this part of the code). Please
try to keep these comments in sync with whatever changes you make to the code.

ifelse optimizations are registered before canonicalize!
Esempio n. 48
0
        return list(rval)
    else:
        return tuple(rval)


@gof.local_optimizer([IfElse])
def cond_make_inplace(node):
    """Replace an IfElse node that is not a view by an ``as_view`` IfElse.

    Returns the list of outputs of the new op applied to the node's inputs,
    or False when ``node.op`` is not an IfElse or already has ``as_view`` set.
    """
    ifelse_op = node.op
    if not isinstance(ifelse_op, IfElse) or ifelse_op.as_view:
        return False
    view_op = IfElse(n_outs=ifelse_op.n_outs, as_view=True,
                     gpu=ifelse_op.gpu, name=ifelse_op.name)
    call_options = dict(return_list=True)
    return view_op(*node.inputs, **call_options)


# Register cond_make_inplace (wrapped by in2out with ignore_newtrees=True) in
# optdb at position 95 with the 'fast_run' and 'inplace' tags.
optdb.register('cond_make_inplace',
               opt.in2out(cond_make_inplace, ignore_newtrees=True), 95,
               'fast_run', 'inplace')

# XXX: Optimizations commented pending further debugging (certain optimizations
# make computation less lazy than it should be currently).
#
# ifelse_equilibrium = gof.EquilibriumDB()
# ifelse_seqopt = gof.SequenceDB()
# ifelse_equilibrium.register('seq_ifelse', ifelse_seqopt, 'fast_run',
#                             'ifelse')
''' Comments:
I wrote these comments to explain how the optimization of the ifelse function
works (for future developers who need to parse this part of the code). Please
try to keep these comments in sync with whatever changes you make to the code.

ifelse optimizations are registered before canonicalize!
Esempio n. 49
0
            a = LargeSparseTargets(what_to_output=2).make_node(*fnode.inputs)
            f, g = a.outputs

            z = fnode.outputs[0]
            fgraph.replace_validate(z, f, "replace by a cost+grad op")

            for gnode in gnodes:
                z = gnode.outputs[0]
                fgraph.replace_validate(z, g, "replace by a cost+grad op")


# Single optimizer instance; it is registered directly in optdb below (the
# registration under optdb['specialize'] is commented out).
mergelst = MergeLargeSparseTargetOps()
#optdb['specialize'].register('merge_large_sparse_target_ops', mergelst, 'fast_run')

# Registered at position 48.5 with the "fast_run" tag.
optdb.register("global_large_sparse_targets_merge", mergelst, 48.5, "fast_run")

# add CPU TO GPU merge
#@register_specialize
#@local_optimizer([LargeSparseTargets])
def local_large_sparse_targets_gpu(node):
    """Build the GpuLargeSparseTargets replacement for a LargeSparseTargets node.

    Returns False when ``node.op`` is not a LargeSparseTargets or when
    ``theano.config.device`` is "cpu".  Otherwise the GPU op is applied to the
    node's inputs and, depending on ``what_to_output``:

    * 0 -- the GPU result is returned as is;
    * 1 -- the GPU result is passed through ``host_from_gpu``;
    * anything else -- the first result is returned as is and the second one
      is passed through ``host_from_gpu``.
    """
    if not isinstance(node.op, LargeSparseTargets) or theano.config.device == "cpu":
        return False

    mode = node.op.what_to_output
    gpu_result = GpuLargeSparseTargets(mode)(*node.inputs)
    if mode == 0:
        return [gpu_result]
    if mode == 1:
        return [host_from_gpu(gpu_result)]
    return [gpu_result[0], host_from_gpu(gpu_result[1])]
Esempio n. 50
0
# Database registered below in gpu_seqopt as 'gpuarray_cut_transfers'.
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_compile', 'fast_run', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_compile', 'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for 'add_destroy_handler'
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Return a decorator that registers a local optimizer in gpu_optimizer.

    The entry is named after the ``name`` keyword argument when it is given
    and truthy, otherwise after the optimizer's ``__name__``.  The tags are
    'fast_run', 'gpuarray' and then ``tags``.  The optimizer is returned
    unchanged.
    """
    def _register(local_opt):
        # KeyError is raised here if keywords were given without 'name'.
        requested = kwargs.pop('name') if kwargs else None
        entry_name = requested or local_opt.__name__
        registration_tags = ('fast_run', 'gpuarray') + tags
        gpu_optimizer.register(entry_name, local_opt, *registration_tags)
        return local_opt

    return _register


def register_inplace(*tags, **kwargs):
    def f(local_opt):
        name = (kwargs and kwargs.pop('name')) or local_opt.__name__
Esempio n. 51
0
    if len(nw_inner) != len(op_ins):
        op_outs = scan_utils.clone(op_outs, replace=givens)
        nw_info = copy.deepcopy(op.info)
        nw_info['n_seqs'] = nw_n_seqs
        # DEBUG CHECK
        nwScan = scan_op.Scan(nw_inner, op_outs, nw_info)
        nw_outs = nwScan.make_node(*nw_outer).outputs
        return nw_outs
    else:
        return False

# Sequence database holding the scan optimizations.
scan_seqopt = theano.gof.SequenceDB()
# We run before blas opt at 1.7 and specialize 2.0
# but after stabilize at 1.5. Should we put it before stabilize?
optdb.register('scan_seqopt', scan_seqopt, 1.6, 'fast_run', 'scan')
# Registered at position 5 inside scan_seqopt, wrapped by in2out with
# ignore_newtrees=True.
scan_seqopt.register('scanOp_remove_constants_and_unused_inputs',
                     opt.in2out(remove_constants_and_unused_inputs_scan,
                                ignore_newtrees=True),
                     5,
                     'fast_run',
                     'scan')


# This is a global opt for historical reason
# It should be possible to change it to a local opt.
class PushOutNonSeqScan(gof.Optimizer):

    def __init__(self):
        gof.Optimizer.__init__(self)
Esempio n. 52
0
    if isinstance(op, RandomFunction) and not op.inplace:
        # Read op_fn from op.state, not from op.fn, since op.fn
        # may not be picklable.
        op_fn, op_outtype, op_inplace, op_ndim_added = op._props()
        new_op = RandomFunction(op_fn,
                                op_outtype,
                                inplace=True,
                                ndim_added=op_ndim_added)
        return new_op.make_node(*node.inputs).outputs
    return False


# Register random_make_inplace (wrapped by in2out with ignore_newtrees=True)
# in optdb at position 99 with the "fast_run" and "inplace" tags.
optdb.register(
    "random_make_inplace",
    opt.in2out(random_make_inplace, ignore_newtrees=True),
    99,
    "fast_run",
    "inplace",
)


class RandomStreamsBase(object):
    def binomial(self,
                 size=None,
                 n=1,
                 p=0.5,
                 ndim=None,
                 dtype="int64",
                 prob=None):
        """
        Sample n times with probability of success p for each trial and
Esempio n. 53
0
    #!!! change this when changing the code!
    def c_code_cache_version(self):
        """Return the version tuple ``(1, 7)``.

        Per the note above this method, the value has to be changed whenever
        the code is changed.
        """
        version = (1, 7)
        return version


# Two instances of the gradient op that differ only in the ``inplace`` flag.
BLSTMOpGradNoInplaceInstance = BLSTMOpGrad(inplace=False)
BLSTMOpGradInplaceInstance = BLSTMOpGrad(inplace=True)

# NOTE(review): presumably OpSub substitutes the first op by the second --
# confirm against OpSub.
BLSTMOpGradInplaceOpt = OpSub(BLSTMOpGradNoInplaceInstance,
                              BLSTMOpGradInplaceInstance)

#hack to avoid being called twice
# A marker attribute is stored on optdb after registering, so the registration
# (position 50.0; tags 'fast_run', 'inplace', 'gpuarray') is skipped when this
# code runs again.  No failure_callback is passed to TopoOptimizer here.
if not hasattr(optdb, 'BLSTMOpGradInplaceOpt_registered'):
    optdb.register('BLSTMOpGradInplaceOpt',
                   theano.gof.TopoOptimizer(BLSTMOpGradInplaceOpt), 50.0,
                   'fast_run', 'inplace', 'gpuarray')
    optdb.BLSTMOpGradInplaceOpt_registered = True

#------------------------


class BLSTMOp(theano.sandbox.cuda.GpuOp):
    def __init__(self, inplace):
        self.inplace = inplace
        if inplace:
            #all outputs operate inplace on input 0 (which is Z)
            #but when the input is marked multiple times, we get an error
            #so we only mark that output 0 destroys input 0
            #anyway theano knows that input 0 will be destroyed, so it should be OK
            #TODO
Esempio n. 54
0
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant

# Databases registered below in gpu_seqopt as 'gpuarray_local_optimiziations'
# and 'gpuarray_cut_transfers' respectively.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as 'gpuarray_opt'.
gpu_seqopt = SequenceDB()

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register('gpuarray_local_optimiziations', gpu_optimizer, 1,
                    'fast_run', 'inplace', 'gpuarray')
gpu_seqopt.register('gpuarray_cut_transfers', gpu_cut_copies, 2,
                    'fast_run', 'gpuarray')

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for 'add_destroy_handler'
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register('gpuarray_opt', gpu_seqopt,
               optdb.__position__.get('add_destroy_handler', 49.5) - 1,
               'gpuarray')


def register_opt(*tags, **kwargs):
    """Build a decorator that registers a local optimizer in gpu_optimizer.

    The ``name`` keyword argument is used as entry name when given and
    truthy; the optimizer's ``__name__`` is the fallback.  Tags are
    'fast_run', 'gpuarray' and then ``tags``.  The decorator returns the
    optimizer unchanged.
    """
    def wrapper(local_opt):
        # Consumes 'name' from kwargs; KeyError if kwargs has other keys only.
        if kwargs:
            entry_name = kwargs.pop('name') or local_opt.__name__
        else:
            entry_name = local_opt.__name__
        gpu_optimizer.register(entry_name, local_opt,
                               'fast_run', 'gpuarray', *tags)
        return local_opt
    return wrapper

# Register theano.tensor.opt.local_track_shape_i with no extra tags and no
# explicit name.
register_opt()(theano.tensor.opt.local_track_shape_i)


def op_lifter(OP):
    """
Esempio n. 55
0
            output[0] = variable.copy()


@gof.local_optimizer([OpFromGraph])
def inline_ofg_expansion(node):
    """
    Expand the internal graph of an OpFromGraph node in place of the node.

    The expansion only happens when ``node.op`` is an OpFromGraph whose
    ``is_inline`` flag is true; otherwise False is returned.  Inlining can
    improve optimization at the cost of compilation speed.

    Returns a clone of the op's local outputs in which every local input is
    replaced by the corresponding input of ``node``.
    """
    ofg = node.op
    if not isinstance(ofg, OpFromGraph):
        return False
    if not ofg.is_inline:
        return False
    inner_outputs = ofg.local_outputs
    # Map each inner (local) input to the outer input feeding this node.
    substitutions = dict(izip(ofg.local_inputs, node.inputs))
    return theano.clone(inner_outputs, substitutions)

# We want to run this before the first merge optimizer
# and before the first scan optimizer.
# Hence the negative position (-0.01); tags are 'fast_compile' and 'fast_run'.
optdb.register(
    'inline_ofg_expansion',
    gof.opt.in2out(inline_ofg_expansion),
    -0.01, 'fast_compile', 'fast_run')

# Since OpFromGraph contains a Theano compiled function,
# we should let DebugMode know about it
# ('fn' is the value recorded for OpFromGraph in ops_with_inner_function).
ops_with_inner_function[OpFromGraph] = 'fn'
Esempio n. 56
0
from theano.compile import optdb
from theano.gof import local_optimizer, LocalOptGroup
from theano.tensor.opt import in2out


@local_optimizer([gpugemv_no_inplace], inplace=True)
def local_inplace_gpuagemv(node):
    """Swap ``gpugemv_no_inplace`` for ``gpugemv_inplace``.

    Returns a one-element list with the inplace op applied to the node's
    inputs, or None when the node's op is not ``gpugemv_no_inplace``.
    """
    if not (node.op == gpugemv_no_inplace):
        return None
    inplace_out = gpugemv_inplace(*node.inputs)
    return [inplace_out]


@local_optimizer([gpugemm_no_inplace], inplace=True)
def local_inplace_gpuagemm(node):
    """Swap ``gpugemm_no_inplace`` for ``gpugemm_inplace``.

    Returns a one-element list with the inplace op applied to the node's
    inputs, or None when the node's op is not ``gpugemm_no_inplace``.
    """
    if not (node.op == gpugemm_no_inplace):
        return None
    inplace_out = gpugemm_inplace(*node.inputs)
    return [inplace_out]


@local_optimizer([gpuger_no_inplace], inplace=True)
def local_inplace_gpuager(node):
    """Swap ``gpuger_no_inplace`` for ``gpuger_inplace``.

    Returns a one-element list with the inplace op applied to the node's
    inputs, or None when the node's op is not ``gpuger_no_inplace``.
    """
    if not (node.op == gpuger_no_inplace):
        return None
    inplace_out = gpuger_inplace(*node.inputs)
    return [inplace_out]


# Combine the three inplace BLAS optimizers into one LocalOptGroup, wrap it
# with in2out, and register the result in optdb at position 70.0 with the
# 'fast_run', 'inplace' and 'gpuarray' tags.
gpuablas_opt_inplace = in2out(LocalOptGroup(local_inplace_gpuagemv,
                                            local_inplace_gpuagemm,
                                            local_inplace_gpuager),
                              name='gpuablas_opt_inplace')
optdb.register('InplaceGpuaBlasOpt', gpuablas_opt_inplace, 70.0, 'fast_run',
               'inplace', 'gpuarray')
Esempio n. 57
0
    GpuSoftmax,
)
from theano.sandbox.gpuarray.elemwise import GpuElemwise, _is_scalar, GpuDimShuffle, GpuCAReduceCuda
from theano.sandbox.gpuarray.subtensor import GpuIncSubtensor, GpuSubtensor
from theano.sandbox.gpuarray.type import GpuArrayConstant

# Databases registered below in gpu_seqopt as "gpuarray_local_optimiziations"
# and "gpuarray_cut_transfers" respectively.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as "gpuarray_opt".
gpu_seqopt = SequenceDB()

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1, "fast_run", "inplace", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for "add_destroy_handler"
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register("gpuarray_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpuarray")


def register_opt(*tags, **kwargs):
    """Return a decorator that registers a local optimizer in gpu_optimizer.

    The entry name is the ``name`` keyword argument when given and truthy,
    else the optimizer's ``__name__``.  Tags are "fast_run", "gpuarray" and
    then ``tags``.  The optimizer is returned unchanged.
    """
    def decorator(local_opt):
        # "name" is popped from kwargs; KeyError if kwargs lacks it but is
        # not empty.
        explicit = kwargs.pop("name") if kwargs else None
        if explicit:
            entry_name = explicit
        else:
            entry_name = local_opt.__name__
        gpu_optimizer.register(entry_name, local_opt, "fast_run", "gpuarray", *tags)
        return local_opt

    return decorator


# Register theano.tensor.opt.local_track_shape_i with no extra tags and no
# explicit name.
register_opt()(theano.tensor.opt.local_track_shape_i)


def op_lifter(OP):
Esempio n. 58
0
# Module-level logger named "theano.sandbox.gpuarray.opt".
_logger = logging.getLogger("theano.sandbox.gpuarray.opt")

# Databases registered below in gpu_seqopt as "gpuarray_local_optimiziations"
# and "gpuarray_cut_transfers" respectively.
gpu_optimizer = EquilibriumDB()
gpu_cut_copies = EquilibriumDB()

# Sequence database that is itself registered in optdb as "gpuarray_opt".
gpu_seqopt = SequenceDB()

# Don't register this right now
conv_groupopt = LocalGroupDB()
conv_groupopt.__name__ = "gpua_conv_opts"

# gpu_optimizer goes first (position 1), gpu_cut_copies second (position 2).
gpu_seqopt.register("gpuarray_local_optimiziations", gpu_optimizer, 1, "fast_compile", "fast_run", "gpuarray")
gpu_seqopt.register("gpuarray_cut_transfers", gpu_cut_copies, 2, "fast_compile", "fast_run", "gpuarray")

# do not add 'fast_run' to these two as this would always enable gpuarray mode
# The position is one less than the one recorded for "add_destroy_handler"
# (49.5 is used when optdb.__position__ has no such entry).
optdb.register("gpuarray_opt", gpu_seqopt, optdb.__position__.get("add_destroy_handler", 49.5) - 1, "gpuarray")


def register_opt(*tags, **kwargs):
    """Create a decorator that registers a local optimizer in gpu_optimizer.

    When the ``name`` keyword argument is given and truthy it is used as the
    entry name; otherwise the optimizer's ``__name__`` is used.  The tags are
    "fast_run", "gpuarray" and then ``tags``.  The optimizer is returned
    unchanged.
    """
    def _register(local_opt):
        # Raises KeyError when keyword arguments were given without "name".
        picked = kwargs.pop("name") if kwargs else None
        entry_name = picked or local_opt.__name__
        every_tag = ("fast_run", "gpuarray") + tags
        gpu_optimizer.register(entry_name, local_opt, *every_tag)
        return local_opt

    return _register


# Register theano.tensor.opt.local_track_shape_i with the extra
# "fast_compile" tag and no explicit name.
register_opt("fast_compile")(theano.tensor.opt.local_track_shape_i)

# Registered directly in gpu_optimizer with only the "unsafe" tag (not through
# register_opt, so without the "fast_run"/"gpuarray" tags).
gpu_optimizer.register("local_remove_all_assert", theano.tensor.opt.local_remove_all_assert, "unsafe")
Esempio n. 59
0
        return final_samples


from theano.sandbox.gpuarray.opt import register_opt as register_gpua, host_from_gpu as host_from_gpua


@register_gpua()
@local_optimizer([mrg_uniform])
def local_gpua_mrg(node):
    """Replace an ``mrg_uniform`` node whose first input is a GpuArrayType.

    Returns None when the op is not exactly ``mrg_uniform`` or the first
    input is not of GpuArrayType.  Otherwise returns a two-element list: the
    first output of ``GPUA_mrg_uniform.new`` as is, and the second one passed
    through ``host_from_gpua``.
    """
    rng_op = node.op
    applies = (type(rng_op) == mrg_uniform
               and isinstance(node.inputs[0].type, GpuArrayType))
    if not applies:
        return None
    out_type = rng_op.output_type
    outs = GPUA_mrg_uniform.new(node.inputs[0], out_type.ndim, out_type.dtype, node.inputs[1])
    return [outs[0], host_from_gpua(outs[1])]


# The MRG uniform op classes handled by mrg_random_make_inplace below, where
# the tuple is used both for tracking and for the isinstance check.
MRG_RNGs = (mrg_uniform, GPU_mrg_uniform, GPUA_mrg_uniform)


@local_optimizer(MRG_RNGs)
def mrg_random_make_inplace(node):
    """Replace a non-inplace MRG random op by an inplace op of the same class.

    Returns the outputs of a new node built with ``inplace=True`` and the
    original inputs, or False when ``node.op`` is not one of ``MRG_RNGs`` or
    is already inplace.
    """
    rng_op = node.op
    if not isinstance(rng_op, MRG_RNGs) or rng_op.inplace:
        return False
    # op might be gpu version
    op_class = rng_op.__class__
    inplace_op = op_class(rng_op.output_type, inplace=True)
    replacement = inplace_op.make_node(*node.inputs)
    return replacement.outputs


# Register mrg_random_make_inplace (wrapped by in2out with
# ignore_newtrees=True) in optdb at position 99 with the "fast_run" and
# "inplace" tags.
optdb.register(
    "random_make_inplace_mrg", opt.in2out(mrg_random_make_inplace, ignore_newtrees=True), 99, "fast_run", "inplace"
)