Example #1
def local_large_sparse_targets_gpu(node):
    if not isinstance(node.op, LargeSparseTargets) or theano.config.device == "cpu":
        return False

    if node.op.what_to_output == 0:
        return [GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)]
    elif node.op.what_to_output == 1:
        return [host_from_gpu(GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs))]
    else:
        out = GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)
        return [out[0], host_from_gpu(out[1])]
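These rewrites are not called directly; in the old theano.sandbox.cuda back-end they are wrapped as local optimizers and registered with the GPU optimization database. A minimal registration sketch (decorator and module names assumed from that back-end, so adjust them to the Theano version actually in use):

# Hedged sketch: how a rewrite like the one above is typically registered.
# ``register_opt`` and ``local_optimizer`` are assumed to come from the old
# CUDA back-end; the function body itself is elided.
from theano.gof import local_optimizer
from theano.sandbox.cuda.opt import register_opt

@register_opt()
@local_optimizer([LargeSparseTargets])  # only visit LargeSparseTargets nodes
def local_large_sparse_targets_gpu(node):
    ...  # body as in the example above; return False when it does not apply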
Example #2
def local_large_sparse_targets_gpu(node):
    if not isinstance(node.op,
                      LargeSparseTargets) or theano.config.device == "cpu":
        return False

    if node.op.what_to_output == 0:
        return [GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)]
    elif node.op.what_to_output == 1:
        return [
            host_from_gpu(
                GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs))
        ]
    else:
        out = GpuLargeSparseTargets(node.op.what_to_output)(*node.inputs)
        return [out[0], host_from_gpu(out[1])]
Example #3
def use_gpu_images2neibs(node):
    if type(node.op) is Images2Neibs:
        return [
            host_from_gpu(
                gpu_images2neibs(gpu_from_host(node.inputs[0]), node.inputs[1], node.inputs[2], mode=node.op.mode)
            )
        ]
Example #4
def local_gpu_minres(node):
    if isinstance(node.op, MinresQLP):
        sw = False
        for inp in node.inputs:
            if inp.owner and inp.owner.op == host_from_gpu:
                sw = True
        if sw:
            inps = node.inputs
            nw_inps = []
            for inp in inps:
                if not isinstance(inp.type, CudaNdarrayType):
                    nw_inps.append(gpu_from_host(inp))
                else:
                    nw_inps.append(inp)
            new_op = node.op
            new_op.gpu = 1
            _new_outs = node.op(*nw_inps)
            new_outs = []
            for out in _new_outs:
                if isinstance(out.type, CudaNdarrayType):
                    new_outs.append(host_from_gpu(out))
                else:
                    new_outs.append(out)
            return new_outs
        else:
            return False
Example #5
def use_gpu_images2neibs(node):
    if (type(node.op) is Images2Neibs and
        node.inputs[0].dtype == 'float32' and
        node.op.mode in ['valid', 'wrap_centered']):
        return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                               node.inputs[1], node.inputs[2],
                                               mode=node.op.mode))]
Example #6
def local_gpu_multinomial(node):
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and any([
                i.owner
                and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs
        ])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [
                host_from_gpu(gpu_op(*[gpu_from_host(i)
                                       for i in node.inputs])).T
            ]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
            and node.inputs[0].owner
            and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in multi.inputs]).T
            # The dimshuffle is on the cpu, but will be moved to the gpu by an opt.
            return [gpu_from_host(ret)]
Example #7
def local_gpu_minres(node):
    if isinstance(node.op, MinresQLP):
        sw = False
        for inp in node.inputs:
            if inp.owner and inp.owner.op == host_from_gpu:
                sw = True
        if sw:
            inps = node.inputs
            nw_inps = []
            for inp in inps:
                if not isinstance(inp.type, CudaNdarrayType):
                    nw_inps.append(gpu_from_host(inp))
                else:
                    nw_inps.append(inp)
            new_op = node.op
            new_op.gpu = 1
            _new_outs = node.op(*nw_inps)
            new_outs = []
            for out in _new_outs:
                if isinstance(out.type, CudaNdarrayType):
                    new_outs.append(host_from_gpu(out))
                else:
                    new_outs.append(out)
            return new_outs
        else:
            return False
Example #8
def use_gpu_cumsum(node):
    if type(node.op) is CumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

        if node.op.mode != 'add':
            return None

        axis = node.op.axis
        x = node.inputs[0]

        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            return None

        x = gpu_from_host(x)

        if axis is None and x.ndim > 1:
            x = gpu_flatten(x)

        # ``gpu_cumsum`` assume array has been flattened if needed.
        if axis is None:
            axis = 0

        ret = host_from_gpu(GpuCumsum(axis)(x))
        ret.tag.values_eq_approx = values_eq_approx_high_tol
        return [ret]
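The values_eq_approx_high_tol tag attached to the result tells DebugMode to compare the GPU output against the CPU reference with a looser tolerance, since the GPU scan may accumulate in a different order. A comparator of that kind might look like the sketch below (illustrative only; the real helper ships with Theano):

import numpy as np

def values_eq_approx_high_tol(a, b):
    # Loose elementwise comparison for debugging; the tolerances here are
    # illustrative, not the ones Theano actually uses.
    return np.allclose(a, b, rtol=1e-3, atol=1e-3)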
Example #9
def use_gpu_images2neibs(node):
    if (type(node.op) is Images2Neibs and
        node.inputs[0].dtype == 'float32' and
        node.op.mode in ['valid', 'ignore_borders',
                         'wrap_centered']):
        return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                               node.inputs[1], node.inputs[2],
                                               mode=node.op.mode))]
Example #10
def local_gpu_advanced_incsubtensor1_scal_floats(node):
    supported_dims = {
        # x.ndim, y.ndim
        (1, 0): GpuAdvancedIncSubtensor1Floats_scal_dev20,
        (2, 2): GpuAdvancedIncSubtensor1Floats_dev20,
    }

    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        # Should not execute for GpuAdvancedIncSubtensor1
        if host_input.owner and \
           host_input.owner.op.__class__ is AdvancedIncSubtensor1Floats:
            x, y = host_input.owner.inputs[0:2]
            dims = (x.ndim, y.ndim)
            if dims not in supported_dims.keys():
                return False

            coords = host_input.owner.inputs[2:]
            set_instead_of_inc = host_input.owner.op.set_instead_of_inc
            inplace = host_input.owner.op.inplace

            gpu_op = supported_dims[dims](
                inplace=inplace, set_instead_of_inc=set_instead_of_inc)
            return [
                gpu_op(as_cuda_ndarray_variable(x),
                       as_cuda_ndarray_variable(y), *coords)
            ]

    # Should not execute for GpuAdvancedIncSubtensor1
    if (node.op.__class__ is AdvancedIncSubtensor1Floats
            and node.inputs[0].dtype == "float32"
            and node.inputs[1].dtype == "float32"
            and node.inputs[2].dtype == "float32"):
        x, y = node.inputs[0:2]
        dims = (x.ndim, y.ndim)
        if dims not in supported_dims:
            return False

        coords = node.inputs[2:]
        go_gpu = False
        if x.owner and isinstance(x.owner.op, HostFromGpu):
            go_gpu = True
            gpu_x, = x.owner.inputs
        else:
            gpu_x = as_cuda_ndarray_variable(x)
        if y.owner and isinstance(y.owner.op, HostFromGpu):
            go_gpu = True
            gpu_y, = y.owner.inputs
        else:
            gpu_y = as_cuda_ndarray_variable(y)
        if go_gpu:
            set_instead_of_inc = node.op.set_instead_of_inc
            inplace = node.op.inplace

            gpu_op = supported_dims[dims](
                inplace=inplace, set_instead_of_inc=set_instead_of_inc)
            return [host_from_gpu(gpu_op(gpu_x, gpu_y, *coords))]
    return False
Example #11
def use_gpu_images2neibs(node):
    if type(node.op) is Images2Neibs:
        return [
            host_from_gpu(
                gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                 node.inputs[1],
                                 node.inputs[2],
                                 mode=node.op.mode))
        ]
Example #12
def local_gpu_conv3d(node):
    if isinstance(node.op, Conv3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [host_from_gpu(gpu_convd(as_cuda_ndarray_variable(V),
                                                as_cuda_ndarray_variable(W),
                                                as_cuda_ndarray_variable(b),
                                                d))]
Example #13
def local_gpu_argmax(node):
    if type(node.op) is KArgmax:
        p, = node.inputs
        vals, indx, = node.outputs
        if (p.dtype == vals.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuKArgmax(node.op.K)
            ret_vals, ret_indx = gpu_op(gpu_from_host(p))
            return [host_from_gpu(ret_vals),
                    T.cast(host_from_gpu(ret_indx), "int32")]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is KArgmax):
        multi = node.inputs[0].owner
        p, = multi.inputs
        vals, indx, = multi.outputs
        if (p.dtype == vals.dtype == 'float32'):
            gpu_op = GpuKArgmax(node.inputs[0].owner.op.K)
            ret_vals, ret_indx = gpu_op(gpu_from_host(p))
            return [gpu_from_host(ret_vals), gpu_from_host(ret_indx)]
Example #14
	def save_data(self, filename, data):
		if type(data) != type(np.asarray([])):
			data = host_from_gpu(data)
			data = np.asarray(data.eval())
		mult = lambda x, y: x * y
		length = reduce(mult, data.shape)
		data = data.reshape(length)
		data = "\n".join([str(i) for i in data])
		f = open(filename, "w")
		f.write(data)
		f.close()
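The same dump can be written more compactly with numpy. A sketch under the same assumptions as the method above (data is either a numpy array or a GPU-backed variable that host_from_gpu can bring back; save_data_np is a hypothetical helper, not part of the original source):

import numpy as np

def save_data_np(filename, data):
    if not isinstance(data, np.ndarray):
        # Still a symbolic / GPU-backed variable: move it to the host and evaluate.
        data = np.asarray(host_from_gpu(data).eval())
    # One value per line, the same layout the hand-rolled writer produces.
    np.savetxt(filename, data.reshape(-1), fmt='%s')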
Example #15
def use_gpu_images2neibs(node):
    if (
        type(node.op) is Images2Neibs
        and node.inputs[0].dtype == "float32"
        and node.op.mode in ["valid", "ignore_borders", "wrap_centered"]
    ):
        return [
            host_from_gpu(
                gpu_images2neibs(gpu_from_host(node.inputs[0]), node.inputs[1], node.inputs[2], mode=node.op.mode)
            )
        ]
Example #16
def save_weights(weights, filename):
    """ Taken from the convnet code. Deals with network calculated
		on a gpu
	"""
    length = reduce(lambda x, y: x * y, weights.shape.eval())
    data = host_from_gpu(weights).eval()
    data = np.asarray(data)
    data = data.reshape(length)
    data = "\n".join([str(i) for i in data])
    f = open(filename, "w")
    f.write(data)
    f.close()
Example #17
	def save_data(self, filename, data, gpu = False):
		mult = lambda x, y: x * y
		if gpu:
			length = reduce(mult, data.shape.eval())
			data = host_from_gpu(data).eval()
			data = np.asarray(data)
		else:
			length = reduce(mult, data.shape)
		data = data.reshape(length)
		data = "\n".join([str(i) for i in data])
		f = open(filename, "w")
		f.write(data)
		f.close()
Example #18
def save_data(self, filename, data, gpu=False):
    mult = lambda x, y: x * y
    if gpu:
        length = reduce(mult, data.shape.eval())
        data = host_from_gpu(data).eval()
        data = np.asarray(data)
    else:
        length = reduce(mult, data.shape)
    data = data.reshape(length)
    data = "\n".join([str(i) for i in data])
    f = open(filename, "w")
    f.write(data)
    f.close()
Example #19
def local_gpu_conv3d(node):
    if isinstance(node.op, Conv3D):
        if numpy.any([
                i.owner and isinstance(i.owner.op, HostFromGpu)
                for i in node.inputs
        ]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, W, b, d = node.inputs
                return [
                    host_from_gpu(
                        gpu_convd(as_cuda_ndarray_variable(V),
                                  as_cuda_ndarray_variable(W),
                                  as_cuda_ndarray_variable(b), d))
                ]
Example #20
def grab_cpu_scalar(v, nd):
    if v.owner is not None:
        n = v.owner
        if (isinstance(n.op, GpuDimShuffle)
                and n.op.new_order == ('x', ) * nd):
            return host_from_gpu(n.inputs[0])
        elif (isinstance(n.op, DimShuffle) and n.op.new_order == ('x', ) * nd):
            return n.inputs[0]
        elif isinstance(n.op, GpuFromHost):
            return grab_cpu_scalar(n.inputs[0], nd=nd)
        else:
            return None
    else:
        if (isinstance(v, Constant) and v.broadcastable == (True, ) * nd):
            return v.dimshuffle(())
Example #21
def grab_cpu_scalar(v, nd):
    if v.owner is not None:
        n = v.owner
        if (isinstance(n.op, GpuDimShuffle) and
                n.op.new_order == ('x',) * nd):
            return host_from_gpu(n.inputs[0])
        elif (isinstance(n.op, DimShuffle) and
              n.op.new_order == ('x',) * nd):
            return n.inputs[0]
        elif isinstance(n.op, GpuFromHost):
            return grab_cpu_scalar(n.inputs[0], nd=nd)
        else:
            return None
    else:
        if (isinstance(v, Constant) and
                v.broadcastable == (True,) * nd):
            return v.dimshuffle(())
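grab_cpu_scalar is used by fusion rewrites to recover a plain scalar from a value that was broadcast to nd dimensions. A hypothetical call, with the setup assumed rather than taken from the original source:

import numpy as np
import theano.tensor as T

alpha = T.constant(np.full((1, 1), 0.5, dtype='float32'))  # broadcastable (1, 1)
scal = grab_cpu_scalar(alpha, nd=2)
# ``scal`` is the underlying 0-d scalar (or None when the pattern does not
# match), ready to be passed as a kernel parameter such as alpha/beta.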
Example #22
def local_gpu_forloop(node):
    if isinstance(node.op, forloop):
        sw = False
        for inp in node.inputs:
            if inp.owner and inp.owner.op == host_from_gpu:
                sw = True
        if sw:
            inps = node.inputs
            nw_inps = []
            for inp in inps:
                if not isinstance(inp.type, CudaNdarrayType):
                    nw_inps.append(gpu_from_host(inp))
                else:
                    nw_inps.append(inp)
            new_outs = node.op(*nw_inps)
            return [host_from_gpu(x) for x in new_outs]
        else:
            return False
Example #23
def local_gpu_multinomial(node):
    # TODO : need description for function
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and any([
                i.owner
                and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs
        ])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [
                host_from_gpu(gpu_op(*[gpu_from_host(i) for i in [p, u]])).T
            ]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
            and node.inputs[0].owner
            and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
Example #24
def local_gpu_multinomial(node):
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
        node.inputs[0].owner and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in multi.inputs]).T
            # The dimshuffle is on the cpu, but will be moved to the gpu by an opt.
            return [gpu_from_host(ret)]
Example #25
def local_assigner(node):
    if type(node.op) is Assigner:
        p, indx, gr, = node.inputs
        vals, = node.outputs
        if (p.dtype == vals.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuAssigner()
            ret = gpu_op(gpu_from_host(p), indx, gpu_from_host(gr))
            return [host_from_gpu(ret)]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is Assigner):
        multi = node.inputs[0].owner
        p, indx, gr = multi.inputs
        vals, = multi.outputs
        if (p.dtype == vals.dtype == 'float32'):
            gpu_op = GpuAssigner()
            ret_vals = gpu_op(gpu_from_host(p), indx, gpu_from_host(gr))
            return [gpu_from_host(ret_vals)]
Example #26
def local_gpu_multinomial(node):
    # TODO : need description for function
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op,
                                        theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
Example #27
def local_gpu_join_unsafe(node):
    """
    Inspired by the opt for convop.
    Very loose notation follows.
    Subgraphs concerned first look like
        [array of HostTensor] -> HostToGpu -> GpuToHost
        -> Join -> HostToGpu -> GpuToHost
    First we apply this Opt:
    join(host_from_gpu) -> host_from_gpu(gpu_join)
    then, as an intermediate result, there should be
    host_from_gpu(gpu_join) -> HostToGpu -> GpuToHost
    this unnecessary GpuToHost -> HostToGpu should be removed
    by other opts, leaving us with
    host_from_gpu(gpu_join)
    For intermediate places in the graph not covered by the first opt, the
    following could be useful:
    gpu_from_host(join) -> gpu_join(gpu_from_host)
    not implemented yet.
    """
    if isinstance(node.op, JoinUnsafe):
        # optimizing this case:
        # join(host_from_gpu) -> host_from_gpu(gpu_join)

        axis_and_tensors = node.inputs

        matches = [
            t.dtype == 'float32'
            and ((t.owner is not None and isinstance(t.owner.op, HostFromGpu))
                 or isinstance(t, theano.gof.Constant))
            for t in axis_and_tensors[1:]
        ]

        if all(matches):
            new_tensors = [
                as_cuda_ndarray_variable(t) for t in axis_and_tensors[1:]
            ]
            new_a_and_t = [axis_and_tensors[0]] + new_tensors

            replacement_node = host_from_gpu(GpuJoinUnsafe()(*new_a_and_t))

            return [replacement_node]
Example #28
def use_gpu_cumsum(node):
    if type(node.op) is CumsumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

        axis = node.op.axis
        x = node.inputs[0]

        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            return None

        x = gpu_from_host(x)

        if axis is None and x.ndim > 1:
            x = GpuFlatten()(x)

        # ``gpu_cumsum`` assume array has been flattened if needed.
        if axis is None:
            axis = 0

        return [host_from_gpu(GpuCumsum(axis)(x))]
Example #29
def use_gpu_cumsum(node):
    if type(node.op) is CumsumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

        axis = node.op.axis
        x = node.inputs[0]

        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            return None

        x = gpu_from_host(x)

        if axis is None and x.ndim > 1:
            x = GpuFlatten()(x)

        # ``gpu_cumsum`` assume array has been flattened if needed.
        if axis is None:
            axis = 0

        return [host_from_gpu(GpuCumsum(axis)(x))]
Example #30
def local_gpu_advanced_subtensor1_floats(node):
    if isinstance(node.op, GpuFromHost):
        host_input = node.inputs[0]
        if host_input.owner and \
           host_input.owner.op.__class__ is AdvancedSubtensor1Floats:
            x = host_input.owner.inputs[0]
            coords = host_input.owner.inputs[1:]
            return [
                GpuAdvancedSubtensor1Floats(host_input.owner.op._tag)(
                    as_cuda_ndarray_variable(x), *coords)
            ]
    if node.op.__class__ is AdvancedSubtensor1Floats:
        x = node.inputs[0]
        coords = node.inputs[1:]
        # print x.owner.op, x.type, node.op._tag # DEV
        if (x.owner and isinstance(x.owner.op, HostFromGpu)
                and x.dtype == "float32"):
            gpu_x, = x.owner.inputs
            return [
                host_from_gpu(
                    GpuAdvancedSubtensor1Floats(node.op._tag)(gpu_x, *coords))
            ]
    return False
Example #31
            input1_nervana = to_gputensor(inputs[0][0])
            input2_nervana = to_gputensor(inputs[1][0])
            output_nervana = to_gputensor(z[0])

            lib.dot(input1_nervana, input2_nervana, output_nervana,
                               alpha=1, beta=0, relu=self.relu)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk

nervana_dot = NervanaDot()


if __name__ == "__main__":
    import theano.tensor as T

    x = theano.shared(np.random.randn(2000, 3000).astype(theano.config.floatX))
    y = theano.shared(np.random.randn(3000, 1000).astype(theano.config.floatX))

    prod1 = T.dot(x, y)
    prod2 = host_from_gpu(nervana_dot(x, y))

    val1 = prod1.eval()
    val2 = prod2.eval()

    assert np.allclose(val1, val2)
Example #32
def benchmark(n_imgs, n_channels, img_shape, n_filters, filter_shape, pad):
    print('\nn_imgs: %i, n_channels: %i, img_shape: (%i, %i), '
          % ((n_imgs, n_channels) + img_shape)
          + 'n_filters: %i, filter_shape: (%i, %i), pad: %i'
          % ((n_filters,) + filter_shape + (pad,)))

    # Setup arrays
    img_h, img_w = img_shape
    filter_h, filter_w = filter_shape
    convout_h = img_h + 2*pad - filter_h + 1
    convout_w = img_w + 2*pad - filter_w + 1

    imgs_bc01_shape = (n_imgs, n_channels, img_h, img_w)
    filters_bc01_shape = (n_filters, n_channels, filter_h, filter_w)

    imgs_bc01 = np.random.randn(n_imgs, n_channels, img_h, img_w)
    imgs_c01b = np.transpose(imgs_bc01, (1, 2, 3, 0))
    filters_fc01 = np.random.randn(n_filters, n_channels, filter_h, filter_w)
    filters_c01f = np.transpose(filters_fc01, (1, 2, 3, 0))
    convout_bc01 = np.random.randn(n_imgs, n_filters, convout_h, convout_w)
    convout_c01b = np.transpose(convout_bc01, (1, 2, 3, 0))

    imgs_bc01_t = theano.shared(imgs_bc01.astype(theano.config.floatX))
    imgs_c01b_t = theano.shared(imgs_c01b.astype(theano.config.floatX))
    filters_fc01_t = theano.shared(filters_fc01.astype(theano.config.floatX))
    filters_c01f_t = theano.shared(filters_c01f.astype(theano.config.floatX))
    convout_bc01_t = theano.shared(convout_bc01.astype(theano.config.floatX))
    convout_c01b_t = theano.shared(convout_c01b.astype(theano.config.floatX))

    # Forward propagation
    print('fprop')
    convout_cc_op = FilterActs(stride=1, partial_sum=4, pad=pad)
    convout_cc_expr = convout_cc_op(imgs_c01b_t, filters_c01f_t)
    convout_cc_fun = theano.function([], convout_cc_expr)
    convout_cc = convout_cc_fun()
    convout_cc = np.transpose(convout_cc, (3, 0, 1, 2))

    convout_fft_op = ConvBC01(n_imgs, n_channels, n_filters, img_shape,
                              filter_shape, (pad, pad))
    convout_fft_expr = convout_fft_op(imgs_bc01_t, filters_fc01_t)
    convout_fft_fun = theano.function([], host_from_gpu(convout_fft_expr))
    convout_fft = convout_fft_fun()
    print('         correct: ' + str(allclose(convout_fft, convout_cc)))
    duration_cc = avg_running_time(convout_cc_fun)
    convout_fft_fun = theano.function([], convout_fft_expr)
    duration_fft = avg_running_time(convout_fft_fun)
    print('   avg. duration: cuda_convnet: %.4f  fft: %.4f'
          % (duration_cc, duration_fft))
    print('         speedup: %.2f' % (duration_cc/duration_fft))
    del convout_fft_op
    del convout_fft_expr
    del convout_fft_fun
    del convout_cc_op
    del convout_cc_expr
    del convout_cc_fun

    # Back propagation, imgs
    print('bprop_imgs')
    dimgs_cc_op = ImageActs(stride=1, partial_sum=1, pad=pad)
    dimgs_cc_expr = dimgs_cc_op(convout_c01b_t, filters_c01f_t)
    dimgs_cc_fun = theano.function([], dimgs_cc_expr)
    dimgs_cc = dimgs_cc_fun()
    dimgs_cc = np.transpose(dimgs_cc, (3, 0, 1, 2))

    dimgs_fft_op = ConvBC01ImgsGrad(n_imgs, n_channels, n_filters, img_shape,
                                    filter_shape, (pad, pad))
    dimgs_fft_expr = dimgs_fft_op(filters_fc01_t, convout_bc01_t)
    dimgs_fft_fun = theano.function([], host_from_gpu(dimgs_fft_expr))
    dimgs_fft = dimgs_fft_fun()
    print('         correct: ' + str(allclose(dimgs_fft, dimgs_cc)))
    duration_cc = avg_running_time(dimgs_cc_fun)
    dimgs_fft_fun = theano.function([], dimgs_fft_expr)
    duration_fft = avg_running_time(dimgs_fft_fun)
    print('   avg. duration: cuda_convnet: %.4f  fft: %.4f'
          % (duration_cc, duration_fft))
    print('         speedup: %.2f' % (duration_cc/duration_fft))
    del dimgs_fft_op
    del dimgs_fft_expr
    del dimgs_fft_fun
    del dimgs_cc_op
    del dimgs_cc_expr
    del dimgs_cc_fun

    # Back propagation, filters
    dfilters_cc_op = WeightActs(stride=1, partial_sum=1, pad=pad)
    dfilters_cc_expr = dfilters_cc_op(imgs_c01b_t, convout_c01b_t,
                                      T.as_tensor_variable(filter_shape))
    dfilters_cc_fun = theano.function([], dfilters_cc_expr)
    dfilters_cc = dfilters_cc_fun()[0]
    dfilters_cc = np.transpose(dfilters_cc, (3, 0, 1, 2))

    dfilters_fft_op = ConvBC01FiltersGrad(n_imgs, n_channels, n_filters,
                                          img_shape, filter_shape, (pad, pad))
    dfilters_fft_expr = dfilters_fft_op(imgs_bc01_t, convout_bc01_t)
    dfilters_fft_fun = theano.function([], host_from_gpu(dfilters_fft_expr))
    dfilters_fft = dfilters_fft_fun()
    print('bprop_filters')
    print('         correct: ' + str(allclose(dfilters_fft, dfilters_cc)))
    duration_cc = avg_running_time(dfilters_cc_fun)
    dfilters_fft_fun = theano.function([], dfilters_fft_expr)
    duration_fft = avg_running_time(dfilters_fft_fun)
    print('   avg. duration: cuda_convnet: %.4f  fft: %.4f'
          % (duration_cc, duration_fft))
    print('         speedup: %.2f' % (duration_cc/duration_fft))
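avg_running_time is used throughout this benchmark but is not part of the fragment; a minimal sketch of what such a helper could look like (the real one presumably also synchronizes the GPU before stopping the clock):

import time

def avg_running_time(fun, n_runs=10):
    # Average wall-clock time of fun() over n_runs calls; illustrative only.
    start = time.time()
    for _ in range(n_runs):
        fun()
    return (time.time() - start) / n_runs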
Example #33
            lib.dot(input1_nervana,
                    input2_nervana,
                    output_nervana,
                    alpha=1,
                    beta=0,
                    relu=self.relu)

        thunk.inputs = inputs
        thunk.outputs = outputs
        thunk.lazy = False

        return thunk


nervana_dot = NervanaDot()

if __name__ == "__main__":
    import theano.tensor as T

    x = theano.shared(np.random.randn(2000, 3000).astype(theano.config.floatX))
    y = theano.shared(np.random.randn(3000, 1000).astype(theano.config.floatX))

    prod1 = T.dot(x, y)
    prod2 = host_from_gpu(nervana_dot(x, y))

    val1 = prod1.eval()
    val2 = prod2.eval()

    assert np.allclose(val1, val2)
Example #34
    #m = N
    #n = N
    #k = N
    #m = 784
    #n = 512
    #k = 10
    m = 10000
    n = 4096
    k = 10

    print(m, n, k)

    A = T.fmatrix()
    B = T.fmatrix()
    dot1 = theano.function([A, B], T.dot(A, B))
    dot2 = theano.function([A, B], host_from_gpu(gemm(A, B)))
    dot3 = theano.function([A, B], host_from_gpu(magma_gemm(A, B)))
    dot4 = theano.function([A, B], host_from_gpu(xnor_gemm(A, B)))
    dot5 = theano.function([A, B], host_from_gpu(magma_mod_gemm(A, B)))

    # Generating random BINARY matrices
    a = SignNumpy(np.random.randn(m, n))
    b = SignNumpy(np.random.randn(n, k))
    # a = np.float32(np.random.randn(m, n))
    # b = np.float32(np.random.randn(n, k))

    start_time = time.time()
    c1 = dot1(a, b)
    dot1_duration = time.time() - start_time
    # print c1[0][0]
    print("Theano time = " + str(dot1_duration) + "s")
Example #35
# Test suite
if __name__ == "__main__":
    # N = 8192
    N = 4096
    m = N
    n = N
    k = N
    # m = 784
    # n = 512
    # k = 10

    A = T.fmatrix()
    B = T.fmatrix()
    dot1 = theano.function([A, B], T.dot(A, B))
    dot2 = theano.function([A, B], host_from_gpu(gemm(A, B)))
    dot3 = theano.function([A, B], host_from_gpu(xnor_gemm(A, B)))

    # Generating random BINARY matrices
    a = SignNumpy(np.random.randn(m, n))
    b = SignNumpy(np.random.randn(n, k))
    # a = np.float32(np.random.randn(m, n))
    # b = np.float32(np.random.randn(n, k))

    start_time = time.time()
    c1 = dot1(a, b)
    dot1_duration = time.time() - start_time
    # print c1[0][0]
    print("Theano time = " + str(dot1_duration) + "s")

    start_time = time.time()
Example #36
#        return vector_times_vector_grad(x,y,gz)


vector_times_vector = VectorTimesVector()


import numpy
import theano
from theano import tensor
import scipy
from scipy import io

a = tensor.vector('a', dtype='float32')
b = tensor.vector('b', dtype='float32')
c = vector_times_vector(a, b)
f = theano.function([a, b], host_from_gpu(c))

# ga, gb = theano.grad(c.sum(), [a, b])
# g = theano.function([a, b], [ga, gb])

x = numpy.random.randn(1000).astype('float32')
y = numpy.random.randn(1000).astype('float32')
z = f(x, y)
print('x')
print(x)
print('y')
print(y)
print('z')
print(z)
Example #37
# Test suite
if __name__ == "__main__":
    # N = 8192
    N = 4096
    m = N
    n = N
    k = N
    # m = 784
    # n = 512
    # k = 10

    A = T.fmatrix()
    B = T.fmatrix()
    dot1 = theano.function([A, B], T.dot(A, B))
    dot2 = theano.function([A, B], host_from_gpu(gemm(A, B)))
    dot3 = theano.function([A, B], host_from_gpu(xnor_gemm(A, B)))

    # Generating random BINARY matrices
    a = SignNumpy(np.random.randn(m, n))
    b = SignNumpy(np.random.randn(n, k))
    # a = np.float32(np.random.randn(m, n))
    # b = np.float32(np.random.randn(n, k))

    start_time = time.time()
    c1 = dot1(a, b)
    dot1_duration = time.time() - start_time
    # print c1[0][0]
    print("Theano time = " + str(dot1_duration) + "s")

    start_time = time.time()