Example #1
def cpu_var_to_gpu_var(x):
    from theano.sandbox import cuda
    type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    # Build a GPU variable mirroring x plus a host-side view of it.
    name = 'gpu_%s' % x.name
    gpu_var = cuda.CudaNdarrayVariable(type=type, name=name)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
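For context, a minimal usage sketch of the helper above (assuming a CUDA-enabled Theano install; the fvector x is purely illustrative and not from the original source): the first returned value is the raw CudaNdarrayVariable to feed GPU-only ops, the second is its host_from_gpu view for ordinary CPU ops.

import theano
import theano.tensor as T

x = T.fvector('x')
gpu_x, cpu_x = cpu_var_to_gpu_var(x)
# gpu_x can be handed straight to GPU-only ops; cpu_x is the same value
# transferred back to the host, so it composes with regular tensor ops.
f = theano.function([gpu_x], cpu_x * 2)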
Example #2
def cpu_var_to_gpu_var(x):
    from theano.sandbox import cuda
    type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    # Build a GPU variable mirroring x plus a host-side view of it.
    name = 'gpu_%s' % x.name
    gpu_var = cuda.CudaNdarrayVariable(type=type, name=name)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
Example #3
def test_weight_acts_strided():

    # Tests that WeightActs works with all possible strides.

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]
    for partial_sum in [0, 1, 4]:
        print("partial_sum: %d"%(partial_sum))
        for test_idx in xrange(len(shape_list)):
            images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
            filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
            gpu_images = float32_shared_constructor(images,name='images')
            print("test case %d..."%(test_idx+1))
              
            for ii in xrange(filters.shape[1]):
                stride = ii + 1                            
                output_python = FilterActs_python(images,filters,stride)   
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4:
                    if (h_rows*h_cols)%partial_sum != 0:
                        print("skip test case %d, stride %d when partial_sum is equal to %d"%(test_idx+1,stride,partial_sum))
                        break
                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
                    
                weights_grad_python = WeightActs_python(images,hidacts,filters.shape[1],filters.shape[2],stride)
                
                weights_grad = WeightActs(partial_sum=partial_sum,stride=stride)(
                                                    gpu_images,
                                                    gpu_hidacts,
                                                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                                                   )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()   
                
                warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")
                
                if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    if weights_grad_val.shape != weights_grad_python.shape:
                        print('cuda-convnet shape: ',weights_grad_val.shape)
                        print('python conv shape: ',weights_grad_python.shape)
                        assert False
                    err = np.abs(weights_grad_val - weights_grad_python)
                    print('stride %d'%stride)
                    print('absolute error range: ', (err.min(), err.max()))
                    print('mean absolute error: ', err.mean())
                    print('cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max()))
                    print('python conv value range: ', (weights_grad_python.min(), weights_grad_python.max()))
Example #4
def insert_gpu_weight_acts(node):
    """
    .. todo::

        WRITEME
    """
    if isinstance(node.op, WeightActs):
        images, hidacts, frows, fcols = node.inputs
        if any_from_gpu(images, hidacts) or any_gpu_client(*node.outputs):
            gpu_weight_acts = GpuWeightActs(
                module_stride=node.op.module_stride, partial_sum=1)
            return [
                host_from_gpu(
                    gpu_weight_acts(
                        gpu_from_host(images),
                        gpu_from_host(hidacts),
                        frows,
                        fcols,
                    ))
            ]
Example #5
 def local_to_gpu(node):
     """
     op(host_from_gpu()) -> host_from_gpu(op)
     gpu_from_host(op) -> op(gpu_from_host)
     """
     if isinstance(node.op, op):
         #op(host_from_gpu()) -> host_from_gpu(op)
         #If any of the input that go on the GPU are on the GPU,
         #move the op to the gpu.
         if any(node.inputs[idx].owner and
                isinstance(node.inputs[idx].owner.op, cuda.HostFromGpu)
                for idx in to_gpu):
             new_inp = list(node.inputs)
             for idx in to_gpu:
                 new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
             return [cuda.host_from_gpu(op()(*new_inp))]
     if node.op == cuda.gpu_from_host:
         #gpu_from_host(op) -> op(gpu_from_host)
         host_input = node.inputs[0]
         if host_input.owner and isinstance(host_input.owner.op,
                                            op):
             op_node = host_input.owner
             new_inp = list(op_node.inputs)
             for idx in to_gpu:
                 new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
             return [op()(*new_inp)]
     return False
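As a point of reference, rewrites of this shape are usually produced by a factory and registered with Theano's old CUDA optimizer database. A minimal sketch, assuming a hypothetical CPU op MyOp with a GPU counterpart MyGpuOp (local_optimizer and register_opt are the real old-Theano registration helpers; the op names are placeholders, not from the original source):

from theano.gof import local_optimizer
from theano.sandbox import cuda
from theano.sandbox.cuda.opt import register_opt

# MyOp / MyGpuOp stand in for a matching CPU/GPU op pair defined elsewhere.

@register_opt()
@local_optimizer([cuda.gpu_from_host])
def local_gpu_my_op(node):
    # gpu_from_host(MyOp(x)) -> MyGpuOp(gpu_from_host(x))
    if node.op == cuda.gpu_from_host:
        host_input = node.inputs[0]
        if host_input.owner and isinstance(host_input.owner.op, MyOp):
            x, = host_input.owner.inputs
            return [MyGpuOp()(cuda.gpu_from_host(x))]
    return False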
Example #6
def test_weight_acts_strided():

    # Tests that WeightActs works with all possible strides.

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]
    for partial_sum in [0, 1, 4]:
        print "partial_sum: %d"%(partial_sum)
        for test_idx in xrange(len(shape_list)):
            images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
            filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
            gpu_images = float32_shared_constructor(images,name='images')
            print "test case %d..."%(test_idx+1) 
              
            for ii in xrange(filters.shape[1]):
                stride = ii + 1                            
                output_python = FilterActs_python(images,filters,stride)   
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4:
                    if (h_rows*h_cols)%partial_sum != 0:
                        print "skip test case %d, stride %d when partial_sum is equal to %d"%(test_idx+1,stride,partial_sum)
                        break
                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
                    
                weights_grad_python = WeightActs_python(images,hidacts,filters.shape[1],filters.shape[2],stride)
                
                weights_grad = WeightActs(partial_sum=partial_sum,stride=stride)(
                                                    gpu_images,
                                                    gpu_hidacts,
                                                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                                                   )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()   
                
                warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")
                
                if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    if weights_grad_val.shape != weights_grad_python.shape:
                        print 'cuda-convnet shape: ',weights_grad_val.shape
                        print 'python conv shape: ',weights_grad_python.shape
                        assert False
                    err = np.abs(weights_grad_val - weights_grad_python)
                    print 'stride %d'%stride
                    print 'absolute error range: ', (err.min(), err.max())
                    print 'mean absolute error: ', err.mean()
                    print 'cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max())
                    print 'python conv value range: ', (weights_grad_python.min(), weights_grad_python.max())
Example #7
    def lmul(self, x):
        """
        dot(x, A)
        aka, do convolution with input image x

        """

        check_cuda(str(type(self)) + ".lmul")
        # TODO Why is it CPU??
        print "Por que?!?!", type(x)
        cpu = "Cuda" not in str(type(x))
        if cpu:
            x = gpu_from_host(x)

        assert x.ndim == 5
        x_axes = self.input_axes
        assert len(x_axes) == 5

        op_axes = ("c", 0, 1, "t", "b")
        if tuple(x_axes) != op_axes:
            print "ssssssssssssssss"
            x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

        _x_4d_shape = (
            self.signal_shape[0],
            self.signal_shape[1],
            self.signal_shape[2],
            self.signal_shape[3] * self.signal_shape[4],
        )

        x = x.reshape(_x_4d_shape)

        x = gpu_contiguous(x)

        rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(x, self._filters)

        if cpu:
            rval = host_from_gpu(rval)

        rval = rval.reshape(
            (
                self.filter_shape[3],
                self.filter_shape[4],
                rval.shape[1],
                rval.shape[2],
                self.signal_shape[3],
                self.signal_shape[4],
            )
        )

        rval = diagonal_subtensor(rval, 4, 0).sum(axis=0)

        # Format the output based on the output space
        rval_axes = self.output_axes
        assert len(rval_axes) == 5

        if tuple(rval_axes) != op_axes:
            rval = rval.dimshuffle(*[op_axes.index(axis) for axis in rval_axes])

        return rval
Example #8
    def local_gpu_togpu(node):

        if node.op == gpu_from_host:

            host_input = node.inputs[0]

            if host_input.owner and hasattr(host_input.owner.op,
                                            'make_gpu_node'):

                try:

                    gpu_inputs = list(
                        map(gpu_from_host, host_input.owner.inputs))

                except TypeError:

                    return False

                return [host_input.owner.op.make_gpu_node(*gpu_inputs)]

        elif hasattr(node.op, 'make_gpu_node') and all(
            [x.owner and x.owner.op == host_from_gpu for x in node.inputs]):

            gpu_inputs = [x.owner.inputs[0] for x in node.inputs]

            return [host_from_gpu(node.op.make_gpu_node(*gpu_inputs))]

        return False
Example #9
def local_gpu_conv_transp3d(node):
    if isinstance(node.op, ConvTransp3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]
Example #10
def test_match_valid_conv():

    # Tests that running FilterActs with no padding is the same as running
    # theano's conv2D in valid mode

    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn(
        """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )
    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print 'cuda-convnet shape: ', output.shape
            print 'theano shape: ', output_conv2d.shape
            assert False
        err = np.abs(output - output_conv2d)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (output.min(), output.max())
        print 'theano value range: ', (output_conv2d.min(),
                                       output_conv2d.max())
        assert False
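A brief note on the reference path used in this test (and the other FilterActs tests below): cuda-convnet's FilterActs expects the c01b layout (channels, rows, cols, batch) and computes a cross-correlation, while theano's conv2d expects bc01 (batch, channels, rows, cols) and computes a true convolution, hence the dimshuffles and the spatial filter flip. A small NumPy-only illustration of those two conversions (shapes taken from the test; nothing here touches the GPU):

import numpy as np

c01b = np.zeros((3, 10, 9, 5), dtype='float32')   # (channels, rows, cols, batch)
bc01 = c01b.transpose(3, 0, 1, 2)                  # (batch, channels, rows, cols)
assert bc01.shape == (5, 3, 10, 9)

# Flipping a kernel along both spatial axes turns correlation into convolution.
kernel = np.arange(16, dtype='float32').reshape(4, 4)
flipped = kernel[::-1, ::-1]
assert flipped[0, 0] == kernel[-1, -1]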
Example #11
    def local_to_gpu(node):
        """
        op(host_from_gpu()) -> host_from_gpu(op)
        gpu_from_host(op) -> op(gpu_from_host)

        """
        if isinstance(node.op, op):
            # op(host_from_gpu()) -> host_from_gpu(op)
            # If any of the input that go on the GPU are on the GPU,
            # move the op to the gpu.
            if any(node.inputs[idx].owner
                   and isinstance(node.inputs[idx].owner.op, cuda.HostFromGpu)
                   for idx in to_gpu):
                new_inp = list(node.inputs)
                for idx in to_gpu:
                    new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
                return [cuda.host_from_gpu(op()(*new_inp))]
        if node.op == cuda.gpu_from_host:
            # gpu_from_host(op) -> op(gpu_from_host)
            host_input = node.inputs[0]
            if host_input.owner and isinstance(host_input.owner.op, op):
                op_node = host_input.owner
                new_inp = list(op_node.inputs)
                for idx in to_gpu:
                    new_inp[idx] = cuda.gpu_from_host(new_inp[idx])
                return [op()(*new_inp)]
        return False
Example #12
        def optimize(node):
            if isinstance(node.op, cuda.GpuFromHost):
                # gpu_from_host(cpu_op) -> gpu_op(gpu_from_host)
                host_input = node.inputs[0]

                if host_input.owner and isinstance(host_input.owner.op, CpuOpCls):
                    cpu_op = host_input.owner.op
                    args = dict(zip(cpu_op.__props__, cpu_op._props()))
                    gpu_op = GpuOpCls(**args)
                    inputs = host_input.owner.inputs
                    out = gpu_op(*inputs)
                    return [out]

            if isinstance(node.op, CpuOpCls):
                # cpu_op(host_from_gpu) -> host_from_gpu(gpu_op)
                def _is_variable_on_gpu(var):
                    return var.owner and isinstance(var.owner.op, cuda.HostFromGpu)
                inputs = node.inputs
                inputs_on_gpu = map(_is_variable_on_gpu, inputs)

                if any(inputs_on_gpu):
                    cpu_op = node.op
                    args = dict(zip(cpu_op.__props__, cpu_op._props()))
                    gpu_op = GpuOpCls(**args)
                    out = gpu_op(*inputs)
                    out = cuda.host_from_gpu(out)
                    return [out]

            return False
Example #13
def local_gpu_conv_transp3d(node):
    if isinstance(node.op, ConvTransp3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                W, b, d, H, RShape = node.inputs
                return [host_from_gpu(gpu_conv_transpd(W, b, d, H, RShape))]
Example #14
def local_gpu_Contiguous(node):
    if isinstance(node.op, Contiguous):
        # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
        from theano.sandbox.cuda import host_from_gpu
        x, = node.inputs
        if x.owner and x.owner.op == host_from_gpu:
            from theano.sandbox.cuda.basic_ops import gpu_contiguous
            return [host_from_gpu(gpu_contiguous(x.owner.inputs[0]))]
Example #15
def local_gpu_Contiguous(node):
  if isinstance(node.op, Contiguous):
    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    from theano.sandbox.cuda import host_from_gpu
    x, = node.inputs
    if x.owner and x.owner.op == host_from_gpu:
      from theano.sandbox.cuda.basic_ops import gpu_contiguous
      return [host_from_gpu(gpu_contiguous(x.owner.inputs[0]))]
Example #16
def test_attention_time_gauss():
    n_T = 4
    n_batch = 2
    n_inp_dim = 3
    n_cells = 5
    n_B = 5

    custom_op = get_attention(RecurrentTransform.AttentionTimeGauss,
                              n_out=n_cells,
                              n_batches=n_batch,
                              n_input_t=n_B,
                              n_input_dim=n_inp_dim)
    att = custom_op.recurrent_transform

    Z_val = numpy.random.ranf((n_T, n_batch, 4 * n_cells)).astype('float32')
    W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    W_att_quadr_val = numpy.eye(n_B).astype('float32')
    W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
    B_val = numpy.random.ranf((n_B, n_batch, n_cells)).astype('float32')
    c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
    i_val = numpy.ones((n_T, n_batch), dtype='int8')

    Z = T.ftensor3('Z')
    B = T.ftensor3('B')  #base
    W_re = T.fmatrix('W_re')
    W_att_quadr = T.fmatrix("W_att_quadr")
    W_att_in = T.fmatrix('W_att_in')
    c = T.fmatrix('c')  #initial state
    y0 = T.fmatrix('y0')  #initial activation
    i = T.matrix('i', dtype='int8')
    t0 = T.fvector('t0')
    custom_vars = att.get_sorted_custom_vars()
    initial_state_vars = att.get_sorted_state_vars_initial()
    custom_op_inputs = [Z, c, y0, i, W_re] + custom_vars + initial_state_vars
    print "input args num:", len(custom_op_inputs)
    print "input args:", custom_op_inputs
    custom_op_outputs = custom_op(*custom_op_inputs)
    print "output args num:", len(custom_op_outputs)
    custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]
    f = theano.function(inputs=[Z, c, y0, i, W_re], outputs=custom_op_outputs)

    res = f(Z_val, c_val, y0_val, i_val, W_re_val)

    #print res
    # res: (output) Y, (gates and cell state) H, (final cell state) d, state vars sequences
    (Y, H, d), state_var_seqs = res[:3], res[3:]

    # print "running custom dumped data"
    # custom_op_inputs = [theano.shared(numpy.load("../op.i.%i" % i)) for i in range(12)]
    # custom_op_outputs = custom_op(*custom_op_inputs)
    # custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]
    # f = theano.function(inputs=[], outputs=custom_op_outputs)
    # res = f()

    print res

    assert False
Example #17
def local_gpu_TorchWrapper(node):
  if isinstance(node.op, TorchWrapperOp):
    from theano.sandbox.cuda import host_from_gpu, gpu_from_host
    args = node.inputs
    if any([(x.owner and x.owner.op == host_from_gpu) for x in args]):
      gpu_op = GpuTorchWrapperOp(**{key: getattr(node.op, key) for key in node.op.__props__})
      args = [x.owner.inputs[0] if (x.owner and x.owner.op == host_from_gpu) else x
              for x in args]
      return [host_from_gpu(gpu_op(*args))]
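The dict comprehension above is the generic way these wrappers clone a CPU op into its GPU counterpart: __props__ names the constructor parameters that define an op, so copying them yields an equivalently configured GPU op. A minimal, self-contained sketch of that idiom (plain classes with hypothetical names, no Theano required):

class MyCpuOp(object):
    __props__ = ('alpha', 'inplace')

    def __init__(self, alpha, inplace=False):
        self.alpha = alpha
        self.inplace = inplace


class MyGpuOp(MyCpuOp):
    pass


cpu_op = MyCpuOp(alpha=0.5)
# Copy every defining property of the CPU op onto the GPU op.
gpu_op = MyGpuOp(**{key: getattr(cpu_op, key) for key in cpu_op.__props__})
assert (gpu_op.alpha, gpu_op.inplace) == (0.5, False)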
Example #18
def test_grad():

    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(-1.0, 1.0, (channels, rows, cols, batch_size)).astype("float32"), name="images")
    filters = shared(
        rng.uniform(-1.0, 1.0, (channels, filter_rows, filter_cols, num_filters)).astype("float32"), name="filters"
    )

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)
    # XXX: use verify_grad
    output_grad = grad(output.sum(), images)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode="valid")

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)
    # XXX: use verify_grad
    output_conv2d_grad = grad(output_conv2d.sum(), images)
    f = function([], [output_grad, output_conv2d_grad])

    output_grad, output_conv2d_grad = f()

    warnings.warn(
        """test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )
    if np.abs(output_grad - output_conv2d_grad).max() > 7.7e-6:
        assert type(output_grad) == type(output_conv2d_grad)
        assert output_grad.dtype == output_conv2d_grad.dtype
        if output_grad.shape != output_conv2d_grad.shape:
            print "cuda-convnet shape: ", output_grad.shape
            print "theano shape: ", output_conv2d_grad.shape
            assert False
        err = np.abs(output_grad - output_conv2d_grad)
        print "absolute error range: ", (err.min(), err.max())
        print "mean absolute error: ", err.mean()
        print "cuda-convnet value range: ", (output_grad.min(), output_grad.max())
        print "theano value range: ", (output_conv2d_grad.min(), output_conv2d_grad.max())
        assert False
Example #19
def test_match_valid_conv():

    # Tests that running FilterActs with no padding is the same as running
    # theano's conv2D in valid mode

    rng = np.random.RandomState([2012,10,9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(-1., 1., (channels, rows, cols,
        batch_size)).astype('float32'), name='images')
    filters = shared(rng.uniform(-1., 1., (channels, filter_rows,
        filter_cols, num_filters)).astype('float32'), name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3,0,1,2)
    filters_bc01 = filters.dimshuffle(3,0,1,2)
    filters_bc01 = filters_bc01[:,:,::-1,::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01,
            border_mode='valid')

    output_conv2d = output_conv2d.dimshuffle(1,2,3,0)

    try:
        f = function([], [output, output_conv2d])
    except:
        raise KnownFailureTest("cuda-convnet code depends on an unmerged theano feature.")

    output, output_conv2d = f()

    warnings.warn("test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?")
    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print 'cuda-convnet shape: ',output.shape
            print 'theano shape: ',output_conv2d.shape
            assert False
        err = np.abs(output - output_conv2d)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (output.min(), output.max())
        print 'theano value range: ', (output_conv2d.min(), output_conv2d.max())
        assert False
Example #20
def test_match_valid_conv_strided():

    # Tests that running FilterActs with stride is the same as running
    # theano's conv2D in valid mode and then downsampling

    rng = np.random.RandomState([2012,10,9])

    batch_size = 5
    rows = 9
    cols = 9
    channels = 3
    filter_rows = 3
    filter_cols = filter_rows
    stride = 3
    num_filters = 16

    images = shared(rng.uniform(-1., 1., (channels, rows, cols,
        batch_size)).astype('float32'), name='images')
    filters = shared(rng.uniform(-1., 1., (channels, filter_rows,
        filter_cols, num_filters)).astype('float32'), name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs(stride=stride)(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3,0,1,2)
    filters_bc01 = filters.dimshuffle(3,0,1,2)
    filters_bc01 = filters_bc01[:,:,::-1,::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01,
            border_mode='valid', subsample=(stride, stride))

    output_conv2d_orig = output_conv2d.dimshuffle(1,2,3,0)
    output_conv2d = output_conv2d_orig  # [:, ::stride, ::stride, :]
    f = function([], [output, output_conv2d, output_conv2d_orig])

    output, output_conv2d, output_conv2d_orig = f()

    warnings.warn("""test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others.""")
    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print 'cuda-convnet shape: ',output.shape
            print 'theano shape: ',output_conv2d.shape
            assert False
        err = np.abs(output - output_conv2d)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (output.min(), output.max())
        print 'theano value range: ', (output_conv2d.min(), output_conv2d.max())
        assert False
Example #21
def insert_gpu_filter_acts(node):
    if isinstance(node.op, FilterActs):
        images, filters = node.inputs
        if any_from_gpu(images, filters) or any_gpu_client(*node.outputs):
            gpu_filter_acts = GpuFilterActs(
                    module_stride=node.op.module_stride,
                    partial_sum=1)
            return [host_from_gpu(gpu_filter_acts(
                gpu_from_host(images),
                gpu_from_host(filters)))]
Example #22
 def local_gpu_togpu_breakpoint(node):
     if isinstance(node.op, Breakpoint):
         result_input = node.inputs[0]
         if result_input.owner and result_input.owner.op == host_from_gpu:
             gpu_inputs = [x.owner.inputs[0]
                             if x.owner and x.owner.op == host_from_gpu
                             else x
                           for x in node.inputs]
             return [host_from_gpu(node.op.make_gpu_node(*gpu_inputs))]
     return False
Example #23
def test_attention_time_gauss():
  n_T = 4
  n_batch = 2
  n_inp_dim = 3
  n_cells = 5
  n_B = 5

  custom_op = get_attention(RecurrentTransform.AttentionTimeGauss,
                            n_out=n_cells, n_batches=n_batch, n_input_t=n_B, n_input_dim=n_inp_dim)
  att = custom_op.recurrent_transform

  Z_val = numpy.random.ranf((n_T,n_batch,4*n_cells)).astype('float32')
  W_re_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  W_att_quadr_val = numpy.eye(n_B).astype('float32')
  W_att_in_val = numpy.random.ranf((n_cells, 4 * n_cells)).astype('float32')
  B_val = numpy.random.ranf((n_B,n_batch,n_cells)).astype('float32')
  c_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  y0_val = numpy.random.ranf((n_batch, n_cells)).astype('float32')
  i_val = numpy.ones((n_T, n_batch), dtype='int8')

  Z = T.ftensor3('Z')
  B = T.ftensor3('B') #base
  W_re = T.fmatrix('W_re')
  W_att_quadr = T.fmatrix("W_att_quadr")
  W_att_in = T.fmatrix('W_att_in')
  c = T.fmatrix('c') #initial state
  y0 = T.fmatrix('y0') #initial activation
  i = T.matrix('i',dtype='int8')
  t0 = T.fvector('t0')
  custom_vars = att.get_sorted_custom_vars()
  initial_state_vars = att.get_sorted_state_vars_initial()
  custom_op_inputs = [Z, c, y0, i, W_re] + custom_vars + initial_state_vars
  print("input args num:", len(custom_op_inputs))
  print("input args:", custom_op_inputs)
  custom_op_outputs = custom_op(*custom_op_inputs)
  print("output args num:", len(custom_op_outputs))
  custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]
  f = theano.function(inputs=[Z, c, y0, i, W_re], outputs=custom_op_outputs)

  res = f(Z_val, c_val, y0_val, i_val, W_re_val)

  #print res
  # res: (output) Y, (gates and cell state) H, (final cell state) d, state vars sequences
  (Y, H, d), state_var_seqs = res[:3], res[3:]

  # print "running custom dumped data"
  # custom_op_inputs = [theano.shared(numpy.load("../op.i.%i" % i)) for i in range(12)]
  # custom_op_outputs = custom_op(*custom_op_inputs)
  # custom_op_outputs = [cuda.host_from_gpu(v) for v in custom_op_outputs]
  # f = theano.function(inputs=[], outputs=custom_op_outputs)
  # res = f()

  print(res)

  assert False
Example #24
def local_gpu_conv_grad3d(node):
    if isinstance(node.op, ConvGrad3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d,
                    WShape,
                    as_cuda_ndarray_variable(dCdH)))]
Example #25
def local_gpu_conv_grad3d(node):
    if isinstance(node.op, ConvGrad3D):
        if numpy.any([i.owner and isinstance(i.owner.op, HostFromGpu)
                      for i in node.inputs]):
            if numpy.all([o.type.dtype == 'float32' for o in node.outputs]):
                V, d, WShape, dCdH = node.inputs
                return [host_from_gpu(gpu_conv_grad3d(
                    as_cuda_ndarray_variable(V),
                    d,
                    WShape,
                    as_cuda_ndarray_variable(dCdH)))]
Example #26
def test_viewop_gpu():
    from theano.sandbox import cuda
    if cuda.cuda_available == False:
        raise SkipTest('Optional package cuda disabled')
    _x = theano.tensor.fvector('x')
    x = cuda.gpu_from_host(_x)
    _out = theano.compile.ViewOp()(x)
    out = cuda.host_from_gpu(_out)
    f = theano.function([x], out, mode=mode_with_gpu)
    data = numpy.array([1, 2, 3], dtype='float32')
    assert numpy.allclose(f(data), data)
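mode_with_gpu is not defined in this snippet; in Theano's own CUDA test suites it is conventionally built from the default compile mode with the GPU optimizations enabled. A plausible definition, offered as an assumption rather than the exact one used here:

import theano

# Enable the old CUDA backend's transfer optimizations so the graph compiled
# in test_viewop_gpu actually runs through gpu_from_host/host_from_gpu.
mode_with_gpu = theano.compile.mode.get_default_mode().including('gpu')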
Example #27
def insert_gpu_filter_acts(node):
    if isinstance(node.op, FilterActs):
        images, filters = node.inputs
        if any_from_gpu(images, filters) or any_gpu_client(*node.outputs):
            gpu_filter_acts = GpuFilterActs(
                module_stride=node.op.module_stride, partial_sum=1)
            return [
                host_from_gpu(
                    gpu_filter_acts(gpu_from_host(images),
                                    gpu_from_host(filters)))
            ]
Example #28
def local_gpu_NativeOp(node):
  if isinstance(node.op, NativeOp):
    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    from theano.sandbox.cuda import host_from_gpu, gpu_from_host, as_cuda_ndarray_variable
    args = node.inputs
    if any([(x.owner and x.owner.op == host_from_gpu) for x in args]):
      gpu_op = GpuNativeOp(**{key: getattr(node.op, key) for key in node.op.__props__})
      args = [x.owner.inputs[0] if (x.owner and x.owner.op == host_from_gpu) else x
              for x in args]
      from TheanoUtil import make_var_tuple
      outputs = make_var_tuple(gpu_op(*args))
      return [host_from_gpu(out) for out in outputs]
Example #29
def test_viewop_gpu():
    from theano.sandbox import cuda

    if cuda.cuda_available == False:
        raise SkipTest("Optional package cuda disabled")
    _x = theano.tensor.fvector("x")
    x = cuda.gpu_from_host(_x)
    _out = theano.compile.ViewOp()(x)
    out = cuda.host_from_gpu(_out)
    f = theano.function([x], out, mode=mode_with_gpu)
    data = numpy.array([1, 2, 3], dtype="float32")
    assert numpy.allclose(f(data), data)
Example #30
    def lmul(self, x):
        """
        dot(x, A)
        aka, do convolution with input image x

        """

        check_cuda(str(type(self)) + ".lmul")
        # TODO Why is it CPU??
        print 'Por que?!?!', type(x)
        cpu = 'Cuda' not in str(type(x))
        if cpu:
            x = gpu_from_host(x)

        assert x.ndim == 5
        x_axes = self.input_axes
        assert len(x_axes) == 5

        op_axes = ('c', 0, 1, 't', 'b')
        if tuple(x_axes) != op_axes:
            print 'ssssssssssssssss'
            x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

        _x_4d_shape = (self.signal_shape[0], self.signal_shape[1],
                       self.signal_shape[2],
                       self.signal_shape[3] * self.signal_shape[4])

        x = x.reshape(_x_4d_shape)

        x = gpu_contiguous(x)

        rval = FilterActs(self.pad, self.partial_sum,
                          self.kernel_stride[0])(x, self._filters)

        if cpu:
            rval = host_from_gpu(rval)

        rval = rval.reshape(
            (self.filter_shape[3], self.filter_shape[4], rval.shape[1],
             rval.shape[2], self.signal_shape[3], self.signal_shape[4]))

        rval = diagonal_subtensor(rval, 4, 0).sum(axis=0)

        # Format the output based on the output space
        rval_axes = self.output_axes
        assert len(rval_axes) == 5

        if tuple(rval_axes) != op_axes:
            rval = rval.dimshuffle(
                *[op_axes.index(axis) for axis in rval_axes])

        return rval
Example #31
def insert_gpu_weight_acts(node):
    if isinstance(node.op, WeightActs):
        images, hidacts, frows, fcols = node.inputs
        if any_from_gpu(images, hidacts) or any_gpu_client(*node.outputs):
            gpu_weight_acts = GpuWeightActs(
                    module_stride=node.op.module_stride,
                    partial_sum=1)
            return [host_from_gpu(gpu_weight_acts(
                gpu_from_host(images),
                gpu_from_host(hidacts),
                frows,
                fcols,
                ))]
Example #32
def insert_gpu_img_acts(node):
    if isinstance(node.op, ImgActs):
        filters, hidacts, irows, icols = node.inputs
        if any_from_gpu(filters, hidacts) or any_gpu_client(*node.outputs):
            gpu_img_acts = GpuImgActs(
                    module_stride=node.op.module_stride,
                    partial_sum=1)
            return [host_from_gpu(gpu_img_acts(
                gpu_from_host(filters),
                gpu_from_host(hidacts),
                irows,
                icols,
                ))]
Example #33
def local_gpu_alloc_diagonal(node):
    if (isinstance(node.op, AllocDiag) and
        isinstance(node.inputs[0].type,
                   theano.tensor.TensorType)):
        inp = node.inputs[0]
        if inp.owner and isinstance(inp.owner.op, cuda.HostFromGpu):
            diag = inp.owner.inputs[0]
            y = cuda.gpu_from_host(tensor.alloc(numpy.asarray(0, dtype=diag.dtype), diag.shape[0], diag.shape[0]))
            y = theano.tensor.nnet.conv3d2d.IncDiagonalSubtensor()(y, 0, 1, diag)
            return [cuda.host_from_gpu(y)]
        else:
            return False
    return False
Example #34
def test_image_acts_strided():

    # Tests that ImageActs works with all possible strides.

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print("test case %d..."%(test_idx+1))
        
        for ii in xrange(filters.shape[1]):
            stride = ii + 1
                   
            output_python = FilterActs_python(images,filters,stride)
            hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
            gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
            Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2]))            
            
            Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2])))
            Img_output = host_from_gpu(Img_output)
            f = function([], Img_output)
            Img_output_val = f()
            
            warnings.warn("""test_image_acts_strided success criterion is not very strict.""")
            
            if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5:
                assert type(Img_output_val) == type(Img_output_python)
                assert Img_output_val.dtype == Img_output_python.dtype
                if Img_output_val.shape != Img_output_python.shape:
                    print('cuda-convnet shape: ',Img_output_val.shape)
                    print('python conv shape: ',Img_output_python.shape)
                    assert False
                err = np.abs(Img_output_val - Img_output_python)
                print('stride %d'%stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ', (Img_output_val.min(), Img_output_val.max()))
                print('python conv value range: ', (Img_output_python.min(), Img_output_python.max()))
Example #35
def test_filter_acts_strided():

    # Tests that FilterActs works with all possible strides.

    rng = np.random.RandomState([2012, 10, 9])

    #Each list in shape_list :
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [
        [(1, 7, 8, 5), (1, 2, 2, 16)],
        [(3, 7, 8, 5), (3, 3, 3, 16)],
        [(16, 11, 11, 4), (16, 4, 4, 16)],
        [(3, 20, 20, 3), (3, 5, 5, 16)],
        [(3, 21, 21, 3), (3, 6, 6, 16)],
    ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1.,
                             shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1.,
                              shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images, name='images')
        gpu_filters = float32_shared_constructor(filters, name='filters')
        print("test case %d..." % (test_idx + 1))

        for ii in xrange(filters.shape[1]):
            stride = ii + 1

            output = FilterActs(stride=stride)(gpu_images, gpu_filters)
            output = host_from_gpu(output)
            f = function([], output)
            output_val = f()

            output_python = FilterActs_python(images, filters, stride)

            if np.abs(output_val - output_python).max() > 8.6e-6:
                assert type(output_val) == type(output_python)
                assert output_val.dtype == output_python.dtype
                if output_val.shape != output_python.shape:
                    print('cuda-convnet shape: ', output_val.shape)
                    print('python conv shape: ', output_python.shape)
                    assert False
                err = np.abs(output_val - output_python)
                print('stride %d' % stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ',
                      (output_val.min(), output_val.max()))
                print('python conv value range: ',
                      (output_python.min(), output_python.max()))
Example #36
def test_image_acts_strided():

    # Tests that ImageActs works with all possible strides.

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print "test case %d..."%(test_idx+1) 
        
        for ii in xrange(filters.shape[1]):
            stride = ii + 1
                   
            output_python = FilterActs_python(images,filters,stride)
            hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
            gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
            Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2]))            
            
            Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2])))
            Img_output = host_from_gpu(Img_output)
            f = function([], Img_output)
            Img_output_val = f()
            
            warnings.warn("""test_image_acts_strided success criterion is not very strict.""")
            
            if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5:
                assert type(Img_output_val) == type(Img_output_python)
                assert Img_output_val.dtype == Img_output_python.dtype
                if Img_output_val.shape != Img_output_python.shape:
                    print 'cuda-convnet shape: ',Img_output_val.shape
                    print 'python conv shape: ',Img_output_python.shape
                    assert False
                err = np.abs(Img_output_val - Img_output_python)
                print 'stride %d'%stride
                print 'absolute error range: ', (err.min(), err.max())
                print 'mean absolute error: ', err.mean()
                print 'cuda-convnet value range: ', (Img_output_val.min(), Img_output_val.max())
                print 'python conv value range: ', (Img_output_python.min(), Img_output_python.max())    
Example #37
    def lmul(self, x):
        """
        .. todo::

            WRITEME properly

        dot(x, A)
        aka, do convolution with input image x
        """

        check_cuda(str(type(self)) + ".lmul")

        cpu = 'Cuda' not in str(type(x))

        if cpu:
            x = gpu_from_host(x)

        # x must be formatted as channel, topo dim 0, topo dim 1, batch_index
        # for use with FilterActs
        assert x.ndim == 4
        x_axes = self.input_axes
        assert len(x_axes) == 4

        op_axes = ('c', 0, 1, 'b')

        if tuple(x_axes) != op_axes:
            x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

        x = gpu_contiguous(x)

        # Patch old pickle files.
        if not hasattr(self, 'kernel_stride'):
            self.kernel_stride = (1, 1)
        rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(
            x,
            self._filters
        )

        # Format the output based on the output space
        rval_axes = self.output_axes
        assert len(rval_axes) == 4

        if cpu:
            rval = host_from_gpu(rval)

        if tuple(rval_axes) != op_axes:
            rval = rval.dimshuffle(*[op_axes.index(axis)
                                     for axis in rval_axes])

        return rval
Example #38
    def lmul(self, x):
        """
        .. todo::

            WRITEME properly

        dot(x, A)
        aka, do convolution with input image x
        """

        check_cuda(str(type(self)) + ".lmul")

        cpu = 'Cuda' not in str(type(x))

        if cpu:
            x = gpu_from_host(x)

        # x must be formatted as channel, topo dim 0, topo dim 1, batch_index
        # for use with FilterActs
        assert x.ndim == 4
        x_axes = self.input_axes
        assert len(x_axes) == 4

        op_axes = ('c', 0, 1, 'b')

        if tuple(x_axes) != op_axes:
            x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

        x = gpu_contiguous(x)

        # Patch old pickle files.
        if not hasattr(self, 'kernel_stride'):
            self.kernel_stride = (1, 1)
        rval = FilterActs(self.pad, self.partial_sum, self.kernel_stride[0])(
            x,
            self._filters
        )

        # Format the output based on the output space
        rval_axes = self.output_axes
        assert len(rval_axes) == 4

        if cpu:
            rval = host_from_gpu(rval)

        if tuple(rval_axes) != op_axes:
            rval = rval.dimshuffle(*[op_axes.index(axis)
                                     for axis in rval_axes])

        return rval
Example #39
def _local_gpu_native_op(node):
  if isinstance(node.op, TheanoNativeOp):
    # see also: https://github.com/Theano/Theano/blob/master/theano/sandbox/cuda/opt.py
    # noinspection PyUnresolvedReferences,PyPackageRequirements
    from theano.sandbox.cuda import host_from_gpu, gpu_from_host, as_cuda_ndarray_variable
    args = node.inputs
    if any([(x.owner and x.owner.op == host_from_gpu) for x in args]):
      gpu_op = TheanoGpuNativeOp(**{key: getattr(node.op, key) for key in node.op.__props__})
      args = [x.owner.inputs[0] if (x.owner and x.owner.op == host_from_gpu) else x
              for x in args]
      from returnn.theano.util import make_var_tuple
      # noinspection PyCallingNonCallable
      outputs = make_var_tuple(gpu_op(*args))
      return [host_from_gpu(out) for out in outputs]
Example #40
 def grab_lr(v):
     if v.owner is not None:
         n = v.owner
         if isinstance(n.op, GpuDimShuffle) and n.op.new_order == ("x", "x", "x", "x"):
             return host_from_gpu(n.inputs[0])
         elif isinstance(n.op, DimShuffle) and n.op.new_order == ("x", "x", "x", "x"):
             return n.inputs[0]
         elif isinstance(n.op, GpuFromHost):
             return grab_lr(n.inputs[0])
         else:
             return None
     else:
         if isinstance(v, Constant) and v.broadcastable == (True, True, True, True):
             return v.dimshuffle(())
Example #41
def insert_gpu_img_acts(node):
    if isinstance(node.op, ImgActs):
        filters, hidacts, irows, icols = node.inputs
        if any_from_gpu(filters, hidacts) or any_gpu_client(*node.outputs):
            gpu_img_acts = GpuImgActs(module_stride=node.op.module_stride,
                                      partial_sum=1)
            return [
                host_from_gpu(
                    gpu_img_acts(
                        gpu_from_host(filters),
                        gpu_from_host(hidacts),
                        irows,
                        icols,
                    ))
            ]
Example #42
 def local_gpu_togpu(node):
     if node.op == gpu_from_host:
         host_input = node.inputs[0]
         if host_input.owner and \
                 hasattr(host_input.owner.op, 'make_gpu_node'):
             try:
                 gpu_inputs = map(gpu_from_host, host_input.owner.inputs)
             except TypeError:
                 return False
             return [host_input.owner.op.make_gpu_node(*gpu_inputs)]
     elif hasattr(node.op, 'make_gpu_node') and \
             all([x.owner and x.owner.op == host_from_gpu
                  for x in node.inputs]):
         gpu_inputs = [x.owner.inputs[0] for x in node.inputs]
         return [host_from_gpu(node.op.make_gpu_node(*gpu_inputs))]
     return False
Example #43
 def grab_lr(v):
     if v.owner is not None:
         n = v.owner
         if (isinstance(n.op, GpuDimShuffle) and
               n.op.new_order == ('x', 'x', 'x', 'x')):
             return host_from_gpu(n.inputs[0])
         elif (isinstance(n.op, DimShuffle) and
               n.op.new_order == ('x', 'x', 'x', 'x')):
             return n.inputs[0]
         elif isinstance(n.op, GpuFromHost):
               return grab_lr(n.inputs[0])
         else:
             return None
     else:
         if (isinstance(v, Constant) and
             v.broadcastable == (True, True, True, True)):
             return v.dimshuffle(())
Example #44
def test_filter_acts_strided():

    # Tests that FilterActs works with all possible strides.

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print "test case %d..."%(test_idx+1) 
        
        for ii in xrange(filters.shape[1]):
            stride = ii + 1
            
            output = FilterActs(stride=stride)(gpu_images, gpu_filters)
            output = host_from_gpu(output)
            f = function([], output)
            output_val = f()
        
            output_python = FilterActs_python(images,filters,stride)
                        
            if np.abs(output_val - output_python).max() > 8.6e-6:
                assert type(output_val) == type(output_python)
                assert output_val.dtype == output_python.dtype
                if output_val.shape != output_python.shape:
                    print 'cuda-convnet shape: ',output_val.shape
                    print 'python conv shape: ',output_python.shape
                    assert False
                err = np.abs(output_val - output_python)
                print 'stride %d'%stride
                print 'absolute error range: ', (err.min(), err.max())
                print 'mean absolute error: ', err.mean()
                print 'cuda-convnet value range: ', (output_val.min(), output_val.max())
                print 'python conv value range: ', (output_python.min(), output_python.max())
Example #45
def make_bwd_fun(recurrent_transform):
  y_p = recurrent_transform.y_p
  z_re, state_updates = recurrent_transform.step(y_p)
  custom_vars = recurrent_transform.get_sorted_custom_vars()
  state_vars_prev = recurrent_transform.get_sorted_state_vars()

  Dz_re = recurrent_transform.tt.fmatrix("Dz_re")
  state_var_new_grads = {state_updates[v]: v.type("D_" + v.name) for v in state_vars_prev}
  state_var_new_grads_list = [state_var_new_grads[state_updates[k]] for k in state_vars_prev]
  known_grads = {z_re: Dz_re}
  known_grads.update(state_var_new_grads)
  if recurrent_transform.force_gpu:
    # We need the symbolic host representation.
    # See HostFromGpu.grad(). It expects that the output_grads are on the host, i.e. from type T.TensorType.
    # When this is taken out of known_grads, it will fail because they are all CudaNdarrayType.
    # This should anyway be optimized all away and fully taken to the GPU in the final function.
    for k, v in known_grads.items():
      known_grads[k] = theano_cuda.host_from_gpu(v)

  all_wrt = [y_p] + custom_vars + state_vars_prev
  all_grads = T.grad(None, all_wrt, known_grads=OrderedDict(known_grads), disconnected_inputs="ignore")
  assert len(all_grads) == 1 + len(custom_vars) + len(state_vars_prev)
  Dy_p = all_grads[0]
  custom_grads = all_grads[1:len(custom_vars)+1]
  state_var_prev_grads = all_grads[len(custom_vars)+1:]

  out_Dy_p = recurrent_transform.layer.shared(value=numpy.zeros((1,1),dtype="float32"), name="out_Dy_p")
  out_custom_grads = [recurrent_transform.layer.shared(value=numpy.zeros([1] * var.ndim, dtype="float32"), name="out_D_" + var.name) for var in custom_vars]
  out_state_var_prev_grads = [recurrent_transform.layer.shared(value=numpy.zeros([1] * var.ndim, dtype="float32"), name="out_D_" + var.name) for var in state_vars_prev]

  updates = [(out_Dy_p, Dy_p)]
  updates += [(out, out + grad) for out, grad in zip(out_custom_grads, custom_grads)]  # we accumulate the custom input grads
  updates += [(out, grad) for out, grad in zip(out_state_var_prev_grads, state_var_prev_grads)]
  bwd_fun = theano.function(inputs=[y_p] + custom_vars + state_vars_prev + [Dz_re] + state_var_new_grads_list,
                            outputs=[],
                            updates=updates,
                            on_unused_input="ignore")

  # Before we can accumulate the custom input grads, we need to initialize them with 0.
  custom_reset_updates = [(out, T.zeros_like(var)) for out, var in zip(out_custom_grads, custom_vars)]
  custom_reset_fn = theano.function(inputs=custom_vars, outputs=None, updates=custom_reset_updates)

  if debug_function_hook:
    bwd_fun = debug_make_theano_function_wrapper(bwd_fun, "att_%i_bwd" % id(recurrent_transform), debug_function_hook, [])
  return bwd_fun, custom_reset_fn, out_Dy_p, out_custom_grads + out_state_var_prev_grads
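The core trick in make_bwd_fun is calling T.grad with cost=None and known_grads, so the externally supplied gradients Dz_re and the state-variable gradients are back-propagated to y_p, the custom variables, and the previous state. A tiny standalone illustration of that mechanism (plain tensors, no recurrent transform involved; the variable names are illustrative only):

import numpy
import theano
import theano.tensor as T

x = T.fvector('x')
z = 2 * x                       # some intermediate node of the graph
Dz = T.fvector('Dz')            # gradient w.r.t. z, supplied from outside
(Dx,) = T.grad(None, wrt=[x], known_grads={z: Dz}, disconnected_inputs='ignore')

f = theano.function([x, Dz], Dx)
print(f(numpy.ones(3, dtype='float32'), numpy.ones(3, dtype='float32')))  # -> [2. 2. 2.]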
Example #46
def make_bwd_fun(recurrent_transform):
  y_p = recurrent_transform.y_p
  z_re, state_updates = recurrent_transform.step(y_p)
  custom_vars = recurrent_transform.get_sorted_custom_vars()
  state_vars_prev = recurrent_transform.get_sorted_state_vars()

  Dz_re = recurrent_transform.tt.fmatrix("Dz_re")
  state_var_new_grads = {state_updates[v]: v.type("D_" + v.name) for v in state_vars_prev}
  state_var_new_grads_list = [state_var_new_grads[state_updates[k]] for k in state_vars_prev]
  known_grads = {z_re: Dz_re}
  known_grads.update(state_var_new_grads)
  if recurrent_transform.force_gpu:
    # We need the symbolic host representation.
    # See HostFromGpu.grad(): it expects the output_grads to live on the host,
    # i.e. to be of type T.TensorType. Without this conversion it would fail,
    # because the known_grads here are all of CudaNdarrayType.
    # This should all be optimized away and run fully on the GPU in the final function.
    for k, v in known_grads.items():
      known_grads[k] = theano_cuda.host_from_gpu(v)

  all_wrt = [y_p] + custom_vars + state_vars_prev
  all_grads = T.grad(None, all_wrt, known_grads=OrderedDict(known_grads), disconnected_inputs="ignore")
  assert len(all_grads) == 1 + len(custom_vars) + len(state_vars_prev)
  Dy_p = all_grads[0]
  custom_grads = all_grads[1:len(custom_vars)+1]
  state_var_prev_grads = all_grads[len(custom_vars)+1:]

  out_Dy_p = recurrent_transform.layer.shared(value=numpy.zeros((1,1),dtype="float32"), name="out_Dy_p")
  out_custom_grads = [recurrent_transform.layer.shared(value=numpy.zeros([1] * var.ndim, dtype="float32"), name="out_D_" + var.name) for var in custom_vars]
  out_state_var_prev_grads = [recurrent_transform.layer.shared(value=numpy.zeros([1] * var.ndim, dtype="float32"), name="out_D_" + var.name) for var in state_vars_prev]

  updates = [(out_Dy_p, Dy_p)]
  updates += [(out, out + grad) for out, grad in zip(out_custom_grads, custom_grads)]  # we accumulate the custom input grads
  updates += [(out, grad) for out, grad in zip(out_state_var_prev_grads, state_var_prev_grads)]
  bwd_fun = theano.function(inputs=[y_p] + custom_vars + state_vars_prev + [Dz_re] + state_var_new_grads_list,
                            outputs=[],
                            updates=updates,
                            on_unused_input="ignore")

  # Before we can accumulate the custom input grads, we need to initialize them with 0.
  custom_reset_updates = [(out, T.zeros_like(var)) for out, var in zip(out_custom_grads, custom_vars)]
  custom_reset_fn = theano.function(inputs=custom_vars, outputs=None, updates=custom_reset_updates)

  if debug_function_hook:
    bwd_fun = debug_make_theano_function_wrapper(bwd_fun, "att_%i_bwd" % id(recurrent_transform), debug_function_hook, [])
  return bwd_fun, custom_reset_fn, out_Dy_p, out_custom_grads + out_state_var_prev_grads
Ejemplo n.º 47
0
    def lmul(self, x):
        """
        dot(x, A)
        aka, do convolution with input image x

        """

        cpu = 'Cuda' not in str(type(x))

        if cpu:
            x = gpu_from_host(x)

        # x must be formatted as channel, topo dim 0, topo dim 1, batch_index
        # for use with FilterActs
        assert x.ndim == 4
        x_axes = self.input_axes
        assert len(x_axes) == 4

        op_axes = ('c', 0, 1, 'b')

        if tuple(x_axes) != op_axes:
            x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

        x = gpu_contiguous(x)

        rval = FilterActs(self.pad, self.partial_sum)(x, self._filters)

        # Format the output based on the output space
        rval_axes = self.output_axes
        assert len(rval_axes) == 4

        if tuple(rval_axes) != op_axes:
            rval = rval.dimshuffle(
                *[op_axes.index(axis) for axis in rval_axes])

        if cpu:
            rval = host_from_gpu(rval)

        return rval
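The dimshuffle pattern above is built by looking up, for each axis in cuda-convnet's required ('c', 0, 1, 'b') order, where that axis currently sits in self.input_axes. A small self-contained sketch of the same index computation, using a batch-first layout purely as an assumed example:

import numpy as np

x_axes = ('b', 'c', 0, 1)          # example input layout: batch, channel, rows, cols
op_axes = ('c', 0, 1, 'b')         # layout required by FilterActs
pattern = [x_axes.index(axis) for axis in op_axes]   # -> [1, 2, 3, 0]

x = np.zeros((5, 3, 10, 9))        # (batch, channels, rows, cols)
x_c01b = x.transpose(pattern)      # NumPy analogue of x.dimshuffle(*pattern)
assert x_c01b.shape == (3, 10, 9, 5)   # (channels, rows, cols, batch)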
Ejemplo n.º 48
0
    def lmul(self, x):
        """
        dot(x, A)
        aka, do convolution with input image x

        """

        cpu = 'Cuda' not in str(type(x))

        if cpu:
            x = gpu_from_host(x)

        # x must be formatted as channel, topo dim 0, topo dim 1, batch_index
        # for use with FilterActs
        assert x.ndim == 4
        x_axes = self.input_axes
        assert len(x_axes) == 4

        op_axes = ('c', 0, 1, 'b')

        if tuple(x_axes) != op_axes:
            x = x.dimshuffle(*[x_axes.index(axis) for axis in op_axes])

        x = gpu_contiguous(x)

        rval = FilterActs(self.pad, self.partial_sum)(x, self._filters)

        # Format the output based on the output space
        rval_axes = self.output_axes
        assert len(rval_axes) == 4

        if tuple(rval_axes) != op_axes:
            rval = rval.dimshuffle(*[op_axes.index(axis) for axis in rval_axes])

        if cpu:
            rval = host_from_gpu(rval)

        return rval
Ejemplo n.º 49
0
def test_match_full_conv_grad():

    # Tests that the gradient of ImageActs with no padding matches the
    # gradient of theano's conv2d in full mode after flipping the kernel and
    # transposing the output and input channels.

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(-1., 1., (num_filters,
                                            rows - filter_rows + 1,
                                            cols - filter_cols + 1,
                                            batch_size)
    ).astype('float32'), name='hidacts')

    filters = shared(rng.uniform(-1., 1., (channels, filter_rows,
        filter_cols, num_filters)).astype('float32'), name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    output = ImageActs()(gpu_images, gpu_filters, as_tensor_variable((6, 7)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3,0,1,2)
    filters_bc01 = filters.dimshuffle(3,0,1,2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1,2,3,0)

    theano_rng = MRG_RandomStreams(5 * 10 * 2013)

    random = theano_rng.normal(size=output_conv2d.shape, dtype=output_conv2d.dtype)

    projected = (output * random).sum()
    projected_conv_2d = (output_conv2d * random).sum()

    grads = T.grad(projected, [hid_acts, filters]) + T.grad(projected_conv_2d, [hid_acts, filters])

    f = function([], grads)

    gi, gf, gi_th, gf_th = f()

    assert gi.shape == gi_th.shape
    diff = np.abs(gi - gi_th).max()
    if diff > 2.9e-6:
        assert False

    diff = np.abs(gf - gf_th).max()
    if diff > 1e-6:
        raise AssertionError(diff)
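Rather than comparing full Jacobians, the test projects both outputs onto the same random tensor, reduces to a scalar, and differentiates that; if the two graphs compute the same function, the projected gradients must agree, so a mismatch is caught with high probability at a fraction of the cost. A minimal NumPy sketch of the same random-projection idea for a plain matrix product (the shapes are an arbitrary assumption):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(4, 3)
W = rng.randn(3, 2)
R = rng.randn(4, 2)                      # fixed random projection, like theano_rng.normal above

def projected_cost(W):
    return (R * X.dot(W)).sum()          # scalar cost, analogous to (output * random).sum()

analytic = X.T.dot(R)                    # d(projected_cost)/dW

# Finite differences on the projected scalar recover the same gradient.
eps = 1e-6
numeric = np.zeros_like(W)
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        Wp = W.copy(); Wp[i, j] += eps
        Wm = W.copy(); Wm[i, j] -= eps
        numeric[i, j] = (projected_cost(Wp) - projected_cost(Wm)) / (2 * eps)
assert np.allclose(analytic, numeric, atol=1e-4)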
Ejemplo n.º 50
0
def test_match_full_conv():

    # Tests that running ImageActs with no padding is the same as running
    # theano's conv2d in full mode after flipping the kernel and transposing
    # the output and input channels.
    # In other words, if convolution computes H = XK, we now compute
    # R = HK^T.

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(-1., 1., (num_filters,
                                            rows - filter_rows + 1,
                                            cols - filter_cols + 1,
                                            batch_size)
    ).astype('float32'), name='hidacts')

    filters = shared(rng.uniform(-1., 1., (channels, filter_rows,
        filter_cols, num_filters)).astype('float32'), name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    output = ImageActs()(gpu_images, gpu_filters, as_tensor_variable((6, 7)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3,0,1,2)
    filters_bc01 = filters.dimshuffle(3,0,1,2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1,2,3,0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn("""test_match_full_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others.""")
    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ', (output_conv2d.min(), output_conv2d.max()))
        assert False
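The comment "if convolution computes H=XK, we now compute R=HK^T" describes the transpose (adjoint) relationship that ImageActs implements. In one dimension the adjoint of a valid cross-correlation is a full convolution with the same kernel, which is also why the test hands conv2d(border_mode='full') an unflipped kernel (conv2d flips internally). A 1-D NumPy sketch of that adjoint identity, independent of the GPU code:

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(7)                                 # "image"
k = rng.randn(3)                                 # "filter"

# Forward pass (FilterActs-style): valid cross-correlation, H = X K.
h = np.correlate(x, k, mode='valid')             # length 5

# Transpose pass (ImageActs-style): R = H K^T.  The adjoint of a valid
# cross-correlation is a *full* convolution with the same, unflipped kernel.
H = rng.randn(h.shape[0])                        # arbitrary gradient at the output
lhs = np.dot(H, np.correlate(x, k, mode='valid'))
rhs = np.dot(np.convolve(H, k, mode='full'), x)  # <H, Cx> == <C^T H, x>
assert np.allclose(lhs, rhs)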
Ejemplo n.º 51
0
def test_match_valid_conv():

    # Tests that running FilterActs with no padding is the same as running
    # theano's conv2D in valid mode

    rng = np.random.RandomState([2012, 10, 9])

    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    try:
        f = function([], [output, output_conv2d])
    except Exception:
        raise KnownFailureTest(
            "cuda-convnet code depends on an unmerged theano feature.")

    output, output_conv2d = f()

    warnings.warn(
        "test_match_valid_conv success criterion is not very strict. Can we verify that this is OK?"
    )
    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ', (output_conv2d.min(),
                                       output_conv2d.max()))
        assert False
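The explicit kernel flip above (filters_bc01[:, :, ::-1, ::-1]) compensates for the fact that FilterActs computes cross-correlation while theano's conv2d computes true convolution, i.e. it flips the kernel internally. A 1-D NumPy sketch of the same equivalence:

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(10)
k = rng.randn(4)

# Cross-correlation equals convolution with a hand-flipped kernel, so flipping
# the filters before calling conv2d makes the two operators agree.
corr = np.correlate(x, k, mode='valid')
conv_flipped = np.convolve(x, k[::-1], mode='valid')
assert np.allclose(corr, conv_flipped)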
Ejemplo n.º 52
0
def test_match_full_conv_grad():

    # Tests that the gradient of ImageActs with no padding matches the
    # gradient of theano's conv2d in full mode after flipping the kernel and
    # transposing the output and input channels.

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(
        -1., 1., (num_filters, rows - filter_rows + 1, cols - filter_cols + 1,
                  batch_size)).astype('float32'),
                      name='hidacts')

    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    output = ImageActs()(gpu_images, gpu_filters, as_tensor_variable((6, 7)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    theano_rng = MRG_RandomStreams(5 * 10 * 2013)

    random = theano_rng.normal(size=output_conv2d.shape,
                               dtype=output_conv2d.dtype)

    projected = (output * random).sum()
    projected_conv_2d = (output_conv2d * random).sum()

    grads = T.grad(projected, [hid_acts, filters]) + T.grad(
        projected_conv_2d, [hid_acts, filters])

    f = function([], grads)

    gi, gf, gi_th, gf_th = f()

    assert gi.shape == gi_th.shape
    diff = np.abs(gi - gi_th).max()
    if diff > 2.9e-6:
        assert False

    diff = np.abs(gf - gf_th).max()
    if diff > 1e-6:
        raise AssertionError(diff)
Ejemplo n.º 53
0
def test_match_full_conv():

    # Tests that running ImageActs with no padding is the same as running
    # theano's conv2d in full mode after flipping the kernel and transposing
    # the output and input channels.
    # In other words, if convolution computes H = XK, we now compute
    # R = HK^T.

    rng = np.random.RandomState([2013, 1, 29])

    batch_size = 2
    rows = 6
    cols = 7
    channels = 3
    filter_rows = 5
    filter_cols = filter_rows
    num_filters = 16

    hid_acts = shared(rng.uniform(
        -1., 1., (num_filters, rows - filter_rows + 1, cols - filter_cols + 1,
                  batch_size)).astype('float32'),
                      name='hidacts')

    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(hid_acts)
    gpu_filters = gpu_from_host(filters)

    output = ImageActs()(gpu_images, gpu_filters, as_tensor_variable((6, 7)))
    output = host_from_gpu(output)

    images_bc01 = hid_acts.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    # need to transpose the kernel stack to do imgActs rather than filterActs
    filters_bc01 = filters_bc01.dimshuffle(1, 0, 2, 3)
    # In order to do the transpose operation, we must flip the kernels
    # But in theano's conv2d, the kernels get flipped anyway
    # so in this case, we do not flip the kernel

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='full')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    f = function([], [output, output_conv2d])

    output, output_conv2d = f()

    warnings.warn(
        """test_match_full_conv success criterion is not very strict. Can we verify that this is OK?
                     One possibility is that theano is numerically unstable and Alex's code is better.
                     Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )
    if np.abs(output - output_conv2d).max() > 2.4e-6:
        assert type(output) == type(output_conv2d)
        assert output.dtype == output_conv2d.dtype
        if output.shape != output_conv2d.shape:
            print('cuda-convnet shape: ', output.shape)
            print('theano shape: ', output_conv2d.shape)
            assert False
        err = np.abs(output - output_conv2d)
        print('absolute error range: ', (err.min(), err.max()))
        print('mean absolute error: ', err.mean())
        print('cuda-convnet value range: ', (output.min(), output.max()))
        print('theano value range: ', (output_conv2d.min(),
                                       output_conv2d.max()))
        assert False
Ejemplo n.º 54
0
def test_match_grad_valid_conv():

    # Tests that weightActs is the gradient of FilterActs
    # with respect to the weights.

    for partial_sum in [0, 1, 4]:
        rng = np.random.RandomState([2012, 10, 9])

        batch_size = 3
        rows = 7
        cols = 9
        channels = 8
        filter_rows = 4
        filter_cols = filter_rows
        num_filters = 16

        images = shared(rng.uniform(-1., 1., (channels, rows, cols,
                                              batch_size)).astype('float32'),
                        name='images')
        filters = rng.uniform(-1., 1.,
                              (channels, filter_rows,
                               filter_cols, num_filters)).astype('float32')
        filters = shared(filters, name='filters')

        gpu_images = gpu_from_host(images)
        gpu_filters = gpu_from_host(filters)

        output = FilterActs(partial_sum=partial_sum)(gpu_images, gpu_filters)
        output = host_from_gpu(output)

        images_bc01 = images.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
        filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

        output_conv2d = conv2d(images_bc01, filters_bc01,
                               border_mode='valid')

        output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

        theano_rng = MRG_RandomStreams(2013 + 1 + 31)

        coeffs = theano_rng.normal(avg=0., std=1.,
                                   size=output_conv2d.shape, dtype='float32')

        cost_conv2d = (coeffs * output_conv2d).sum()

        weights_grad_conv2d = T.grad(cost_conv2d, filters)

        cost = (coeffs * output).sum()
        hid_acts_grad = T.grad(cost, output)

        weights_grad = WeightActs(partial_sum=partial_sum)(
            gpu_images,
            gpu_from_host(hid_acts_grad),
            as_tensor_variable((4, 4))
        )[0]
        weights_grad = host_from_gpu(weights_grad)

        f = function([], [output, output_conv2d, weights_grad,
                          weights_grad_conv2d])

        output, output_conv2d, weights_grad, weights_grad_conv2d = f()

        if np.abs(output - output_conv2d).max() > 8e-6:
            assert type(output) == type(output_conv2d)
            assert output.dtype == output_conv2d.dtype
            if output.shape != output_conv2d.shape:
                print('cuda-convnet shape: ', output.shape)
                print('theano shape: ', output_conv2d.shape)
                assert False
            err = np.abs(output - output_conv2d)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ', (output.min(), output.max()))
            print('theano value range: ', (output_conv2d.min(),
                                           output_conv2d.max()))
            assert False

        warnings.warn(
            "test_match_grad_valid_conv success criterion is not very strict."
            " Can we verify that this is OK? One possibility is that theano"
            " is numerically unstable and Alex's code is better. Probably"
            " theano CPU 64 bit is OK but it's worth checking the others.")

        if np.abs(weights_grad - weights_grad_conv2d).max() > 8.6e-6:
            if type(weights_grad) != type(weights_grad_conv2d):
                raise AssertionError("weights_grad is of type " +
                                     str(weights_grad))
            assert weights_grad.dtype == weights_grad_conv2d.dtype
            if weights_grad.shape != weights_grad_conv2d.shape:
                print('cuda-convnet shape: ', weights_grad.shape)
                print('theano shape: ', weights_grad_conv2d.shape)
                assert False
            err = np.abs(weights_grad - weights_grad_conv2d)
            print('absolute error range: ', (err.min(), err.max()))
            print('mean absolute error: ', err.mean())
            print('cuda-convnet value range: ', (weights_grad.min(),
                                                 weights_grad.max()))
            print('theano value range: ', (weights_grad_conv2d.min(),
                                           weights_grad_conv2d.max()))
            assert False
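The identity being exercised here is that the gradient of a valid cross-correlation cost with respect to the weights is itself a valid cross-correlation of the input with the gradient at the output, which is the computation WeightActs performs on the GPU. A 1-D NumPy sketch of that identity (shapes chosen arbitrarily):

import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(9)                   # input signal
g = rng.randn(6)                   # gradient at the conv output (length 9 - 4 + 1)
m = 4                              # filter length

# Writing the chain rule out by hand: dCost/dk[j] = sum_i g[i] * x[i + j] ...
explicit = np.array([sum(g[i] * x[i + j] for i in range(len(g)))
                     for j in range(m)])

# ... which is itself a valid cross-correlation of the input with the output
# gradient -- the role WeightActs plays for FilterActs in the test above.
assert np.allclose(explicit, np.correlate(x, g, mode='valid'))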
Ejemplo n.º 55
0
def test_grad():
    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 10
    cols = 9
    channels = 3
    filter_rows = 4
    filter_cols = filter_rows
    num_filters = 16

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs()(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    # Proper random projection, like verify_grad does.
    cost_weights = rng.normal(size=(num_filters, rows - filter_rows + 1,
                                    cols - filter_cols + 1, batch_size))
    cost = (constant(cost_weights) * output).sum()

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01, filters_bc01, border_mode='valid')

    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)
    # XXX: use verify_grad
    images_grad, filters_grad = grad(cost.sum(), [images, filters])
    reference_cost = (constant(cost_weights) * output_conv2d).sum()
    images_conv2d_grad, filters_conv2d_grad = grad(reference_cost,
                                                   [images, filters])
    f = function(
        [],
        [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad])

    images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f()

    warnings.warn(
        """test_grad success criterion is not very strict. Can we verify that this is OK?
        One possibility is that theano is numerically unstable and Alex's code is better.
        Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )
    # XXX: Refactor
    if np.abs(images_grad - images_conv2d_grad).max() > 1.15e-5:
        print "=== IMAGES GRADIENT ==="
        assert type(images_grad) == type(images_conv2d_grad)
        assert images_grad.dtype == images_conv2d_grad.dtype
        if images_grad.shape != images_conv2d_grad.shape:
            print 'cuda-convnet shape: ', images_grad.shape
            print 'theano shape: ', images_conv2d_grad.shape
            assert False
        err = np.abs(images_grad - images_conv2d_grad)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (images_grad.min(),
                                             images_grad.max())
        print 'theano value range: ', (images_conv2d_grad.min(),
                                       images_conv2d_grad.max())
        assert False
    if np.abs(filters_grad - filters_conv2d_grad).max() > 1.15e-5:
        print "=== FILTERS GRADIENT ==="
        assert type(filters_grad) == type(filters_conv2d_grad)
        assert filters_grad.dtype == filters_conv2d_grad.dtype
        if filters_grad.shape != filters_conv2d_grad.shape:
            print 'cuda-convnet shape: ', filters_grad.shape
            print 'theano shape: ', filters_conv2d_grad.shape
            assert False
        err = np.abs(filters_grad - filters_conv2d_grad)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (filters_grad.min(),
                                             filters_grad.max())
        print 'theano value range: ', (filters_conv2d_grad.min(),
                                       filters_conv2d_grad.max())
        assert False
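The "# XXX: use verify_grad" notes refer to Theano's built-in finite-difference checker, theano.gradient.verify_grad, which builds the random projection and numeric comparison itself. A hedged sketch of what such a replacement could look like; it assumes FilterActs, gpu_from_host and host_from_gpu are imported as in the tests above, and the default tolerances may need loosening for float32:

import numpy as np
from theano.gradient import verify_grad

# Hedged sketch, not taken from the test suite: wrap the op in a plain callable
# on host tensors and let verify_grad do the finite-difference comparison.
def filter_acts_c01b(images, filters):
    return host_from_gpu(FilterActs()(gpu_from_host(images),
                                      gpu_from_host(filters)))

rng = np.random.RandomState([2012, 10, 9])
images_val = rng.uniform(-1., 1., (3, 10, 9, 5)).astype('float32')
filters_val = rng.uniform(-1., 1., (3, 4, 4, 16)).astype('float32')
verify_grad(filter_acts_c01b, [images_val, filters_val], rng=rng)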
Ejemplo n.º 56
0
def test_grad_strided():
    rng = np.random.RandomState([2012, 10, 9])
    batch_size = 5
    rows = 9
    cols = 9
    channels = 3
    filter_rows = 3
    filter_cols = filter_rows
    num_filters = 16
    stride = 3

    images = shared(rng.uniform(
        -1., 1., (channels, rows, cols, batch_size)).astype('float32'),
                    name='images')
    filters = shared(rng.uniform(
        -1., 1.,
        (channels, filter_rows, filter_cols, num_filters)).astype('float32'),
                     name='filters')

    gpu_images = gpu_from_host(images)
    gpu_filters = gpu_from_host(filters)

    output = FilterActs(stride=stride)(gpu_images, gpu_filters)
    output = host_from_gpu(output)

    images_bc01 = images.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters.dimshuffle(3, 0, 1, 2)
    filters_bc01 = filters_bc01[:, :, ::-1, ::-1]

    output_conv2d = conv2d(images_bc01,
                           filters_bc01,
                           border_mode='valid',
                           subsample=(stride, stride))
    output_conv2d = output_conv2d.dimshuffle(1, 2, 3, 0)

    checker = function([], [output, output_conv2d])
    output_numpy, output_conv2d_numpy = checker()
    if output_numpy.shape != output_conv2d_numpy.shape:
        raise AssertionError(
            "theano and cuda convnet follow different conventions for this input size, so we can't test cuda convnet by matching it against theano for these inputs"
        )

    # Proper random projection, like verify_grad does.
    theano_rng = MRG_RandomStreams(2013 * 5 * 4)
    cost_weights = theano_rng.normal(size=output_conv2d.shape,
                                     dtype=output_conv2d.dtype)
    cost = (cost_weights * output).sum()

    # XXX: use verify_grad
    images_grad, filters_grad = grad(cost, [images, filters])
    reference_cost = (cost_weights * output_conv2d).sum()
    images_conv2d_grad, filters_conv2d_grad = grad(reference_cost,
                                                   [images, filters])

    f = function(
        [],
        [images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad])

    images_grad, filters_grad, images_conv2d_grad, filters_conv2d_grad = f()

    warnings.warn(
        """test_grad_strided success criterion is not very strict. Can we verify that this is OK?
        One possibility is that theano is numerically unstable and Alex's code is better.
        Probably theano CPU 64 bit is OK but it's worth checking the others."""
    )
    # XXX: Refactor
    if np.abs(images_grad - images_conv2d_grad).max() > 1.15e-5:
        print "=== IMAGES GRADIENT ==="
        assert type(images_grad) == type(images_conv2d_grad)
        assert images_grad.dtype == images_conv2d_grad.dtype
        if images_grad.shape != images_conv2d_grad.shape:
            print 'cuda-convnet shape: ', images_grad.shape
            print 'theano shape: ', images_conv2d_grad.shape
            assert False
        err = np.abs(images_grad - images_conv2d_grad)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (images_grad.min(),
                                             images_grad.max())
        print 'theano value range: ', (images_conv2d_grad.min(),
                                       images_conv2d_grad.max())
        assert False
    if np.abs(filters_grad - filters_conv2d_grad).max() > 1e-5:
        print "=== FILTERS GRADIENT ==="
        assert type(filters_grad) == type(filters_conv2d_grad)
        assert filters_grad.dtype == filters_conv2d_grad.dtype
        if filters_grad.shape != filters_conv2d_grad.shape:
            print 'cuda-convnet shape: ', filters_grad.shape
            print 'theano shape: ', filters_conv2d_grad.shape
            assert False
        err = np.abs(filters_grad - filters_conv2d_grad)
        print 'absolute error range: ', (err.min(), err.max())
        print 'mean absolute error: ', err.mean()
        print 'cuda-convnet value range: ', (filters_grad.min(),
                                             filters_grad.max())
        print 'theano value range: ', (filters_conv2d_grad.min(),
                                       filters_conv2d_grad.max())
        assert False
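The shape check near the top of test_grad_strided exists because, when (rows - filter_rows) is not a multiple of the stride, cuda-convnet and theano can disagree on how many output positions to produce, and the comparison is skipped in that case. A small sketch of the subsampled-valid output size theano's convention implies; the formula is stated as an assumption about that convention, not taken from either library's source:

def valid_strided_out_size(rows, filter_rows, stride):
    # theano's conv2d(subsample=...) computes the full valid convolution and
    # then keeps every stride-th position, giving this many output rows.
    return (rows - filter_rows) // stride + 1

# For the test above (rows=9, filter_rows=3, stride=3) this gives 3 output
# rows, the configuration for which both libraries agree and the gradients
# can be compared.
assert valid_strided_out_size(9, 3, 3) == 3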