Example #1
def local_gpu_multinomial(node):
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and any([
                i.owner
                and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs
        ])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [
                host_from_gpu(gpu_op(*[gpu_from_host(i)
                                       for i in node.inputs])).T
            ]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
            and node.inputs[0].owner
            and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in multi.inputs]).T
            # The dimshuffle is on the cpu, but will be moved to the gpu by an opt.
            return [gpu_from_host(ret)]
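These local rewrites only take effect once they are registered with Theano's GPU optimization database; the registration is not shown in the snippet above. Below is a minimal sketch of the usual pattern in the old theano.sandbox.cuda backend; the tracked op list and the wrapper name are assumptions, not the original project's code.

# Sketch only: typical registration for a local GPU rewrite in the old CUDA backend.
from theano.gof import local_optimizer
from theano.sandbox.cuda.opt import register_opt

@register_opt()                              # insert into the GPU optimization database
@local_optimizer([MultinomialFromUniform])   # only visit nodes applying this op
def local_gpu_multinomial_sketch(node):
    # Delegate to the rewrite defined above; returning None leaves the node unchanged.
    return local_gpu_multinomial(node)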
Example #2
    def parse_args(self, bottom, top):
        function_str = self.pythonargs[0]
        top_shape = self.pythonargs[1]

        old_function_str = self.function_str
        old_top_shape = self.top_shape
        self.function_str = function_str
        self.top_shape = top_shape
        if function_str != old_function_str or len(top_shape) != len(
                old_top_shape):
            if old_function_str != '':
                print(
                    'TheanoGPU function string different from cache: recompiling'
                )
            import theano.tensor as T
            import theano
            from theano.sandbox.cuda.basic_ops import gpu_from_host
            x = []
            for i in range(len(bottom)):
                if len(bottom[i].shape) == 1:
                    x.append(T.vector('x%d' % i))
                if len(bottom[i].shape) == 2:
                    x.append(T.matrix('x%d' % i))
                if len(bottom[i].shape) == 3:
                    x.append(T.tensor3('x%d' % i))
                if len(bottom[i].shape) == 4:
                    x.append(T.tensor4('x%d' % i))

            y = eval(function_str)
            self.f = theano.function(x,
                                     gpu_from_host(y),
                                     on_unused_input='ignore')

            if len(self.top_shape) == 1:
                v = T.vector('v')
            elif len(self.top_shape) == 2:
                v = T.matrix('v')
            elif len(self.top_shape) == 3:
                v = T.tensor3('v')
            elif len(self.top_shape) == 4:
                v = T.tensor4('v')
            self.b = []
            for i in range(len(bottom)):
                yg = T.Lop(y, x[i], v)
                self.b.append(
                    theano.function(x + [v],
                                    gpu_from_host(yg),
                                    on_unused_input='ignore'))
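The backward functions collected in self.b are built with T.Lop, which returns the product of a vector v with the Jacobian of y with respect to x — exactly the top-diff-times-Jacobian quantity a backward pass needs. A minimal, self-contained illustration (the variable names here are purely illustrative):

# Minimal T.Lop illustration: for elementwise y = x**2 the Jacobian is diag(2*x),
# so Lop(y, x, v) evaluates to 2 * x * v.
import theano
import theano.tensor as T

x = T.vector('x')
v = T.vector('v')          # plays the role of the incoming top diff
y = x ** 2
g = T.Lop(y, x, v)

f = theano.function([x, v], g)
print(f([1.0, 2.0, 3.0], [1.0, 1.0, 1.0]))   # -> [2. 4. 6.]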
Example #3
def use_gpu_cumsum(node):
    if type(node.op) is CumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

        if node.op.mode != 'add':
            return None

        axis = node.op.axis
        x = node.inputs[0]

        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            return None

        x = gpu_from_host(x)

        if axis is None and x.ndim > 1:
            x = gpu_flatten(x)

        # ``gpu_cumsum`` assumes the array has already been flattened if needed.
        if axis is None:
            axis = 0

        ret = host_from_gpu(GpuCumsum(axis)(x))
        ret.tag.values_eq_approx = values_eq_approx_high_tol
        return [ret]
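The flatten-then-axis-0 handling mirrors NumPy's cumsum semantics, which CumOp follows: with axis=None the input is treated as a flattened 1-D array. A tiny NumPy check of that equivalence:

# Plain NumPy sanity check of the axis=None convention used above.
import numpy as np

a = np.arange(6, dtype='float32').reshape(2, 3)
assert np.array_equal(np.cumsum(a), np.cumsum(a.ravel(), axis=0))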
Example #4
 def local_gpu_minres(node):
     if isinstance(node.op, MinresQLP):
         sw = False
         for inp in node.inputs:
             if inp.owner and inp.owner.op == host_from_gpu:
                 sw = True
         if sw:
             inps = node.inputs
             nw_inps = []
             for inp in inps:
                 if not isinstance(inp.type, CudaNdarrayType):
                     nw_inps.append(gpu_from_host(inp))
                 else:
                     nw_inps.append(inp)
             new_op = node.op
             new_op.gpu = 1
             _new_outs = node.op(*nw_inps)
             new_outs = []
             for out in _new_outs:
                 if isinstance(out.type, CudaNdarrayType):
                     new_outs.append(host_from_gpu(out))
                 else:
                     new_outs.append(out)
             return new_outs
         else:
             return False
Example #5
def ctc_cost(acts, labels, input_lengths=None):
    """
  Given sequences of output layer activations and labels, compute the softmax output at each timestep,
  and then compute the CTC cost of each sequence with respect to its corresponding label sequence.

  :param acts: Tensor of pre-softmax activations, with shape=[maxInputSeqLen, batchSize, targetN],
      where
      maxInputSeqLen >= the length of the longest input sequence.
      batchSize is the number of sequences being simultaneously computed / trained.
      targetN is the number of network outputs (<blank> is always target 0).

  :param labels: Matrix of training labels, with shape=[batchSize, maxOutputSeqLen]. 
      Since <blank> is always output 0, labels should be > 0 (targets) or negative (ignored). 
      maxOutputSeqLen >= the length of the longest target sequence (excluding <blank>s, 
      which CTC alignment adds). Label values < 0 at any location are ignored, 
      so [1], [-1, 1, -1], and [-1, -1, 1] are treated the same.

  :param input_lengths: Vector of input sequence lengths, with shape=[batchSize].
      For sequence s (0 <= s < batchSize), CTC is calculated on acts[0:input_lengths[s], s, :].
      If input_lengths is None, then all sequences in the batch are assumed to have length maxInputSeqLen.

  :return: Vector of CTC costs, with shape=[batchSize]
  """
    # This should be properly integrated into the theano optimization catalog.
    # Until then, this forces the choice based on device configuration.
    if theano.config.device.startswith(
            "gpu") or theano.sandbox.cuda.cuda_enabled:
        if not isinstance(acts.type,
                          CudaNdarrayType):  # if not already on the device
            acts = gpu_from_host(acts)  # this should get optimized away
        return GpuCtc()(acts, labels, input_lengths)
    else:
        return CpuCtc()(acts, labels, input_lengths)
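A hedged usage sketch following the shape conventions in the docstring above; the symbolic variable names and dtypes below are assumptions for illustration, not part of the original module.

# Sketch only: build a per-sequence CTC cost following the docstring's shapes.
import theano.tensor as T

acts = T.tensor3('acts')                     # [maxInputSeqLen, batchSize, targetN], pre-softmax
labels = T.imatrix('labels')                 # [batchSize, maxOutputSeqLen]; <blank> is 0, negatives ignored
input_lengths = T.ivector('input_lengths')   # [batchSize]

costs = ctc_cost(acts, labels, input_lengths)   # vector of shape [batchSize]
mean_cost = costs.mean()                        # a typical scalar training objective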
Example #6
def use_gpu_images2neibs(node):
    if type(node.op) is Images2Neibs:
        return [
            host_from_gpu(
                gpu_images2neibs(gpu_from_host(node.inputs[0]), node.inputs[1], node.inputs[2], mode=node.op.mode)
            )
        ]
Example #7
 def local_gpu_minres(node):
     if isinstance(node.op, MinresQLP):
         sw = False
         for inp in node.inputs:
             if inp.owner and inp.owner.op == host_from_gpu:
                 sw = True
         if sw:
             inps = node.inputs
             nw_inps = []
             for inp in inps:
                 if not isinstance(inp.type, CudaNdarrayType):
                     nw_inps.append(gpu_from_host(inp))
                 else:
                     nw_inps.append(inp)
             new_op = node.op
             new_op.gpu = 1
             _new_outs = node.op(*nw_inps)
             new_outs = []
             for out in _new_outs:
                 if isinstance(out.type, CudaNdarrayType):
                     new_outs.append(host_from_gpu(out))
                 else:
                     new_outs.append(out)
             return new_outs
         else:
             return False
Example #8
def use_gpu_images2neibs(node):
    if (type(node.op) is Images2Neibs and
        node.inputs[0].dtype == 'float32' and
        node.op.mode in ['valid', 'wrap_centered']):
        return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                               node.inputs[1], node.inputs[2],
                                               mode=node.op.mode))]
Example #9
    def __init__(self, **kwargs):
        
        self.num_layers = kwargs.get('num_layers', None)
        self.num_filters = kwargs.get('num_filters', None)
        self.filter_size = kwargs.get('filter_size', None)
        
        self.rng = kwargs.get('rng', np.random.RandomState(42))
        self.load_folder = kwargs.get('weights_folder', None)
        self.activation = kwargs.get('activation', 'relu')
        self.cost_func = kwargs.get('cost_func', 'MSE')  
        
        #Initialize (or load) the weights for the network
        if(self.load_folder == None):
            try:
                assert (self.num_layers != None) and (self.num_filters != None) and (self.filter_size != None)
                self.__define_network()
                self.__init_weights()
            except:
                print "ERROR: Insufficient parameters for generating new network"
                sys.exit(0)
        else:
            self.__load_weights()

        #Input and Target variables for symbolic representation of network
        self.X = T.tensor4('X')            
        
        #Create the network model
        self.__model()
        
        if(theano.config.device == 'cpu'):
            #Create a predicter based on this network model
            self.forward = theano.function(inputs=[self.X], outputs=self.out, allow_input_downcast=True)
        else:
            #Create a predicter based on this network model
            self.forward = theano.function(inputs=[self.X], outputs=Out(gpu_from_host(self.out), borrow=True), allow_input_downcast=True)
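A usage sketch of the compiled predictor. The class name and constructor arguments are hypothetical, since only the constructor excerpt is shown; on the GPU branch the function returns a CudaNdarray (the output is wrapped in Out(gpu_from_host(...), borrow=True)), so np.asarray is used to copy the result back to the host.

# Hypothetical usage: class name and constructor arguments are assumptions.
import numpy as np
import theano

net = ConvNetwork(num_layers=3, num_filters=[32, 32, 1], filter_size=[5, 5, 5])
batch = np.random.rand(8, 1, 64, 64).astype(theano.config.floatX)  # 4D input for T.tensor4('X')
pred = np.asarray(net.forward(batch))   # np.asarray copies a GPU result back to the host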
Example #10
def use_gpu_images2neibs(node):
    if (type(node.op) is Images2Neibs and
        node.inputs[0].dtype == 'float32' and
        node.op.mode in ['valid', 'ignore_borders',
                         'wrap_centered']):
        return [host_from_gpu(gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                               node.inputs[1], node.inputs[2],
                                               mode=node.op.mode))]
Example #11
def ctc_cost(acts, input_lengths, flat_labels, label_lengths):
  # This should be properly integrated into the theano optimization catalog.
  # Until then, this forces the choice based on device configuration.
  if theano.config.device.startswith("gpu") or theano.sandbox.cuda.cuda_enabled:
    if not isinstance(acts.type, CudaNdarrayType): # if not already on the device
      acts = gpu_from_host(acts)  # this should get optimized away
    return gpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)
  else:
    return cpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)
Example #12
def use_gpu_images2neibs(node):
    if type(node.op) is Images2Neibs:
        return [
            host_from_gpu(
                gpu_images2neibs(gpu_from_host(node.inputs[0]),
                                 node.inputs[1],
                                 node.inputs[2],
                                 mode=node.op.mode))
        ]
Example #13
def local_gpu_multinomial(node):
    # TODO : need description for function
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and any([
                i.owner
                and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                for i in node.inputs
        ])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [
                host_from_gpu(gpu_op(*[gpu_from_host(i) for i in [p, u]])).T
            ]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost)
            and node.inputs[0].owner
            and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
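The try/except around get_scalar_constant_value is what restricts this rewrite to the n_samples == 1 case: the helper only succeeds when the symbolic value can be resolved to a compile-time constant and raises NotScalarConstantError otherwise. A small illustration of that guard (importing from theano.tensor is an assumption about the original module's imports):

# Illustration of the constant-folding guard used above.
import theano.tensor as T
from theano.tensor import get_scalar_constant_value, NotScalarConstantError

assert get_scalar_constant_value(T.constant(1)) == 1   # constant: the rewrite may proceed

try:
    get_scalar_constant_value(T.iscalar('n'))          # purely symbolic: cannot be resolved
except NotScalarConstantError:
    pass                                               # the optimizer returns None in this case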
Example #14
def local_gpu_multinomial(node):
    if type(node.op) is MultinomialFromUniform:
        p, u = node.inputs
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i) for i in node.inputs])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
        node.inputs[0].owner and type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        p, u = multi.inputs
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in multi.inputs]).T
            # The dimshuffle is on the cpu, but will be moved to the gpu by an opt.
            return [gpu_from_host(ret)]
Example #15
def local_gpu_argmax(node):
    if type(node.op) is KArgmax:
        p, = node.inputs
        vals, indx, = node.outputs
        if (p.dtype == vals.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu) for i in node.inputs])):
            gpu_op = GpuKArgmax(node.op.K)
            ret_vals, ret_indx = gpu_op(gpu_from_host(p))
            return [host_from_gpu(ret_vals), T.cast(host_from_gpu(ret_indx), "int32")]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
        node.inputs[0].owner and type(node.inputs[0].owner.op)
        is KArgmax):
        multi = node.inputs[0].owner
        p, = multi.inputs
        vals, indx, = multi.outputs
        if (p.dtype == vals.dtype == 'float32'):
            gpu_op = GpuKArgmax(node.inputs[0].owner.op.K)
            ret_vals, ret_indx = gpu_op(gpu_from_host(p)) 
            return [gpu_from_host(ret_vals), gpu_from_host(ret_indx)]
Example #16
def local_assigner(node):
    if type(node.op) is Assigner:
        p, indx, gr, = node.inputs
        vals, = node.outputs
        if (p.dtype == vals.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op, theano.sandbox.cuda.HostFromGpu) for i in node.inputs])):
            gpu_op = GpuAssigner()
            ret = gpu_op(gpu_from_host(p),indx,gpu_from_host(gr))
            return [host_from_gpu(ret),]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
        node.inputs[0].owner and type(node.inputs[0].owner.op)
        is Assigner):
        multi = node.inputs[0].owner
        p,indx,gr = multi.inputs
        vals, = multi.outputs
        if (p.dtype == vals.dtype == 'float32'):
            gpu_op = GpuAssigner()
            ret_vals = gpu_op(gpu_from_host(p),indx,gpu_from_host(gr)) 
            return [gpu_from_host(ret_vals)]
Example #17
def local_gpu_multinomial(node):
    # TODO : need description for function
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
            any([i.owner and isinstance(i.owner.op,
                                        theano.sandbox.cuda.HostFromGpu)
                 for i in node.inputs])):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]
    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32'):
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
Example #18
    def parse_args(self, bottom, top):
        function_str = self.pythonargs[0]
        top_shape = self.pythonargs[1]

        if self.function_str != function_str or self.top_shape != top_shape:
            self.function_str = function_str
            self.top_shape = top_shape

            import theano.tensor as T
            import theano
            from theano.sandbox.cuda.basic_ops import gpu_from_host
            x = []
            for i in range(len(bottom)):
                if len(bottom[i].shape) == 1:
                    x.append(T.vector('x%d' % i))
                if len(bottom[i].shape) == 2:
                    x.append(T.matrix('x%d' % i))
                if len(bottom[i].shape) == 3:
                    x.append(T.tensor3('x%d' % i))
                if len(bottom[i].shape) == 4:
                    x.append(T.tensor4('x%d' % i))

            y = eval(function_str)
            self.f = theano.function(x,
                                     gpu_from_host(y),
                                     on_unused_input='ignore')

            if len(self.top_shape) == 1:
                v = T.vector('v')
            elif len(self.top_shape) == 2:
                v = T.matrix('v')
            elif len(self.top_shape) == 3:
                v = T.tensor3('v')
            elif len(self.top_shape) == 4:
                v = T.tensor4('v')
            self.b = []
            for i in range(len(bottom)):
                yg = T.Lop(y, x[i], v)
                self.b.append(
                    theano.function(x + [v],
                                    gpu_from_host(yg),
                                    on_unused_input='ignore'))
Example #19
def ctc_cost(acts, input_lengths, flat_labels, label_lengths):
    # This should be properly integrated into the theano optimization catalog.
    # Until then, this forces the choice based on device configuration.
    if theano.config.device.startswith(
            "gpu") or theano.sandbox.cuda.cuda_enabled:
        if not isinstance(acts.type,
                          CudaNdarrayType):  # if not already on the device
            acts = gpu_from_host(acts)  # this should get optimized away
        return gpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)
    else:
        return cpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)
Example #20
def use_gpu_images2neibs(node):
    if (
        type(node.op) is Images2Neibs
        and node.inputs[0].dtype == "float32"
        and node.op.mode in ["valid", "ignore_borders", "wrap_centered"]
    ):
        return [
            host_from_gpu(
                gpu_images2neibs(gpu_from_host(node.inputs[0]), node.inputs[1], node.inputs[2], mode=node.op.mode)
            )
        ]
Example #21
    def parse_args(self, bottom, top):
        function_str = self.pythonargs[0]
        top_shape = self.pythonargs[1]

        old_function_str = self.function_str
        old_top_shape = self.top_shape
        self.function_str = function_str
        self.top_shape = top_shape
        if function_str != old_function_str or len(top_shape) != len(old_top_shape):
            if old_function_str != '':
                print('TheanoGPU function string different from cache: recompiling')
            import theano.tensor as T
            import theano
            from theano.sandbox.cuda.basic_ops import gpu_from_host
            x = []
            for i in range(len(bottom)):
                if len(bottom[i].shape) == 1:
                    x.append(T.vector('x%d' % i))
                if len(bottom[i].shape) == 2:
                    x.append(T.matrix('x%d' % i))
                if len(bottom[i].shape) == 3:
                    x.append(T.tensor3('x%d' % i))
                if len(bottom[i].shape) == 4:
                    x.append(T.tensor4('x%d' % i))

            y = eval(function_str)
            self.f = theano.function(x, gpu_from_host(y), on_unused_input='ignore')

            if len(self.top_shape) == 1:
                v = T.vector('v')
            elif len(self.top_shape) == 2:
                v = T.matrix('v')
            elif len(self.top_shape) == 3:
                v = T.tensor3('v')
            elif len(self.top_shape) == 4:
                v = T.tensor4('v')
            self.b = []
            for i in range(len(bottom)):
                yg = T.Lop(y, x[i], v)
                self.b.append(theano.function(x + [v], gpu_from_host(yg), on_unused_input='ignore'))
Example #22
def compileModel(data, nInputs, nOutputs, hiddenLayersSize = [1200, 1200], dropoutRates = [0.2, 0.5, 0.5],
                  activation = 'relu', weightInitMode = 'normal', regularizer = 0.0001):
    """
    Creates a symbolic model given the specified parameters using Theano
    
    Output:
    A list containing the three compiled Theano functions: training, validation, and test
    """
    
    
    np.random.seed(815)
    
    x = T.matrix('x')
    y = T.wvector('y')
    learningRate = T.scalar('learningRate')
    regularization = T.scalar('regularization')
    
    #Data sets
    train_x, train_y = data[0]
    valid_x, valid_y = data[1]
    test_x, test_y = data[2]
    
    nnet = MLP(x, nInputs, hiddenLayersSize, nOutputs, dropoutRates = dropoutRates,
                activation = activation, weightInitMode = weightInitMode)
    
    loss = nnet.loss(y, regularization)
    error = nnet.error(y)
    
    gParams = T.grad(loss, nnet.params)
    
    weightUpdates = [(param, param - learningRate * gParam) for param, gParam in zip(nnet.params, gParams)]    
    
    
    batchIndicesVecctor = T.ivector('batchIndicesVecctor')
    trainF = function([batchIndicesVecctor, learningRate, regularization], Out(sbasic.gpu_from_host(loss), borrow = True), updates = weightUpdates, givens = {x: train_x[batchIndicesVecctor], y: train_y[batchIndicesVecctor]})
    validF = function([batchIndicesVecctor], Out(sbasic.gpu_from_host(T.cast(error, T.config.floatX)), borrow = True), givens = {x: valid_x[batchIndicesVecctor], y: valid_y[batchIndicesVecctor]})
    testF = function([batchIndicesVecctor], Out(sbasic.gpu_from_host(T.cast(error, T.config.floatX)), borrow = True), givens = {x: test_x[batchIndicesVecctor], y: test_y[batchIndicesVecctor]})
    
    return [trainF, validF, testF]
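A hedged sketch of driving the three compiled functions returned above; the dataset sizes and hyperparameters are illustrative. Because trainF's output is wrapped in Out(gpu_from_host(...), borrow=True), np.asarray is used to pull the loss back to the host.

# Illustrative usage; sizes and hyperparameters are assumptions, not the original setup.
import numpy as np

trainF, validF, testF = compileModel(data, nInputs=784, nOutputs=10)

nTrain, batchSize = 50000, 128        # assumed dataset and minibatch sizes
for epoch in range(10):
    for start in range(0, nTrain, batchSize):
        idx = np.arange(start, min(start + batchSize, nTrain), dtype='int32')
        loss = np.asarray(trainF(idx, 0.01, 0.0001))   # learning rate, regularization strength
    validError = np.asarray(validF(np.arange(10000, dtype='int32')))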
Example #23
    def parse_args(self, bottom, top):
        function_str = self.pythonargs[0]
        top_shape = self.pythonargs[1]

        if self.function_str != function_str or self.top_shape != top_shape:
            self.function_str = function_str
            self.top_shape = top_shape

            import theano.tensor as T
            import theano
            from theano.sandbox.cuda.basic_ops import gpu_from_host
            x = []
            for i in range(len(bottom)):
                if len(bottom[i].shape) == 1:
                    x.append(T.vector('x%d' % i))
                if len(bottom[i].shape) == 2:
                    x.append(T.matrix('x%d' % i))
                if len(bottom[i].shape) == 3:
                    x.append(T.tensor3('x%d' % i))
                if len(bottom[i].shape) == 4:
                    x.append(T.tensor4('x%d' % i))

            y = eval(function_str)
            self.f = theano.function(x, gpu_from_host(y), on_unused_input='ignore')

            if len(self.top_shape) == 1:
                v = T.vector('v')
            elif len(self.top_shape) == 2:
                v = T.matrix('v')
            elif len(self.top_shape) == 3:
                v = T.tensor3('v')
            elif len(self.top_shape) == 4:
                v = T.tensor4('v')
            self.b = []
            for i in range(len(bottom)):
                yg = T.Lop(y, x[i], v)
                self.b.append(theano.function(x + [v], gpu_from_host(yg), on_unused_input='ignore'))
Example #24
        def grad_step(*args):

            idx = TT.cast(args[0], 'int32')
            nw_inps = [x[idx * options['cbs']: \
                         (idx + 1) * options['cbs']]
                       for x in loc_inputs]
            replace = dict(zip(model.inputs, nw_inps))
            nw_cost = safe_clone(model.train_cost, replace=replace)
            gs = TT.grad(nw_cost, model.params)
            nw_gs = [op + np for op, np in zip(args[2: 2 + n_params], gs)]
            _gs = [x for x in gs]
            _nw_gs = [gpu_from_host(g) for g in nw_gs]
            nw_gs = ifelse(comp_grad, _nw_gs, _gs, gpu=True)
            nw_gs = [x.type.filter_variable(y) for x,y in zip(args[2:],nw_gs)]
            return [args[0] + const(1), args[1] + nw_cost] + nw_gs
Example #25
def test_gpualloc_output_to_gpu():
    a_val = numpy.asarray(numpy.random.rand(4,5),dtype='float32')
    a = tcn.shared_constructor(a_val)

    b = T.fscalar()
    f = theano.function([b], T.ones_like(a)+b, mode=mode_without_gpu)
    f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a))+b, mode=mode_with_gpu)

    print f.maker.env.toposort()
    print f_gpu.maker.env.toposort()
    print f(2)
    print f_gpu(2)

    assert sum([node.op == T.alloc for node in f.maker.env.toposort()])==1
    assert sum([node.op == B.gpu_alloc for node in f_gpu.maker.env.toposort()])==1

    assert numpy.allclose(numpy.ones(a.get_value(borrow=True).shape)+9,f_gpu(9))
    assert numpy.allclose(f(5),f_gpu(5))
Example #26
 def local_gpu_forloop(node):
     if isinstance(node.op, forloop):
         sw = False
         for inp in node.inputs:
             if inp.owner and inp.owner.op == host_from_gpu:
                 sw = True
         if sw:
             inps = node.inputs
             nw_inps = []
             for inp in inps:
                 if not isinstance(inp.type, CudaNdarrayType):
                     nw_inps.append(gpu_from_host(inp))
                 else:
                     nw_inps.append(inp)
             new_outs = node.op(*nw_inps)
             return [host_from_gpu(x) for x in new_outs]
         else:
             return False
Example #27
def use_gpu_cumsum(node):
    if type(node.op) is CumsumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

        axis = node.op.axis
        x = node.inputs[0]

        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            return None

        x = gpu_from_host(x)

        if axis is None and x.ndim > 1:
            x = GpuFlatten()(x)

        # ``gpu_cumsum`` assumes the array has already been flattened if needed.
        if axis is None:
            axis = 0

        return [host_from_gpu(GpuCumsum(axis)(x))]
Example #28
def use_gpu_cumsum(node):
    if type(node.op) is CumsumOp \
       and node.inputs[0].dtype == 'float32' \
       and node.inputs[0].owner \
       and isinstance(node.inputs[0].owner.op, HostFromGpu):

        axis = node.op.axis
        x = node.inputs[0]

        if axis is not None and x.ndim > GpuCumsum.SUPPORTED_NDIMS:
            return None

        x = gpu_from_host(x)

        if axis is None and x.ndim > 1:
            x = GpuFlatten()(x)

        # ``gpu_cumsum`` assumes the array has already been flattened if needed.
        if axis is None:
            axis = 0

        return [host_from_gpu(GpuCumsum(axis)(x))]
Example #29
    filter_shape = (64, 8, 3, 3)
    padding = "valid" # (1, 1)
    strides = (1, 1)

    # input_shape = (32, 16, 48, 48)
    # filter_shape = (24, 16, 3, 3)
    # padding = (1, 1)
    # strides = (1, 1)

    print "fprop"
    x = theano.shared(np.random.normal(0, 1, input_shape).astype(theano.config.floatX))
    w = theano.shared(np.random.normal(0, 1, filter_shape).astype(theano.config.floatX))

    y_cudnn = dnn.dnn_conv(x, w, border_mode=padding, subsample=strides, conv_mode='cross')
    y_nervana_raw = nervana_conv(x, w, padding=padding, strides=strides)
    y_nervana = gpu_from_host(y_nervana_raw)

    val_cudnn = np.array(y_cudnn.eval())
    val_nervana = np.array(y_nervana.eval())

    assert np.allclose(val_cudnn, val_nervana)

    print "fprop without dimshuffle"
    x_nodimshuffle = theano.shared(x.get_value().transpose(1, 2, 3, 0)) # c01b
    w_nodimshuffle = theano.shared(w.get_value().transpose(1, 2, 3, 0)) # c01b

    y_nervana_nodimshuffle = gpu_from_host(nervana_conv(x_nodimshuffle, w_nodimshuffle, padding=padding, strides=strides, dimshuffle=False))

    val_nervana_nodimshuffle = np.array(y_nervana_nodimshuffle.eval()).transpose(3, 0, 1, 2)

    assert np.allclose(val_nervana, val_nervana_nodimshuffle)
Example #30
def train_nn(data_file_name, reg_lambda=0.01, learning_rate=0.01, n_eigs=100, 
        n_neurons_per_layer=100, batch_size=100, display=True):
    train_data, test_data, file_names = old_load_images(data_file_name)
    eig_face = EigenFace.from_file(train_data[0], data_file_name, n_eigs)
    train_data[0] = get_face_space(data_file_name, 'train_x', train_data[0],
                                   eig_face)
    test_data[0] = get_face_space(data_file_name, 'test_x', test_data[0],
                                  eig_face)
    n_features, n_training_examples = train_data[0].shape
    real_scores = test_data[1].T.tolist()

    train_data = to_theano_shared(train_data)
    test_data = to_theano_shared(test_data)

    rng = numpy.random.RandomState(1234)
    x = T.matrix('x')
    y = T.vector('y')

    mlp = MLP(rng, x, n_features, n_neurons_per_layer, n_training_examples)
    cost = mlp.cost(y) + reg_lambda * mlp.L2_sqr

    test_model =theano.function([],
            outputs=[cost, mlp.output],
            givens={x:test_data[0][:], y:test_data[1][:]})

    g_params = []
    for param in mlp.params:
        g_param = T.grad(cost, param)
        g_params.append(g_param)

    updates = {}

    for param, g_param in zip(mlp.params, g_params):
        updates[param] = param - learning_rate * g_param

    train_model = theano.function([],
            outputs=theano.Out(gpu_from_host(cost), borrow=True), updates=updates,
            givens={x:train_data[0][:], y:train_data[1][:]})

    current_cost = numpy.asarray(train_model())
    logging.info('initial cost %f' % current_cost)
    old_cost = 0
    iterations = 0
    logging.info('beginning stochastic gradient descent')
    while ((abs(current_cost- old_cost)) > 0.001):
        old_cost = current_cost
        current_cost = numpy.asarray(train_model())
        if iterations % 10 == 0:
            logging.info('iteration % 9d cost % 9f' % (iterations, current_cost))
        iterations += 1

    error, predictions = test_model()

    # Print the results
    logging.info('training cost minimised: %f' % current_cost)
    logging.info('test error: %f' % error)
    
    predictions = predictions[0].tolist()
    logging.debug('predictions %s', str(predictions))
    pearsons = pearsonr(real_scores, predictions)
    logging.info('pearsons correlation: %f, %f' % pearsons)
    # Save our weights should we ever need them again
    plot_title_data = (n_neurons_per_layer, learning_rate, reg_lambda,
            pearsons[0])
    plot_correlation(real_scores, predictions, file_names, 'neural network with %d neurons' \
            'learning rate %f and reg-lambda %f pearsons %f' % plot_title_data,
            'nn', show=True, pearsons=pearsons)
Example #31
    # padding = (1, 1)
    # strides = (1, 1)

    print "fprop"
    x = theano.shared(
        np.random.normal(0, 1, input_shape).astype(theano.config.floatX))
    w = theano.shared(
        np.random.normal(0, 1, filter_shape).astype(theano.config.floatX))

    y_cudnn = dnn.dnn_conv(x,
                           w,
                           border_mode=padding,
                           subsample=strides,
                           conv_mode='cross')
    y_nervana_raw = nervana_conv(x, w, padding=padding, strides=strides)
    y_nervana = gpu_from_host(y_nervana_raw)

    val_cudnn = np.array(y_cudnn.eval())
    val_nervana = np.array(y_nervana.eval())

    assert np.allclose(val_cudnn, val_nervana)

    print "fprop without dimshuffle"
    x_nodimshuffle = theano.shared(x.get_value().transpose(1, 2, 3, 0))  # c01b
    w_nodimshuffle = theano.shared(w.get_value().transpose(1, 2, 3, 0))  # c01b

    y_nervana_nodimshuffle = gpu_from_host(
        nervana_conv(x_nodimshuffle,
                     w_nodimshuffle,
                     padding=padding,
                     strides=strides,
                     dimshuffle=False))

    val_nervana_nodimshuffle = np.array(
        y_nervana_nodimshuffle.eval()).transpose(3, 0, 1, 2)

    assert np.allclose(val_nervana, val_nervana_nodimshuffle)
Example #32
    x_cc.set_value(x_val.transpose(
        1, 2, 3,
        0))  # cuda-convnet expects the batch size in the trailing dimension.
    w_cc.set_value(w_val[:, :, ::-1, ::-1].transpose(
        1, 2, 3, 0))  # cuda-convnet doesn't flip the filters,
    # trailing dimension should be number of output channels.
    # by doing these transformations in advance on the host, these differences
    # cannot affect running times of the convolutions themselves.

    y_theano = conv.conv2d(x, w, image_shape=shape_x, filter_shape=shape_w)
    y_cc = filter_acts_op(x_cc, w_cc)
    y_fft = fftconv.conv2d_fft(x, w, image_shape=shape_x, filter_shape=shape_w)

    print "  compiling: Theano"
    f_theano = theano.function(
        [], gpu_from_host(y_theano))  # don't transfer to host

    print "  compiling: cuda-convnet"
    f_cc = theano.function([], y_cc)  # y_cc is already on the GPU

    print "  compiling: FFT"
    f_fft = theano.function([], gpu_from_host(y_fft))  # don't transfer to host

    print

    print "  verifying accuracy"
    # wrapping the function output in np.array causes a transfer to the host.
    out_theano = np.array(f_theano())
    out_cc = np.array(f_cc())
    out_fft = np.array(f_fft())
Example #33
def compileModel(data,
                 nInputs,
                 nOutputs,
                 hiddenLayersSize=[1200, 1200],
                 dropoutRates=[0.2, 0.5, 0.5],
                 activation='relu',
                 weightInitMode='normal',
                 regularizer=0.0001):
    """
    Creates a symbolic model given the specified parameters using Theano
    
    Output:
    A list containing the three compiled Theano functions: training, validation, and test
    """

    np.random.seed(815)

    x = T.matrix('x')
    y = T.wvector('y')
    learningRate = T.scalar('learningRate')
    regularization = T.scalar('regularization')

    #Data sets
    train_x, train_y = data[0]
    valid_x, valid_y = data[1]
    test_x, test_y = data[2]

    nnet = MLP(x,
               nInputs,
               hiddenLayersSize,
               nOutputs,
               dropoutRates=dropoutRates,
               activation=activation,
               weightInitMode=weightInitMode)

    loss = nnet.loss(y, regularization)
    error = nnet.error(y)

    gParams = T.grad(loss, nnet.params)

    weightUpdates = [(param, param - learningRate * gParam)
                     for param, gParam in zip(nnet.params, gParams)]

    batchIndicesVecctor = T.ivector('batchIndicesVecctor')
    trainF = function([batchIndicesVecctor, learningRate, regularization],
                      Out(sbasic.gpu_from_host(loss), borrow=True),
                      updates=weightUpdates,
                      givens={
                          x: train_x[batchIndicesVecctor],
                          y: train_y[batchIndicesVecctor]
                      })
    validF = function([batchIndicesVecctor],
                      Out(sbasic.gpu_from_host(T.cast(error, T.config.floatX)),
                          borrow=True),
                      givens={
                          x: valid_x[batchIndicesVecctor],
                          y: valid_y[batchIndicesVecctor]
                      })
    testF = function([batchIndicesVecctor],
                     Out(sbasic.gpu_from_host(T.cast(error, T.config.floatX)),
                         borrow=True),
                     givens={
                         x: test_x[batchIndicesVecctor],
                         y: test_y[batchIndicesVecctor]
                     })

    return [trainF, validF, testF]
Example #34
from theano import function, config, shared, sandbox, tensor, Out
import theano
import numpy
import time
from theano.sandbox.cuda.basic_ops import gpu_from_host
vlen = 10 * 30 * 768  # 10 x # cores x # threads per core
iters = 1000
#http://deeplearning.net/software/theano/tutorial/aliasing.html
rng = numpy.random.RandomState(22)

x = shared(numpy.asarray(rng.rand(vlen), theano.config.floatX))

f1 = function([], gpu_from_host(tensor.exp(x)))
f2 = function([],
              Out(gpu_from_host(tensor.exp(x)),
                  borrow=True))
t0 = time.time()
for i in xrange(iters):
    r = f1()
t1 = time.time()
no_borrow = t1 - t0
t0 = time.time()
for i in xrange(iters):
    r = f2()
t1 = time.time()
print 'Looping', iters, 'times took', no_borrow, 'seconds without borrow',
print 'and', t1 - t0, 'seconds with borrow.'
if numpy.any([isinstance(x.op, tensor.Elemwise) and
              ('Gpu' not in type(x.op).__name__)
              for x in f1.maker.fgraph.toposort()]):
    print 'Used the cpu'
Example #35
    w_val = np.random.randn(*shape_w).astype(theano.config.floatX) * std

    x.set_value(x_val)
    w.set_value(w_val)
    x_cc.set_value(x_val.transpose(1, 2, 3, 0)) # cuda-convnet expects the batch size in the trailing dimension.
    w_cc.set_value(w_val[:, :, ::-1, ::-1].transpose(1, 2, 3, 0)) # cuda-convnet doesn't flip the filters,
    # trailing dimension should be number of output channels.
    # by doing these transformations in advance on the host, these differences
    # cannot affect running times of the convolutions themselves.

    y_theano = conv.conv2d(x, w, image_shape=shape_x, filter_shape=shape_w)
    y_cc = filter_acts_op(x_cc, w_cc)
    y_fft = fftconv.conv2d_fft(x, w, image_shape=shape_x, filter_shape=shape_w)

    print "  compiling: Theano"
    f_theano = theano.function([], gpu_from_host(y_theano)) # don't transfer to host

    print "  compiling: cuda-convnet"
    f_cc = theano.function([], y_cc) # y_cc is already on the GPU

    print "  compiling: FFT"
    f_fft = theano.function([], gpu_from_host(y_fft)) # don't transfer to host

    print

    print "  verifying accuracy"
    # wrapping the function output in np.array causes a transfer to the host.
    out_theano = np.array(f_theano())
    out_cc = np.array(f_cc())
    out_fft = np.array(f_fft())
Example #36
 def oneStep(gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1, *prev_hiddens):
     input_x = cu.gpu_from_host(T.concatenate([gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1], axis=0))
     new_states = self.model.forward(input_x, prev_hiddens)
      # Rotate the states before returning (move the last element to the front)
     return [new_states[-1]] + new_states[:-1]
Example #37
def safe_to_gpu(x):
    if isinstance(x.type, T.TensorType):
        return gpu_from_host(x)
    else:
        return x
Example #38
 def create_cost_fun(self):
      # Note: the cost function may be changed later
     self.cost = cu.gpu_from_host((self.predictions - self.pm25target).norm(L=2) / self.steps)
Example #39
def safe_to_gpu(x):
    if isinstance(x.type, T.TensorType):
        return gpu_from_host(x)
    else:
        return x
Example #40
 def create_valid_error(self):
     self.valid_error = cu.gpu_from_host(T.abs_(self.predictions - self.pm25target))
Example #41
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probability of having a one
prediction = p_1 > 0.5 # The prediction that is done: 0 or 1
xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy
# prediction = theano.Out(cuda.gpu_from_host(T.cast(p_1 > 0.5,theano.config.floatX)),borrow=True) # The prediction that is done: 0 or 1
# xent = cuda.gpu_from_host(T.cast(-y*T.log(p_1) - (1-y)*T.log(1-p_1),theano.config.floatX)) # Cross-entropy
cost = xent.mean() + 0.01*(w**2).sum() # The cost to optimize
gw,gb = T.grad(cost, [w,b])
# cost = cuda.gpu_from_host(xent.mean() + 0.01*(w**2).sum()) # The cost to optimize
# gw,gb = cuda.gpu_from_host(T.grad(cost, [w,b]))


# Compile expressions to functions
train = theano.function(
            inputs=[],
            outputs=[theano.Out(cuda.gpu_from_host(T.cast(prediction,theano.config.floatX)),borrow=True), 
                theano.Out(cuda.gpu_from_host(T.cast(xent,theano.config.floatX)),borrow=True)],
            updates=[(w, w-0.01*gw), (b, b-0.01*gb)],
            name = "train")
predict = theano.function(inputs=[], outputs=theano.Out(cuda.gpu_from_host(T.cast(prediction,theano.config.floatX)),borrow=True),
            name = "predict")

if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in
        train.maker.fgraph.toposort()]):
    print('Used the cpu')
elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in
          train.maker.fgraph.toposort()]):
    print('Used the gpu')
else:
    print('ERROR, not able to tell if theano used the cpu or the gpu')
    print(train.maker.fgraph.toposort())