def local_gpu_multinomial(node):
    """Rewrite a float32 ``MultinomialFromUniform`` so it runs on the GPU.

    Two graph shapes are handled:

    * ``node`` is itself a ``MultinomialFromUniform`` and at least one of its
      inputs is produced by a ``HostFromGpu`` transfer;
    * ``node`` is a ``GpuFromHost`` transfer whose input is produced by a
      ``MultinomialFromUniform``.

    In both cases the two inputs and the output must all be float32.

    Returns a one-element list holding the replacement variable, or ``None``
    when the node is left untouched.
    """
    cpu_op = node.op
    if type(cpu_op) is MultinomialFromUniform:
        pvals, unis = node.inputs
        (sample,) = node.outputs
        if pvals.dtype == unis.dtype == sample.dtype == 'float32':
            fed_by_gpu = any(
                var.owner and isinstance(var.owner.op,
                                         theano.sandbox.cuda.HostFromGpu)
                for var in node.inputs)
            if fed_by_gpu:
                sampler = GpuMultinomialFromUniform(cpu_op.odtype)
                gpu_inputs = [gpu_from_host(var) for var in node.inputs]
                return [host_from_gpu(sampler(*gpu_inputs)).T]

    if not isinstance(node.op, theano.sandbox.cuda.GpuFromHost):
        return None
    producer = node.inputs[0].owner
    if not producer or type(producer.op) is not MultinomialFromUniform:
        return None

    pvals, unis = producer.inputs
    (sample,) = producer.outputs
    if not (pvals.dtype == unis.dtype == sample.dtype == 'float32'):
        return None

    sampler = GpuMultinomialFromUniform(producer.op.odtype)
    transposed = sampler(*[gpu_from_host(var) for var in producer.inputs]).T
    # The transpose (a dimshuffle) is built on the CPU here; a later
    # optimization is expected to move it to the GPU.
    return [gpu_from_host(transposed)]
def parse_args(self, bottom, top):
    """Compile (or reuse) the Theano forward/backward functions of the layer.

    ``self.pythonargs[0]`` is the expression string and
    ``self.pythonargs[1]`` the expected output shape; both are stored on
    ``self.function_str`` / ``self.top_shape``.  When the expression text or
    the rank of the output shape differs from the cached values, ``self.f``
    (forward function) and ``self.b`` (one ``T.Lop`` function per bottom) are
    rebuilt.

    If rebuilding fails for any reason, the previous ``function_str`` and
    ``top_shape`` are restored so that the next call retries instead of
    treating the cache as valid.

    Args:
        bottom: indexable sequence of input blobs; only ``.shape`` is read.
        top: unused here, but visible to the evaluated expression.

    Raises:
        ValueError: if a bottom has a rank outside 1..4, or if gradients are
            needed (at least one bottom) and the top rank is outside 1..4.
    """
    function_str = self.pythonargs[0]
    top_shape = self.pythonargs[1]
    old_function_str = self.function_str
    old_top_shape = self.top_shape
    self.function_str = function_str
    self.top_shape = top_shape
    if not (function_str != old_function_str or
            len(top_shape) != len(old_top_shape)):
        return

    supported_ranks = (1, 2, 3, 4)
    compiled = False
    try:
        if old_function_str != '':
            print('TheanoGPU function string different from cache: recompiling')

        # Validate ranks before touching Theano so that a bad blob gives a
        # clear error instead of a later IndexError/NameError.
        bottom_ranks = []
        for i in range(len(bottom)):
            rank = len(bottom[i].shape)
            if rank not in supported_ranks:
                raise ValueError(
                    'bottom[%d] has unsupported rank %d (expected 1-4)'
                    % (i, rank))
            bottom_ranks.append(rank)
        top_rank = len(top_shape)
        # The top variable is only needed to build gradients, i.e. when
        # there is at least one bottom.
        if len(bottom) > 0 and top_rank not in supported_ranks:
            raise ValueError(
                'top has unsupported rank %d (expected 1-4)' % top_rank)

        import theano.tensor as T
        import theano
        from theano.sandbox.cuda.basic_ops import gpu_from_host

        make_var = {1: T.vector, 2: T.matrix, 3: T.tensor3, 4: T.tensor4}
        x = []
        for i in range(len(bottom)):
            x.append(make_var[bottom_ranks[i]]('x%d' % i))

        # NOTE(review): eval() executes arbitrary code from the layer
        # arguments with this function's locals (x, T, theano, self, ...) in
        # scope; only use with trusted network definitions.
        y = eval(function_str)
        forward = theano.function(x, gpu_from_host(y),
                                  on_unused_input='ignore')

        backward = []
        if top_rank in make_var:
            v = make_var[top_rank]('v')
        for i in range(len(bottom)):
            yg = T.Lop(y, x[i], v)
            backward.append(
                theano.function(x + [v], gpu_from_host(yg),
                                on_unused_input='ignore'))

        # Publish both results together, only after everything compiled.
        self.f = forward
        self.b = backward
        compiled = True
    finally:
        if not compiled:
            self.function_str = old_function_str
            self.top_shape = old_top_shape
def use_gpu_cumsum(node):
    """Replace a float32 additive ``CumOp`` fed from the GPU by ``GpuCumsum``.

    The rewrite applies only when the node's op is exactly ``CumOp``, its
    first input is float32 and that input is produced by ``HostFromGpu``.
    It is skipped (``None`` is returned) when the op mode is not ``'add'`` or
    when an explicit axis is given for an input with more than
    ``GpuCumsum.SUPPORTED_NDIMS`` dimensions.

    Returns a one-element list with the host-side result, tagged with the
    high-tolerance comparison function, or ``None``.
    """
    source = node.inputs[0]
    applicable = (type(node.op) is CumOp
                  and source.dtype == 'float32'
                  and source.owner
                  and isinstance(source.owner.op, HostFromGpu))
    if not applicable:
        return None
    if node.op.mode != 'add':
        return None

    axis = node.op.axis
    if axis is not None and source.ndim > GpuCumsum.SUPPORTED_NDIMS:
        return None

    on_gpu = gpu_from_host(source)
    if axis is None:
        # ``GpuCumsum`` assumes the array has been flattened if needed.
        if on_gpu.ndim > 1:
            on_gpu = gpu_flatten(on_gpu)
        axis = 0

    result = host_from_gpu(GpuCumsum(axis)(on_gpu))
    result.tag.values_eq_approx = values_eq_approx_high_tol
    return [result]
def local_gpu_minres(node):
    """Local rewrite that feeds a ``MinresQLP`` node with GPU inputs.

    The rewrite fires when the node's op is a ``MinresQLP`` instance and at
    least one input is the output of ``host_from_gpu``.  Every input that is
    not already of ``CudaNdarrayType`` is wrapped in ``gpu_from_host``, the op
    is re-applied to those inputs, and every ``CudaNdarrayType`` output is
    wrapped in ``host_from_gpu``.

    Returns the list of replacement outputs, ``False`` when a ``MinresQLP``
    node has no input coming from ``host_from_gpu``, and ``None`` (implicitly)
    for any other node.

    NOTE(review): the original indentation of this function was lost; the
    trailing ``else`` is paired here with ``if sw`` -- confirm against the
    upstream source.
    """
    if isinstance(node.op, MinresQLP):
        # ``sw`` becomes True if any input comes straight from a GPU->host
        # transfer.
        sw = False
        for inp in node.inputs:
            if inp.owner and inp.owner.op == host_from_gpu:
                sw = True
        if sw:
            inps = node.inputs
            nw_inps = []
            # Move every input that is not already a CUDA variable to the GPU.
            for inp in inps:
                if not isinstance(inp.type, CudaNdarrayType):
                    nw_inps.append(gpu_from_host(inp))
                else:
                    nw_inps.append(inp)
            new_op = node.op
            # NOTE(review): ``new_op`` is the node's own op object (no copy is
            # made), so this flag is set on the op that ``node`` uses as well.
            new_op.gpu = 1
            _new_outs = node.op(*nw_inps)
            new_outs = []
            # Bring GPU outputs back to the host; other outputs pass through.
            for out in _new_outs:
                if isinstance(out.type, CudaNdarrayType):
                    new_outs.append(host_from_gpu(out))
                else:
                    new_outs.append(out)
            return new_outs
        else:
            return False
def ctc_cost(acts, labels, input_lengths=None):
    """
    Compute the per-sequence CTC cost of a batch.

    The softmax of the output-layer activations is taken at every timestep
    and the CTC cost of each sequence is computed against its label
    sequence.

    :param acts: Tensor of pre-softmax activations with
        shape=[maxInputSeqLen, batchSize, targetN].  maxInputSeqLen must be
        >= the length of the longest input sequence, batchSize is the number
        of sequences processed together, and targetN is the number of
        network outputs (<blank> is always target 0).
    :param labels: Matrix of training labels with
        shape=[batchSize, maxOutputSeqLen].  Because <blank> is always output
        0, labels are either > 0 (targets) or negative (ignored).
        maxOutputSeqLen must be >= the length of the longest target sequence
        (not counting the <blank>s that CTC alignment inserts).  Negative
        values are ignored wherever they appear, so [1], [-1, 1, -1] and
        [-1, -1, 1] are equivalent.
    :param input_lengths: Vector of input sequence lengths with
        shape=[batchSize].  For sequence s (0 <= s < batchSize), CTC is
        computed on acts[0:input_lengths[s], s, :].  When None, every
        sequence is assumed to have length maxInputSeqLen.
    :return: Vector of CTC costs with shape=[batchSize]
    """
    # This should eventually be a proper Theano graph optimization; until
    # then the implementation is picked from the device configuration.
    use_gpu = (theano.config.device.startswith("gpu")
               or theano.sandbox.cuda.cuda_enabled)
    if not use_gpu:
        return CpuCtc()(acts, labels, input_lengths)

    if not isinstance(acts.type, CudaNdarrayType):
        # Not on the device yet; this transfer should get optimized away.
        acts = gpu_from_host(acts)
    return GpuCtc()(acts, labels, input_lengths)
def use_gpu_images2neibs(node):
    """Replace an ``Images2Neibs`` node by its GPU implementation.

    The rewrite applies only when the node's op is exactly ``Images2Neibs``
    and the image input is float32: the input is transferred with
    ``gpu_from_host``, which the old CUDA backend supports for float32 data
    only, so any other dtype must stay on the CPU implementation.

    Returns a one-element list with the host-side replacement, or ``None``
    when the node is left untouched.
    """
    if type(node.op) is not Images2Neibs:
        return None
    images = node.inputs[0]
    if images.dtype != 'float32':
        # Transferring a non-float32 tensor to the GPU would fail while the
        # replacement graph is being built.
        return None
    return [
        host_from_gpu(
            gpu_images2neibs(gpu_from_host(images),
                             node.inputs[1],
                             node.inputs[2],
                             mode=node.op.mode))
    ]
def use_gpu_images2neibs(node):
    """Replace a supported ``Images2Neibs`` node by its GPU implementation.

    Applies when the op is exactly ``Images2Neibs``, the image input is
    float32 and the op mode is ``'valid'`` or ``'wrap_centered'``.

    Returns a one-element list with the host-side replacement, or ``None``.
    """
    op = node.op
    if type(op) is not Images2Neibs:
        return None
    images = node.inputs[0]
    if images.dtype != 'float32':
        return None
    if op.mode not in ['valid', 'wrap_centered']:
        return None

    gpu_result = gpu_images2neibs(gpu_from_host(images),
                                  node.inputs[1],
                                  node.inputs[2],
                                  mode=op.mode)
    return [host_from_gpu(gpu_result)]
def __init__(self, **kwargs):
    """Build the network and compile its forward function.

    Keyword arguments read (all optional): ``num_layers``, ``num_filters``,
    ``filter_size``, ``rng`` (default ``np.random.RandomState(42)``),
    ``weights_folder``, ``activation`` (default ``'relu'``) and ``cost_func``
    (default ``'MSE'``).

    When ``weights_folder`` is not given, a new network is defined and its
    weights initialised; this requires ``num_layers``, ``num_filters`` and
    ``filter_size``.  If any of them is missing, an error message is printed
    and the process exits.  Otherwise the weights are loaded.

    Errors raised while defining the network or initialising its weights now
    propagate to the caller instead of being reported as missing parameters.
    """
    self.num_layers = kwargs.get('num_layers', None)
    self.num_filters = kwargs.get('num_filters', None)
    self.filter_size = kwargs.get('filter_size', None)
    self.rng = kwargs.get('rng', np.random.RandomState(42))
    self.load_folder = kwargs.get('weights_folder', None)
    self.activation = kwargs.get('activation', 'relu')
    self.cost_func = kwargs.get('cost_func', 'MSE')

    # Initialize (or load) the weights for the network
    if self.load_folder is None:
        required = (self.num_layers, self.num_filters, self.filter_size)
        # Explicit check rather than ``assert``: asserts are stripped when
        # Python runs with -O.
        if any(value is None for value in required):
            print("ERROR: Insufficient parameters for generating new network")
            # NOTE(review): exit status 0 is kept for compatibility even
            # though this is an error path -- consider a non-zero status.
            sys.exit(0)
        self.__define_network()
        self.__init_weights()
    else:
        self.__load_weights()

    # Input variable for the symbolic representation of the network
    self.X = T.tensor4('X')

    # Create the network model
    self.__model()

    # Create a predicter based on this network model
    if theano.config.device == 'cpu':
        self.forward = theano.function(inputs=[self.X],
                                       outputs=self.out,
                                       allow_input_downcast=True)
    else:
        # On non-CPU devices the output is wrapped in gpu_from_host and
        # returned with borrow=True.
        self.forward = theano.function(
            inputs=[self.X],
            outputs=Out(gpu_from_host(self.out), borrow=True),
            allow_input_downcast=True)
def use_gpu_images2neibs(node):
    """Replace a supported ``Images2Neibs`` node by its GPU implementation.

    Applies when the op is exactly ``Images2Neibs``, the image input is
    float32 and the op mode is ``'valid'``, ``'ignore_borders'`` or
    ``'wrap_centered'``.

    Returns a one-element list with the host-side replacement, or ``None``.
    """
    op = node.op
    if type(op) is not Images2Neibs:
        return None
    images = node.inputs[0]
    if images.dtype != 'float32':
        return None
    if op.mode not in ['valid', 'ignore_borders', 'wrap_centered']:
        return None

    gpu_result = gpu_images2neibs(gpu_from_host(images),
                                  node.inputs[1],
                                  node.inputs[2],
                                  mode=op.mode)
    return [host_from_gpu(gpu_result)]
def ctc_cost(acts, input_lengths, flat_labels, label_lengths):
    """Dispatch the CTC cost to the GPU or CPU implementation.

    The GPU implementation is used when ``theano.config.device`` starts with
    ``"gpu"`` or ``theano.sandbox.cuda.cuda_enabled`` is truthy; in that case
    ``acts`` is first wrapped in ``gpu_from_host`` unless its type is already
    a ``CudaNdarrayType``.  All four arguments are forwarded unchanged
    otherwise.
    """
    # This should eventually be a proper Theano graph optimization; until
    # then the implementation is picked from the device configuration.
    on_gpu = (theano.config.device.startswith("gpu")
              or theano.sandbox.cuda.cuda_enabled)
    if not on_gpu:
        return cpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)

    if not isinstance(acts.type, CudaNdarrayType):
        # Not on the device yet; this transfer should get optimized away.
        acts = gpu_from_host(acts)
    return gpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)
def use_gpu_images2neibs(node):
    """Replace an ``Images2Neibs`` node by its GPU implementation.

    The rewrite applies only when the node's op is exactly ``Images2Neibs``
    and the image input is float32: the input is transferred with
    ``gpu_from_host``, which the old CUDA backend supports for float32 data
    only, so any other dtype must stay on the CPU implementation.

    Returns a one-element list with the host-side replacement, or ``None``
    when the node is left untouched.
    """
    if type(node.op) is not Images2Neibs:
        return None
    images = node.inputs[0]
    if images.dtype != 'float32':
        # Transferring a non-float32 tensor to the GPU would fail while the
        # replacement graph is being built.
        return None
    return [
        host_from_gpu(
            gpu_images2neibs(gpu_from_host(images),
                             node.inputs[1],
                             node.inputs[2],
                             mode=node.op.mode))
    ]
def local_gpu_multinomial(node):
    """Move a single-sample float32 ``MultinomialFromUniform`` to the GPU.

    Two graph shapes are handled:

    * ``node`` is a ``MultinomialFromUniform`` with at least one input
      produced by ``HostFromGpu``;
    * ``node`` is a ``GpuFromHost`` transfer whose input is produced by a
      ``MultinomialFromUniform``.

    The multinomial node has either two inputs ``(p, u)`` or three
    ``(p, u, n_samples)``.  The rewrite is applied only when ``n_samples`` is
    the constant 1 (or absent) and ``p``, ``u`` and the output are float32.

    Returns a one-element list with the replacement variable, or ``None``
    when the node is left untouched.
    """
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
                any(i.owner and isinstance(i.owner.op,
                                           theano.sandbox.cuda.HostFromGpu)
                    for i in node.inputs)):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [
                host_from_gpu(gpu_op(*[gpu_from_host(i) for i in [p, u]])).T
            ]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # Read the inputs of the multinomial node, not of the transfer node:
        # the transfer node's input list does not carry (p, u[, n_samples]),
        # so unpacking it here could never succeed.
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if p.dtype == u.dtype == m.dtype == 'float32':
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
def local_gpu_multinomial(node):
    """Rewrite a float32 ``MultinomialFromUniform`` so it runs on the GPU.

    Handles a ``MultinomialFromUniform`` node with at least one input coming
    from ``HostFromGpu``, and a ``GpuFromHost`` node whose input is produced
    by a ``MultinomialFromUniform``.  Both inputs and the output must be
    float32.

    Returns a one-element list with the replacement variable, or ``None``.
    """
    if type(node.op) is MultinomialFromUniform:
        probs, uniforms = node.inputs
        (drawn,) = node.outputs
        if probs.dtype == uniforms.dtype == drawn.dtype == 'float32':
            comes_from_gpu = any(
                inp.owner and isinstance(inp.owner.op,
                                         theano.sandbox.cuda.HostFromGpu)
                for inp in node.inputs)
            if comes_from_gpu:
                gpu_sampler = GpuMultinomialFromUniform(node.op.odtype)
                moved = [gpu_from_host(inp) for inp in node.inputs]
                return [host_from_gpu(gpu_sampler(*moved)).T]

    if not isinstance(node.op, theano.sandbox.cuda.GpuFromHost):
        return None
    upstream = node.inputs[0].owner
    if not upstream or type(upstream.op) is not MultinomialFromUniform:
        return None

    probs, uniforms = upstream.inputs
    (drawn,) = upstream.outputs
    if not (probs.dtype == uniforms.dtype == drawn.dtype == 'float32'):
        return None

    gpu_sampler = GpuMultinomialFromUniform(upstream.op.odtype)
    moved = [gpu_from_host(inp) for inp in upstream.inputs]
    flipped = gpu_sampler(*moved).T
    # The transpose (a dimshuffle) is built on the CPU here; a later
    # optimization is expected to move it to the GPU.
    return [gpu_from_host(flipped)]
def local_gpu_argmax(node):
    """Rewrite a float32 ``KArgmax`` so that it runs as ``GpuKArgmax``.

    Handles a ``KArgmax`` node with an input coming from ``HostFromGpu``
    (values are brought back to the host and the indices are cast to int32),
    and a ``GpuFromHost`` node whose input is produced by a ``KArgmax``.

    Returns a two-element list ``[values, indices]`` or ``None``.

    NOTE(review): the second case returns two replacements for a
    ``GpuFromHost`` node -- confirm this is what the optimizer expects.
    """
    if type(node.op) is KArgmax:
        (scores,) = node.inputs
        top_vals, _ = node.outputs
        if scores.dtype == top_vals.dtype == 'float32':
            fed_by_gpu = any(
                var.owner and isinstance(var.owner.op,
                                         theano.sandbox.cuda.HostFromGpu)
                for var in node.inputs)
            if fed_by_gpu:
                gpu_vals, gpu_idx = GpuKArgmax(node.op.K)(gpu_from_host(scores))
                host_idx = T.cast(host_from_gpu(gpu_idx), "int32")
                return [host_from_gpu(gpu_vals), host_idx]

    if not isinstance(node.op, theano.sandbox.cuda.GpuFromHost):
        return None
    producer = node.inputs[0].owner
    if not producer or type(producer.op) is not KArgmax:
        return None

    (scores,) = producer.inputs
    top_vals, _ = producer.outputs
    if not (scores.dtype == top_vals.dtype == 'float32'):
        return None

    gpu_vals, gpu_idx = GpuKArgmax(node.inputs[0].owner.op.K)(
        gpu_from_host(scores))
    return [gpu_from_host(gpu_vals), gpu_from_host(gpu_idx)]
def local_assigner(node):
    """Rewrite a float32 ``Assigner`` so that it runs as ``GpuAssigner``.

    Handles an ``Assigner`` node with an input coming from ``HostFromGpu``,
    and a ``GpuFromHost`` node whose input is produced by an ``Assigner``.
    The first and third inputs are transferred with ``gpu_from_host``; the
    second (index) input is passed through unchanged.

    Returns a one-element list with the replacement variable, or ``None``.
    """
    if type(node.op) is Assigner:
        target, positions, grads = node.inputs
        (result,) = node.outputs
        if target.dtype == result.dtype == 'float32':
            fed_by_gpu = any(
                var.owner and isinstance(var.owner.op,
                                         theano.sandbox.cuda.HostFromGpu)
                for var in node.inputs)
            if fed_by_gpu:
                on_gpu = GpuAssigner()(gpu_from_host(target),
                                       positions,
                                       gpu_from_host(grads))
                return [host_from_gpu(on_gpu)]

    if not isinstance(node.op, theano.sandbox.cuda.GpuFromHost):
        return None
    producer = node.inputs[0].owner
    if not producer or type(producer.op) is not Assigner:
        return None

    target, positions, grads = producer.inputs
    (result,) = producer.outputs
    if not (target.dtype == result.dtype == 'float32'):
        return None

    on_gpu = GpuAssigner()(gpu_from_host(target),
                           positions,
                           gpu_from_host(grads))
    return [gpu_from_host(on_gpu)]
def local_gpu_multinomial(node):
    """Move a single-sample float32 ``MultinomialFromUniform`` to the GPU.

    Two graph shapes are handled:

    * ``node`` is a ``MultinomialFromUniform`` with at least one input
      produced by ``HostFromGpu``;
    * ``node`` is a ``GpuFromHost`` transfer whose input is produced by a
      ``MultinomialFromUniform``.

    The multinomial node has either two inputs ``(p, u)`` or three
    ``(p, u, n_samples)``.  The rewrite is applied only when ``n_samples`` is
    the constant 1 (or absent) and ``p``, ``u`` and the output are float32.

    Returns a one-element list with the replacement variable, or ``None``
    when the node is left untouched.
    """
    if type(node.op) is MultinomialFromUniform:
        if len(node.inputs) == 2:
            p, u = node.inputs
            n_samples = 1
        else:
            p, u, n_samples = node.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = node.outputs
        if (p.dtype == u.dtype == m.dtype == 'float32' and
                any(i.owner and isinstance(i.owner.op,
                                           theano.sandbox.cuda.HostFromGpu)
                    for i in node.inputs)):
            gpu_op = GpuMultinomialFromUniform(node.op.odtype)
            return [host_from_gpu(gpu_op(*[gpu_from_host(i)
                                           for i in [p, u]])).T]

    if (isinstance(node.op, theano.sandbox.cuda.GpuFromHost) and
            node.inputs[0].owner and
            type(node.inputs[0].owner.op) is MultinomialFromUniform):
        multi = node.inputs[0].owner
        # Read the inputs of the multinomial node, not of the transfer node:
        # the transfer node's input list does not carry (p, u[, n_samples]),
        # so unpacking it here could never succeed.
        if len(multi.inputs) == 2:
            p, u = multi.inputs
            n_samples = 1
        else:
            p, u, n_samples = multi.inputs
        try:
            if get_scalar_constant_value(n_samples) != 1:
                return None
        except NotScalarConstantError:
            return None
        m, = multi.outputs
        if p.dtype == u.dtype == m.dtype == 'float32':
            gpu_op = GpuMultinomialFromUniform(multi.op.odtype)
            ret = gpu_op(*[gpu_from_host(i) for i in [p, u]]).T
            # The dimshuffle is on the cpu, but will be moved to the
            # gpu by an opt.
            return [gpu_from_host(ret)]
def parse_args(self, bottom, top):
    """Compile (or reuse) the Theano forward/backward functions of the layer.

    ``self.pythonargs[0]`` is the expression string and
    ``self.pythonargs[1]`` the expected output shape.  When either differs
    from the cached ``self.function_str`` / ``self.top_shape``, the cache
    keys are updated and ``self.f`` (forward function) and ``self.b`` (one
    ``T.Lop`` function per bottom) are rebuilt.

    If rebuilding fails for any reason, the previous ``function_str`` and
    ``top_shape`` are restored so that the next call retries instead of
    treating the cache as valid.

    Args:
        bottom: indexable sequence of input blobs; only ``.shape`` is read.
        top: unused here, but visible to the evaluated expression.

    Raises:
        ValueError: if a bottom has a rank outside 1..4, or if gradients are
            needed (at least one bottom) and the top rank is outside 1..4.
    """
    function_str = self.pythonargs[0]
    top_shape = self.pythonargs[1]
    if not (self.function_str != function_str or
            self.top_shape != top_shape):
        return

    previous_function_str = self.function_str
    previous_top_shape = self.top_shape
    self.function_str = function_str
    self.top_shape = top_shape

    supported_ranks = (1, 2, 3, 4)
    compiled = False
    try:
        # Validate ranks before touching Theano so that a bad blob gives a
        # clear error instead of a later IndexError/NameError.
        bottom_ranks = []
        for i in range(len(bottom)):
            rank = len(bottom[i].shape)
            if rank not in supported_ranks:
                raise ValueError(
                    'bottom[%d] has unsupported rank %d (expected 1-4)'
                    % (i, rank))
            bottom_ranks.append(rank)
        top_rank = len(top_shape)
        # The top variable is only needed to build gradients, i.e. when
        # there is at least one bottom.
        if len(bottom) > 0 and top_rank not in supported_ranks:
            raise ValueError(
                'top has unsupported rank %d (expected 1-4)' % top_rank)

        import theano.tensor as T
        import theano
        from theano.sandbox.cuda.basic_ops import gpu_from_host

        make_var = {1: T.vector, 2: T.matrix, 3: T.tensor3, 4: T.tensor4}
        x = []
        for i in range(len(bottom)):
            x.append(make_var[bottom_ranks[i]]('x%d' % i))

        # NOTE(review): eval() executes arbitrary code from the layer
        # arguments with this function's locals (x, T, theano, self, ...) in
        # scope; only use with trusted network definitions.
        y = eval(function_str)
        forward = theano.function(x, gpu_from_host(y),
                                  on_unused_input='ignore')

        backward = []
        if top_rank in make_var:
            v = make_var[top_rank]('v')
        for i in range(len(bottom)):
            yg = T.Lop(y, x[i], v)
            backward.append(
                theano.function(x + [v], gpu_from_host(yg),
                                on_unused_input='ignore'))

        # Publish both results together, only after everything compiled.
        self.f = forward
        self.b = backward
        compiled = True
    finally:
        if not compiled:
            self.function_str = previous_function_str
            self.top_shape = previous_top_shape
def ctc_cost(acts, input_lengths, flat_labels, label_lengths):
    """Dispatch the CTC cost to the GPU or CPU implementation.

    The GPU implementation is used when ``theano.config.device`` starts with
    ``"gpu"`` or ``theano.sandbox.cuda.cuda_enabled`` is truthy; in that case
    ``acts`` is first wrapped in ``gpu_from_host`` unless its type is already
    a ``CudaNdarrayType``.  All four arguments are forwarded unchanged
    otherwise.
    """
    # This should eventually be a proper Theano graph optimization; until
    # then the implementation is picked from the device configuration.
    gpu_selected = (theano.config.device.startswith("gpu")
                    or theano.sandbox.cuda.cuda_enabled)
    if not gpu_selected:
        return cpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)

    already_on_device = isinstance(acts.type, CudaNdarrayType)
    if not already_on_device:
        # This transfer should get optimized away.
        acts = gpu_from_host(acts)
    return gpu_ctc_cost(acts, input_lengths, flat_labels, label_lengths)
def use_gpu_images2neibs(node):
    """Replace a supported ``Images2Neibs`` node by its GPU implementation.

    Applies when the op is exactly ``Images2Neibs``, the image input is
    float32 and the op mode is ``"valid"``, ``"ignore_borders"`` or
    ``"wrap_centered"``.

    Returns a one-element list with the host-side replacement, or ``None``.
    """
    if type(node.op) is not Images2Neibs:
        return None
    ten4 = node.inputs[0]
    if ten4.dtype != "float32":
        return None
    mode = node.op.mode
    if mode not in ["valid", "ignore_borders", "wrap_centered"]:
        return None

    neibs_on_gpu = gpu_images2neibs(gpu_from_host(ten4),
                                    node.inputs[1],
                                    node.inputs[2],
                                    mode=node.op.mode)
    return [host_from_gpu(neibs_on_gpu)]
def parse_args(self, bottom, top):
    """Compile (or reuse) the Theano forward/backward functions of the layer.

    ``self.pythonargs[0]`` is the expression string and
    ``self.pythonargs[1]`` the expected output shape; both are stored on
    ``self.function_str`` / ``self.top_shape``.  When the expression text or
    the rank of the output shape differs from the cached values, ``self.f``
    (forward function) and ``self.b`` (one ``T.Lop`` function per bottom) are
    rebuilt.

    If rebuilding fails for any reason, the previous ``function_str`` and
    ``top_shape`` are restored so that the next call retries instead of
    treating the cache as valid.

    Args:
        bottom: indexable sequence of input blobs; only ``.shape`` is read.
        top: unused here, but visible to the evaluated expression.

    Raises:
        ValueError: if a bottom has a rank outside 1..4, or if gradients are
            needed (at least one bottom) and the top rank is outside 1..4.
    """
    function_str = self.pythonargs[0]
    top_shape = self.pythonargs[1]
    old_function_str = self.function_str
    old_top_shape = self.top_shape
    self.function_str = function_str
    self.top_shape = top_shape
    if not (function_str != old_function_str or
            len(top_shape) != len(old_top_shape)):
        return

    supported_ranks = (1, 2, 3, 4)
    compiled = False
    try:
        if old_function_str != '':
            print('TheanoGPU function string different from cache: recompiling')

        # Validate ranks before touching Theano so that a bad blob gives a
        # clear error instead of a later IndexError/NameError.
        bottom_ranks = []
        for i in range(len(bottom)):
            rank = len(bottom[i].shape)
            if rank not in supported_ranks:
                raise ValueError(
                    'bottom[%d] has unsupported rank %d (expected 1-4)'
                    % (i, rank))
            bottom_ranks.append(rank)
        top_rank = len(top_shape)
        # The top variable is only needed to build gradients, i.e. when
        # there is at least one bottom.
        if len(bottom) > 0 and top_rank not in supported_ranks:
            raise ValueError(
                'top has unsupported rank %d (expected 1-4)' % top_rank)

        import theano.tensor as T
        import theano
        from theano.sandbox.cuda.basic_ops import gpu_from_host

        make_var = {1: T.vector, 2: T.matrix, 3: T.tensor3, 4: T.tensor4}
        x = []
        for i in range(len(bottom)):
            x.append(make_var[bottom_ranks[i]]('x%d' % i))

        # NOTE(review): eval() executes arbitrary code from the layer
        # arguments with this function's locals (x, T, theano, self, ...) in
        # scope; only use with trusted network definitions.
        y = eval(function_str)
        forward = theano.function(x, gpu_from_host(y),
                                  on_unused_input='ignore')

        backward = []
        if top_rank in make_var:
            v = make_var[top_rank]('v')
        for i in range(len(bottom)):
            yg = T.Lop(y, x[i], v)
            backward.append(
                theano.function(x + [v], gpu_from_host(yg),
                                on_unused_input='ignore'))

        # Publish both results together, only after everything compiled.
        self.f = forward
        self.b = backward
        compiled = True
    finally:
        if not compiled:
            self.function_str = old_function_str
            self.top_shape = old_top_shape
def compileModel(data, nInputs, nOutputs, hiddenLayersSize = [1200, 1200], dropoutRates = [0.2, 0.5, 0.5], activation = 'relu', weightInitMode = 'normal', regularizer = 0.0001):
    """
    Build a symbolic MLP with Theano and compile its functions.

    ``data`` is indexed as ``data[0]``, ``data[1]`` and ``data[2]`` for the
    training, validation and test sets, each unpacked as an ``(x, y)`` pair.
    Every compiled function takes a vector of batch indices that selects rows
    of the corresponding set; the training function additionally takes the
    learning rate and the regularization value and applies plain gradient
    descent updates to the network parameters.

    Output:
        A list ``[trainF, validF, testF]`` with the compiled training,
        validation and test functions.
    """
    np.random.seed(815)

    x = T.matrix('x')
    y = T.wvector('y')
    learningRate = T.scalar('learningRate')
    regularization = T.scalar('regularization')

    # Data sets
    train_x, train_y = data[0]
    valid_x, valid_y = data[1]
    test_x, test_y = data[2]

    nnet = MLP(x, nInputs, hiddenLayersSize, nOutputs,
               dropoutRates = dropoutRates,
               activation = activation,
               weightInitMode = weightInitMode)
    loss = nnet.loss(y, regularization)
    error = nnet.error(y)

    gradients = T.grad(loss, nnet.params)
    weightUpdates = []
    for param, gradient in zip(nnet.params, gradients):
        weightUpdates.append((param, param - learningRate * gradient))

    batchIndicesVecctor = T.ivector('batchIndicesVecctor')

    def batch_givens(set_x, set_y):
        # Substitute the selected rows of a data set for the symbolic inputs.
        return {x: set_x[batchIndicesVecctor], y: set_y[batchIndicesVecctor]}

    def error_function(set_x, set_y):
        # Compiled function returning the error cast to floatX, kept on GPU.
        gpu_error = sbasic.gpu_from_host(T.cast(error, T.config.floatX))
        return function([batchIndicesVecctor],
                        Out(gpu_error, borrow = True),
                        givens = batch_givens(set_x, set_y))

    trainF = function([batchIndicesVecctor, learningRate, regularization],
                      Out(sbasic.gpu_from_host(loss), borrow = True),
                      updates = weightUpdates,
                      givens = batch_givens(train_x, train_y))
    validF = error_function(valid_x, valid_y)
    testF = error_function(test_x, test_y)

    return [trainF, validF, testF]
def parse_args(self, bottom, top):
    """Compile (or reuse) the Theano forward/backward functions of the layer.

    ``self.pythonargs[0]`` is the expression string and
    ``self.pythonargs[1]`` the expected output shape.  When either differs
    from the cached ``self.function_str`` / ``self.top_shape``, the cache
    keys are updated and ``self.f`` (forward function) and ``self.b`` (one
    ``T.Lop`` function per bottom) are rebuilt.

    If rebuilding fails for any reason, the previous ``function_str`` and
    ``top_shape`` are restored so that the next call retries instead of
    treating the cache as valid.

    Args:
        bottom: indexable sequence of input blobs; only ``.shape`` is read.
        top: unused here, but visible to the evaluated expression.

    Raises:
        ValueError: if a bottom has a rank outside 1..4, or if gradients are
            needed (at least one bottom) and the top rank is outside 1..4.
    """
    function_str = self.pythonargs[0]
    top_shape = self.pythonargs[1]
    if not (self.function_str != function_str or
            self.top_shape != top_shape):
        return

    previous_function_str = self.function_str
    previous_top_shape = self.top_shape
    self.function_str = function_str
    self.top_shape = top_shape

    supported_ranks = (1, 2, 3, 4)
    compiled = False
    try:
        # Validate ranks before touching Theano so that a bad blob gives a
        # clear error instead of a later IndexError/NameError.
        bottom_ranks = []
        for i in range(len(bottom)):
            rank = len(bottom[i].shape)
            if rank not in supported_ranks:
                raise ValueError(
                    'bottom[%d] has unsupported rank %d (expected 1-4)'
                    % (i, rank))
            bottom_ranks.append(rank)
        top_rank = len(top_shape)
        # The top variable is only needed to build gradients, i.e. when
        # there is at least one bottom.
        if len(bottom) > 0 and top_rank not in supported_ranks:
            raise ValueError(
                'top has unsupported rank %d (expected 1-4)' % top_rank)

        import theano.tensor as T
        import theano
        from theano.sandbox.cuda.basic_ops import gpu_from_host

        make_var = {1: T.vector, 2: T.matrix, 3: T.tensor3, 4: T.tensor4}
        x = []
        for i in range(len(bottom)):
            x.append(make_var[bottom_ranks[i]]('x%d' % i))

        # NOTE(review): eval() executes arbitrary code from the layer
        # arguments with this function's locals (x, T, theano, self, ...) in
        # scope; only use with trusted network definitions.
        y = eval(function_str)
        forward = theano.function(x, gpu_from_host(y),
                                  on_unused_input='ignore')

        backward = []
        if top_rank in make_var:
            v = make_var[top_rank]('v')
        for i in range(len(bottom)):
            yg = T.Lop(y, x[i], v)
            backward.append(
                theano.function(x + [v], gpu_from_host(yg),
                                on_unused_input='ignore'))

        # Publish both results together, only after everything compiled.
        self.f = forward
        self.b = backward
        compiled = True
    finally:
        if not compiled:
            self.function_str = previous_function_str
            self.top_shape = previous_top_shape
def grad_step(*args):
    """One accumulation step over a mini-batch slice.

    ``args[0]`` is the step index, ``args[1]`` the running cost and
    ``args[2:]`` the running gradient accumulators.  The inputs in
    ``loc_inputs`` are sliced to chunk ``idx`` of size ``options['cbs']``,
    the training cost is cloned onto that slice and differentiated with
    respect to ``model.params``.  ``ifelse`` on ``comp_grad`` selects between
    the accumulated gradients (moved with ``gpu_from_host``) and the plain
    gradients.

    Returns ``[index + 1, cost + slice cost] + gradients``.
    """
    idx = TT.cast(args[0], 'int32')
    chunk = options['cbs']
    sliced_inputs = [inp[idx * chunk:(idx + 1) * chunk] for inp in loc_inputs]
    substitutions = dict(zip(model.inputs, sliced_inputs))
    slice_cost = safe_clone(model.train_cost, replace=substitutions)

    grads = TT.grad(slice_cost, model.params)
    accumulated = [running + grad
                   for running, grad in zip(args[2:2 + n_params], grads)]
    plain_branch = list(grads)
    gpu_branch = [gpu_from_host(total) for total in accumulated]
    selected = ifelse(comp_grad, gpu_branch, plain_branch, gpu=True)

    # Make every result match the type of the accumulator it replaces.
    typed = [old.type.filter_variable(new)
             for old, new in zip(args[2:], selected)]
    return [args[0] + const(1), args[1] + slice_cost] + typed
def test_gpualloc_output_to_gpu():
    """Check that ``ones_like`` of a shared GPU variable allocates on the GPU.

    The same expression is compiled without and with the GPU mode; the CPU
    graph must contain exactly one ``T.alloc`` node, the GPU graph exactly
    one ``B.gpu_alloc`` node, and both functions must agree numerically.
    """
    host_values = numpy.asarray(numpy.random.rand(4, 5), dtype='float32')
    a = tcn.shared_constructor(host_values)
    b = T.fscalar()

    f = theano.function([b], T.ones_like(a) + b, mode=mode_without_gpu)
    f_gpu = theano.function([b], B.gpu_from_host(T.ones_like(a)) + b,
                            mode=mode_with_gpu)

    print(f.maker.env.toposort())
    print(f_gpu.maker.env.toposort())
    print(f(2))
    print(f_gpu(2))

    cpu_allocs = sum(apply.op == T.alloc
                     for apply in f.maker.env.toposort())
    gpu_allocs = sum(apply.op == B.gpu_alloc
                     for apply in f_gpu.maker.env.toposort())
    assert cpu_allocs == 1
    assert gpu_allocs == 1

    expected = numpy.ones(a.get_value(borrow=True).shape) + 9
    assert numpy.allclose(expected, f_gpu(9))
    assert numpy.allclose(f(5), f_gpu(5))
def local_gpu_forloop(node):
    """Local rewrite that feeds a ``forloop`` node with GPU inputs.

    The rewrite fires when the node's op is a ``forloop`` instance and at
    least one input is the output of ``host_from_gpu``.  Every input that is
    not already of ``CudaNdarrayType`` is wrapped in ``gpu_from_host``, the op
    is re-applied to those inputs, and every output is wrapped in
    ``host_from_gpu``.

    Returns the list of replacement outputs, ``False`` when a ``forloop``
    node has no input coming from ``host_from_gpu``, and ``None`` (implicitly)
    for any other node.

    NOTE(review): the original indentation of this function was lost; the
    trailing ``else`` is paired here with ``if sw`` -- confirm against the
    upstream source.
    """
    if isinstance(node.op, forloop):
        # ``sw`` becomes True if any input comes straight from a GPU->host
        # transfer.
        sw = False
        for inp in node.inputs:
            if inp.owner and inp.owner.op == host_from_gpu:
                sw = True
        if sw:
            inps = node.inputs
            nw_inps = []
            # Move every input that is not already a CUDA variable to the GPU.
            for inp in inps:
                if not isinstance(inp.type, CudaNdarrayType):
                    nw_inps.append(gpu_from_host(inp))
                else:
                    nw_inps.append(inp)
            new_outs = node.op(*nw_inps)
            # All outputs are brought back to the host unconditionally.
            return [host_from_gpu(x) for x in new_outs]
        else:
            return False
def use_gpu_cumsum(node):
    """Replace a float32 ``CumsumOp`` fed from the GPU by ``GpuCumsum``.

    The rewrite applies only when the node's op is exactly ``CumsumOp``, its
    first input is float32 and that input is produced by ``HostFromGpu``.
    It is skipped (``None`` is returned) when an explicit axis is given for
    an input with more than ``GpuCumsum.SUPPORTED_NDIMS`` dimensions.

    Returns a one-element list with the host-side result, or ``None``.
    """
    source = node.inputs[0]
    applicable = (type(node.op) is CumsumOp
                  and source.dtype == 'float32'
                  and source.owner
                  and isinstance(source.owner.op, HostFromGpu))
    if not applicable:
        return None

    axis = node.op.axis
    if axis is not None and source.ndim > GpuCumsum.SUPPORTED_NDIMS:
        return None

    on_gpu = gpu_from_host(source)
    if axis is None:
        # ``GpuCumsum`` assumes the array has been flattened if needed.
        if on_gpu.ndim > 1:
            on_gpu = GpuFlatten()(on_gpu)
        axis = 0

    return [host_from_gpu(GpuCumsum(axis)(on_gpu))]
# Convolution configuration under test.  ``input_shape`` is used below but is
# not defined in this excerpt.
filter_shape = (64, 8, 3, 3)
padding = "valid"  # (1, 1)
strides = (1, 1)

# Alternative configuration, kept for reference:
# input_shape = (32, 16, 48, 48)
# filter_shape = (24, 16, 3, 3)
# padding = (1, 1)
# strides = (1, 1)

# Forward pass: compare the nervana convolution against cuDNN on the same
# random input and filters.
print "fprop"
x = theano.shared(np.random.normal(0, 1, input_shape).astype(theano.config.floatX))
w = theano.shared(np.random.normal(0, 1, filter_shape).astype(theano.config.floatX))
y_cudnn = dnn.dnn_conv(x, w, border_mode=padding, subsample=strides, conv_mode='cross')
y_nervana_raw = nervana_conv(x, w, padding=padding, strides=strides)
y_nervana = gpu_from_host(y_nervana_raw)
val_cudnn = np.array(y_cudnn.eval())
val_nervana = np.array(y_nervana.eval())
assert np.allclose(val_cudnn, val_nervana)

# Same forward pass with the data already transposed to c01b layout, so the
# op is called with dimshuffle=False; the result is transposed back before
# comparing.
print "fprop without dimshuffle"
x_nodimshuffle = theano.shared(x.get_value().transpose(1, 2, 3, 0))  # c01b
w_nodimshuffle = theano.shared(w.get_value().transpose(1, 2, 3, 0))  # c01b
y_nervana_nodimshuffle = gpu_from_host(nervana_conv(x_nodimshuffle, w_nodimshuffle, padding=padding, strides=strides, dimshuffle=False))
val_nervana_nodimshuffle = np.array(y_nervana_nodimshuffle.eval()).transpose(3, 0, 1, 2)
assert np.allclose(val_nervana, val_nervana_nodimshuffle)
def train_nn(data_file_name, reg_lambda=0.01, learning_rate=0.01, n_eigs=100, n_neurons_per_layer=100, batch_size=100, display=True):
    """Train an MLP on eigenface features and report its test performance.

    Images are loaded from ``data_file_name`` and projected into face space
    with ``n_eigs`` eigenfaces.  The MLP cost plus ``reg_lambda`` times its
    ``L2_sqr`` term is minimised by full-batch gradient descent until the
    cost changes by at most 0.001 between two consecutive steps.  The test
    error and the Pearson correlation between real and predicted scores are
    logged, and a correlation plot is produced.

    ``batch_size`` and ``display`` are accepted but not used by this
    function.
    """
    train_data, test_data, file_names = old_load_images(data_file_name)
    eig_face = EigenFace.from_file(train_data[0], data_file_name, n_eigs)
    train_data[0] = get_face_space(data_file_name, 'train_x',
                                   train_data[0], eig_face)
    test_data[0] = get_face_space(data_file_name, 'test_x',
                                  test_data[0], eig_face)
    n_features, n_training_examples = train_data[0].shape
    real_scores = test_data[1].T.tolist()

    train_data = to_theano_shared(train_data)
    test_data = to_theano_shared(test_data)

    rng = numpy.random.RandomState(1234)
    x = T.matrix('x')
    y = T.vector('y')
    mlp = MLP(rng, x, n_features, n_neurons_per_layer, n_training_examples)
    cost = mlp.cost(y) + reg_lambda * mlp.L2_sqr

    test_model = theano.function(
        [],
        outputs=[cost, mlp.output],
        givens={x: test_data[0][:], y: test_data[1][:]})

    # Plain gradient descent: one update rule per parameter.
    gradients = [T.grad(cost, param) for param in mlp.params]
    updates = {}
    for param, gradient in zip(mlp.params, gradients):
        updates[param] = param - learning_rate * gradient

    train_model = theano.function(
        [],
        outputs=theano.Out(gpu_from_host(cost), borrow=True),
        updates=updates,
        givens={x: train_data[0][:], y: train_data[1][:]})

    current_cost = numpy.asarray(train_model())
    logging.info('initial cost %f' % current_cost)
    previous_cost = 0
    step = 0
    logging.info('beginning stochastic gradient descent')
    while abs(current_cost - previous_cost) > 0.001:
        previous_cost = current_cost
        current_cost = numpy.asarray(train_model())
        if step % 10 == 0:
            logging.info('iteration % 9d cost % 9f' % (step, current_cost))
        step += 1

    error, predictions = test_model()

    # Report the results
    logging.info('training cost minimised: %f' % current_cost)
    logging.info('test error: %f' % error)
    predictions = predictions[0].tolist()
    logging.debug('predictions %s', str(predictions))
    pearsons = pearsonr(real_scores, predictions)
    logging.info('pearsons correlation: %f, %f' % pearsons)

    # Plot predicted against real scores
    title_template = ('neural network with %d neurons'
                      'learning rate %f and reg-lambda %f pearsons %f')
    plot_title_data = (n_neurons_per_layer, learning_rate, reg_lambda,
                       pearsons[0])
    plot_correlation(real_scores, predictions, file_names,
                     title_template % plot_title_data,
                     'nn', show=True, pearsons=pearsons)
# padding = (1, 1) # strides = (1, 1) print "fprop" x = theano.shared( np.random.normal(0, 1, input_shape).astype(theano.config.floatX)) w = theano.shared( np.random.normal(0, 1, filter_shape).astype(theano.config.floatX)) y_cudnn = dnn.dnn_conv(x, w, border_mode=padding, subsample=strides, conv_mode='cross') y_nervana_raw = nervana_conv(x, w, padding=padding, strides=strides) y_nervana = gpu_from_host(y_nervana_raw) val_cudnn = np.array(y_cudnn.eval()) val_nervana = np.array(y_nervana.eval()) assert np.allclose(val_cudnn, val_nervana) print "fprop without dimshuffle" x_nodimshuffle = theano.shared(x.get_value().transpose(1, 2, 3, 0)) # c01b w_nodimshuffle = theano.shared(w.get_value().transpose(1, 2, 3, 0)) # c01b y_nervana_nodimshuffle = gpu_from_host( nervana_conv(x_nodimshuffle, w_nodimshuffle, padding=padding, strides=strides,
x_cc.set_value(x_val.transpose( 1, 2, 3, 0)) # cuda-convnet expects the batch size in the trailing dimension. w_cc.set_value(w_val[:, :, ::-1, ::-1].transpose( 1, 2, 3, 0)) # cuda-convnet doesn't flip the filters, # trailing dimension should be number of output channels. # by doing these transformations in advance on the host, these differences # cannot affect running times of the convolutions themselves. y_theano = conv.conv2d(x, w, image_shape=shape_x, filter_shape=shape_w) y_cc = filter_acts_op(x_cc, w_cc) y_fft = fftconv.conv2d_fft(x, w, image_shape=shape_x, filter_shape=shape_w) print " compiling: Theano" f_theano = theano.function( [], gpu_from_host(y_theano)) # don't transfer to host print " compiling: cuda-convnet" f_cc = theano.function([], y_cc) # y_cc is already on the GPU print " compiling: FFT" f_fft = theano.function([], gpu_from_host(y_fft)) # don't transfer to host print print " verifying accuracy" # wrapping the function output in np.array causes a transfer to the host. out_theano = np.array(f_theano()) out_cc = np.array(f_cc()) out_fft = np.array(f_fft())
def compileModel(data, nInputs, nOutputs, hiddenLayersSize=[1200, 1200], dropoutRates=[0.2, 0.5, 0.5], activation='relu', weightInitMode='normal', regularizer=0.0001):
    """
    Build a symbolic MLP with Theano and compile its functions.

    ``data`` is indexed as ``data[0]``, ``data[1]`` and ``data[2]`` for the
    training, validation and test sets, each unpacked as an ``(x, y)`` pair.
    Every compiled function takes a vector of batch indices that selects rows
    of the corresponding set; the training function additionally takes the
    learning rate and the regularization value and applies plain gradient
    descent updates to the network parameters.

    Output:
        A list ``[trainF, validF, testF]`` with the compiled training,
        validation and test functions.
    """
    np.random.seed(815)

    x = T.matrix('x')
    y = T.wvector('y')
    learningRate = T.scalar('learningRate')
    regularization = T.scalar('regularization')

    # Data sets
    train_x, train_y = data[0]
    valid_x, valid_y = data[1]
    test_x, test_y = data[2]

    nnet = MLP(x, nInputs, hiddenLayersSize, nOutputs,
               dropoutRates=dropoutRates,
               activation=activation,
               weightInitMode=weightInitMode)
    loss = nnet.loss(y, regularization)
    error = nnet.error(y)

    gradients = T.grad(loss, nnet.params)
    weightUpdates = []
    for param, gradient in zip(nnet.params, gradients):
        weightUpdates.append((param, param - learningRate * gradient))

    batchIndicesVecctor = T.ivector('batchIndicesVecctor')

    def select_batch(set_x, set_y):
        # Substitute the selected rows of a data set for the symbolic inputs.
        return {
            x: set_x[batchIndicesVecctor],
            y: set_y[batchIndicesVecctor]
        }

    def compile_error(set_x, set_y):
        # Compiled function returning the error cast to floatX, kept on GPU.
        gpu_error = sbasic.gpu_from_host(T.cast(error, T.config.floatX))
        return function([batchIndicesVecctor],
                        Out(gpu_error, borrow=True),
                        givens=select_batch(set_x, set_y))

    trainF = function([batchIndicesVecctor, learningRate, regularization],
                      Out(sbasic.gpu_from_host(loss), borrow=True),
                      updates=weightUpdates,
                      givens=select_batch(train_x, train_y))
    validF = compile_error(valid_x, valid_y)
    testF = compile_error(test_x, test_y)

    return [trainF, validF, testF]
# Timing script: compares calling a compiled Theano function whose GPU output
# is returned normally against one whose output is wrapped in
# ``Out(..., borrow=True)``.
from theano import function, config, shared, sandbox, tensor, Out
import theano
import numpy
import time
from theano.sandbox.cuda.basic_ops import gpu_from_host

vlen = 10 * 30 * 768  # 10 x # cores x # threads per core
iters = 1000
#http://deeplearning.net/software/theano/tutorial/aliasing.html

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), theano.config.floatX))
# f1 returns the GPU result without borrow; f2 returns it with borrow=True.
f1 = function([], gpu_from_host(tensor.exp(x)))
f2 = function([], Out(gpu_from_host(tensor.exp(x)), borrow=True))

# Time ``iters`` calls of the non-borrowing function.
t0 = time.time()
for i in xrange(iters):
    r = f1()
t1 = time.time()
no_borrow = t1 - t0

# Time ``iters`` calls of the borrowing function.
t0 = time.time()
for i in xrange(iters):
    r = f2()
t1 = time.time()

# The trailing comma keeps both print statements on one output line.
print 'Looping', iters, 'times took', no_borrow, 'seconds without borrow',
print 'and', t1 - t0, 'seconds with borrow.'
# Report CPU use if any Elemwise node in f1's graph has an op class name that
# does not contain 'Gpu'.
if numpy.any([isinstance(x.op, tensor.Elemwise) and ('Gpu' not in type(x.op).__name__) for x in f1.maker.fgraph.toposort()]):
    print 'Used the cpu'
# Random filter values scaled by ``std``; ``x_val``, ``std`` and the shared
# variables are defined outside this excerpt.
w_val = np.random.randn(*shape_w).astype(theano.config.floatX) * std
x.set_value(x_val)
w.set_value(w_val)
# Load the same data into the cuda-convnet shared variables, in the layout
# that implementation expects.
x_cc.set_value(x_val.transpose(1, 2, 3, 0))  # cuda-convnet expects the batch size in the trailing dimension.
w_cc.set_value(w_val[:, :, ::-1, ::-1].transpose(1, 2, 3, 0))  # cuda-convnet doesn't flip the filters,
# trailing dimension should be number of output channels.
# by doing these transformations in advance on the host, these differences
# cannot affect running times of the convolutions themselves.

# Build the three convolution graphs to compare.
y_theano = conv.conv2d(x, w, image_shape=shape_x, filter_shape=shape_w)
y_cc = filter_acts_op(x_cc, w_cc)
y_fft = fftconv.conv2d_fft(x, w, image_shape=shape_x, filter_shape=shape_w)

print " compiling: Theano"
f_theano = theano.function([], gpu_from_host(y_theano))  # don't transfer to host
print " compiling: cuda-convnet"
f_cc = theano.function([], y_cc)  # y_cc is already on the GPU
print " compiling: FFT"
f_fft = theano.function([], gpu_from_host(y_fft))  # don't transfer to host
print
print " verifying accuracy"
# wrapping the function output in np.array causes a transfer to the host.
out_theano = np.array(f_theano())
out_cc = np.array(f_cc())
out_fft = np.array(f_fft())
def oneStep(gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1, *prev_hiddens):
    """Run one step of the model on the concatenated inputs.

    The five inputs are concatenated along axis 0, moved with
    ``cu.gpu_from_host`` and passed to ``self.model.forward`` together with
    the previous hidden states.  The returned states are rotated so that the
    last one comes first.
    """
    stacked = T.concatenate(
        [gfs_tm2, gfs_tm1, gfs_t, pm25_tm2, pm25_tm1], axis=0)
    step_input = cu.gpu_from_host(stacked)
    states = self.model.forward(step_input, prev_hiddens)
    # Return the states shifted by one position: last state first.
    last_state = states[-1]
    earlier_states = states[:-1]
    return [last_state] + earlier_states
def safe_to_gpu(x):
    """Return ``gpu_from_host(x)`` when ``x.type`` is a ``T.TensorType``.

    Any other variable is returned unchanged.
    """
    needs_transfer = isinstance(x.type, T.TensorType)
    return gpu_from_host(x) if needs_transfer else x
def create_cost_fun(self):
    """Set ``self.cost`` to the L2 norm of the prediction error per step.

    The cost is ``(predictions - pm25target).norm(L=2) / steps``, wrapped in
    ``cu.gpu_from_host``.
    """
    # The cost function may be changed later; keep that in mind.
    residual = self.predictions - self.pm25target
    per_step_norm = residual.norm(L=2) / self.steps
    self.cost = cu.gpu_from_host(per_step_norm)
def create_valid_error(self):
    """Set ``self.valid_error`` to the absolute prediction error.

    The error is ``abs(predictions - pm25target)``, wrapped in
    ``cu.gpu_from_host``.
    """
    residual = self.predictions - self.pm25target
    self.valid_error = cu.gpu_from_host(T.abs_(residual))
p_1 = 1 / (1 + T.exp(-T.dot(x, w)-b)) # Probability of having a one prediction = p_1 > 0.5 # The prediction that is done: 0 or 1 xent = -y*T.log(p_1) - (1-y)*T.log(1-p_1) # Cross-entropy # prediction = theano.Out(cuda.gpu_from_host(T.cast(p_1 > 0.5,theano.config.floatX)),borrow=True) # The prediction that is done: 0 or 1 # xent = cuda.gpu_from_host(T.cast(-y*T.log(p_1) - (1-y)*T.log(1-p_1),theano.config.floatX)) # Cross-entropy cost = xent.mean() + 0.01*(w**2).sum() # The cost to optimize gw,gb = T.grad(cost, [w,b]) # cost = cuda.gpu_from_host(xent.mean() + 0.01*(w**2).sum()) # The cost to optimize # gw,gb = cuda.gpu_from_host(T.grad(cost, [w,b])) # Compile expressions to functions train = theano.function( inputs=[], outputs=[theano.Out(cuda.gpu_from_host(T.cast(prediction,theano.config.floatX)),borrow=True), theano.Out(cuda.gpu_from_host(T.cast(xent,theano.config.floatX)),borrow=True)], updates=[(w, w-0.01*gw), (b, b-0.01*gb)], name = "train") predict = theano.function(inputs=[], outputs=theano.Out(cuda.gpu_from_host(T.cast(prediction,theano.config.floatX)),borrow=True), name = "predict") if any([x.op.__class__.__name__ in ['Gemv', 'CGemv', 'Gemm', 'CGemm'] for x in train.maker.fgraph.toposort()]): print('Used the cpu') elif any([x.op.__class__.__name__ in ['GpuGemm', 'GpuGemv'] for x in train.maker.fgraph.toposort()]): print('Used the gpu') else: print('ERROR, not able to tell if theano used the cpu or the gpu') print(train.maker.fgraph.toposort())