def pretraining_functions(self, train_set_x, batch_size, k):
    '''Build one compiled training function per RBM layer.

    Every returned function performs a single gradient-descent step on
    its layer for one minibatch.  It takes the minibatch index as its
    first argument and, optionally, the learning rate (default 0.1).
    Training a layer therefore amounts to calling its function for every
    minibatch index.

    :type train_set_x: theano.tensor.TensorType
    :param train_set_x: Shared var. that contains all datapoints used
                        for training the RBM

    :type batch_size: int
    :param batch_size: size of a [mini]batch

    :param k: number of Gibbs steps to do in CD-k / PCD-k

    :return: list of compiled functions, in the order of
             ``self.rbm_layers``
    '''
    minibatch_idx = T.lscalar('index')  # symbolic minibatch index
    lr = T.scalar('lr')  # symbolic learning rate

    # Row range of the minibatch selected by the index.
    first_row = minibatch_idx * batch_size
    last_row = first_row + batch_size

    def compile_step(layer):
        # CD-k training (persistent=None).
        # TODO: change cost function to reconstruction error
        layer_cost, layer_updates = layer.get_cost_updates(
            lr, persistent=None, k=k)
        return theano.function(
            inputs=[minibatch_idx, theano.In(lr, value=0.1)],
            outputs=layer_cost,
            updates=layer_updates,
            givens={self.x: train_set_x[first_row:last_row]},
        )

    return [compile_step(layer) for layer in self.rbm_layers]
def create_decoder_func(layers):
    """Compile a function that decodes a batch of latent codes.

    The decoder output is obtained by substituting a symbolic matrix for
    the output of ``layers['l_encoder_out']`` and evaluating
    ``layers['l_decoder_out']`` deterministically.

    :param layers: mapping that contains at least the keys
                   ``'l_encoder_out'`` and ``'l_decoder_out'``
    :return: compiled Theano function taking one float matrix
    """
    latent = T.fmatrix('Z')
    latent_batch = T.fmatrix('Z_batch')

    decoded = get_output(
        layers['l_decoder_out'],
        inputs={layers['l_encoder_out']: latent},
        deterministic=True,
    )

    # The batch argument is routed into the graph through `givens`.
    return theano.function(
        inputs=[theano.In(latent_batch)],
        outputs=decoded,
        givens={latent: latent_batch},
    )
def pretraining_functions(self, train_set_x, batch_size, k):
    """Return a list with one CD-k training function per RBM layer.

    Each function takes the minibatch index and an optional learning
    rate (default 0.1), applies the layer's parameter updates and
    returns the layer's cost.

    :param train_set_x: dataset that is sliced with the symbolic
                        minibatch range
    :param batch_size: number of rows per minibatch
    :param k: number of Gibbs steps passed to ``get_cost_updates``
    """
    idx = T.lscalar('index')
    lr = T.scalar('lr')

    start = idx * batch_size
    stop = start + batch_size

    compiled = []
    for layer in self.rbm_layers:
        layer_cost, layer_updates = layer.get_cost_updates(
            lr, persistent=None, k=k)
        step_inputs = [idx, theano.In(lr, value=0.1)]
        substitutions = {self.x: train_set_x[start:stop]}
        compiled.append(
            theano.function(
                inputs=step_inputs,
                outputs=layer_cost,
                updates=layer_updates,
                givens=substitutions,
            )
        )
    return compiled
def test_vm_gc():
    """Regression test: the lazy VM linker must cope with `RunOnce`.

    This already caused a bug in the trunk of Theano.  The bug was
    introduced in the trunk on July 5th, 2012 and fixed on July 30th.
    """
    vec = theano.tensor.vector()
    once = RunOnce()(vec)
    lazy_mode = theano.Mode(linker=theano.gof.vm.VM_Linker(lazy=True))

    # Case 1: mutable input, two outputs sharing the RunOnce result.
    two_outputs = theano.function(
        [theano.In(vec, mutable=True)],
        [once + 1, once + 2],
        mode=lazy_mode,
    )
    two_outputs([1, 2, 3])

    # Case 2: the RunOnce result is reused several times in one output.
    once = RunOnce()(vec)
    doubled = once + once
    reused = theano.function([vec], [doubled + doubled], mode=lazy_mode)
    reused([1, 2, 3])
def test_remove0():
    """Check that `sp.Remove0` drops explicit zeros from sparse matrices.

    For both the CSC and the CSR format the test builds a dense 3x3
    matrix, zeroes three entries after the sparse conversion (so they
    stay stored explicitly), runs the op and compares the stored size
    with scipy's own ``eliminate_zeros``.  In optimized modes it also
    asserts that the in-place variant of the op was selected.
    """
    # print(...) with a single argument prints the same text on Python 2
    # and Python 3; the former bare `print` statements were Python 2 only.
    print('')
    print('test_remove0()')
    configs = [
        # structure type, numpy matching class
        ('csc', scipy.sparse.csc_matrix),
        ('csr', scipy.sparse.csr_matrix),
    ]

    for sparse_format, matrix_class in configs:
        print("config: format='%(format)s', matrix_class=%(matrix_class)s"
              % {'format': sparse_format, 'matrix_class': matrix_class})

        # real
        origin = (numpy.arange(9) + 1).reshape(
            (3, 3)).astype(theano.config.floatX)
        mat = matrix_class(origin).astype(theano.config.floatX)
        mat[0, 1] = mat[1, 0] = mat[2, 2] = 0
        # The zeros written above must still be stored explicitly.
        assert mat.size == 9

        # symbolic
        x = theano.sparse.SparseType(format=sparse_format,
                                     dtype=theano.config.floatX)()
        # The In wrapper has to be there because theano has as rule not
        # to optimize inputs.
        f = theano.function([theano.In(x, borrow=True, mutable=True)],
                            sp.Remove0()(x))

        # assert optimization is applied in modes with optimization
        if theano.config.mode not in ['FAST_COMPILE']:
            # list of apply nodes in the optimized graph.
            nodes = f.maker.env.toposort()
            inplace_found = any(
                isinstance(node.op, sp.Remove0) and node.op.inplace
                for node in nodes)
            assert inplace_found, \
                'Inplacing optimization should have been applied.'

        # checking
        # `target` deliberately aliases `mat`: eliminate_zeros() below
        # mutates it in place, so `target` is the expected result.
        target = mat
        result = f(mat)
        mat.eliminate_zeros()
        assert result.size == target.size, \
            'Matrices sizes differ. Have zeros been removed ?'
def test_partial_input_aliasing_affecting_inplace_operations(self):
    """Partially overlapping mutable inputs must not corrupt the result.

    Note: to trigger this bug with theano rev 4586:2bc6fc7f218b, you
    need to make the inputs mutable (so that inplace operations are
    used) and to break the elemwise composition with some non-elemwise
    op (here dot).

    Test 2. If variables only partially overlap: more exactly we care
    about the case when we have a, b, c where a shares memory with b,
    b shares memory with c, but c does not share memory with a.
    """
    vectors = [theano.tensor.dvector() for _ in range(3)]
    matrices = [theano.tensor.dmatrix() for _ in range(3)]
    x, y, z = vectors
    m1, m2, m3 = matrices

    mutable_inputs = [theano.In(var, mutable=True)
                      for var in vectors + matrices]
    expression = (
        theano.tensor.dot((x * 2), m1)
        + theano.tensor.dot((y * 3), m2)
        + theano.tensor.dot((z * 4), m3)
    )
    f = theano.function(mutable_inputs, expression)

    def fresh_vector():
        return np.asarray([1, 2, 3, 4, 5], dtype="float64")

    def fresh_matrix():
        return np.asarray([[1, 0], [0, 1]], dtype="float64")

    # Compute bogus values: all slices alias one vector, and the same
    # matrix object is passed three times.
    v = fresh_vector()
    m = fresh_matrix()
    bogus_vals = f(v[:2], v[1:3], v[2:4], m, m, m)

    # Since we used inplace operations v and m may be corrupted, so we
    # need to recreate them before computing the reference values.
    v = fresh_vector()
    m = fresh_matrix()
    m_copy1 = m.copy()
    v_copy1 = v.copy()
    m_copy2 = m.copy()
    v_copy2 = v.copy()
    vals = f(v[:2], v_copy1[1:3], v_copy2[2:4], m, m_copy1, m_copy2)

    assert np.allclose(vals, bogus_vals)
def pretraining_functions(self, train_set_x, batch_size):
    """Build training and reconstruction functions for every AE layer.

    For each layer in ``self.AE_layers`` two Theano functions are
    compiled:

    * a training step taking the minibatch index and an optional
      learning rate (default 0.1), returning the layer cost and applying
      the layer updates;
    * a reconstruction function taking a value for
      ``self.sigmoid_layers[-1].output`` and returning
      ``ae.get_reconstructed_input`` of it.

    :param train_set_x: dataset that is sliced with the symbolic
                        minibatch range
    :param batch_size: ignored, see the note below
    :return: tuple ``(pretrain_fns, z_outs)`` of two lists, in layer
             order
    """
    # NOTE(review): the caller's batch_size is overridden here, so every
    # minibatch holds exactly one row.  Kept as in the original; confirm
    # whether this is intentional.
    batch_size = 1

    index = T.lscalar('index')
    learning_rate = T.scalar('lr')

    batch_begin = index * batch_size
    batch_end = batch_begin + batch_size

    top_output = self.sigmoid_layers[-1].output

    pretrain_fns = []
    z_outs = []
    for ae in self.AE_layers:
        # The third value returned by get_cost_updates is not needed.
        cost, updates, _ = ae.get_cost_updates(learning_rate)
        fn = theano.function(
            inputs=[index, theano.In(learning_rate, value=0.1)],
            outputs=cost,
            updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]},
        )
        pretrain_fns.append(fn)

        z_out = ae.get_reconstructed_input(top_output)
        fn2 = theano.function(
            inputs=[top_output],
            outputs=z_out,
            on_unused_input='ignore',
            givens={self.x: train_set_x[batch_begin:batch_end]},
        )
        z_outs.append(fn2)

    return pretrain_fns, z_outs
def __make_train_function(self):
    """Return the compiled training function, compiling it on first use.

    The function takes the model inputs followed by ``self.is_training``
    (default value 1).  Its outputs mirror the container type of
    ``self.outputs``: a dict with an extra ``'loss'`` entry, or a list
    whose first element is the cost.

    :raises Exception: if the instance has no ``train_function``
                       attribute, i.e. the model was not compiled
    :return: the cached ``self.train_function``
    """
    if not hasattr(self, 'train_function'):
        raise Exception('Model should be compiled before training')

    if self.train_function is None:
        # Written through sys.stderr.write so that the message is emitted
        # identically on Python 2 and Python 3 (`print >> stream` is
        # Python 2 only syntax).
        sys.stderr.write('Compile training function\n')

        input_vars = list(self.inputs)
        input_vars.append(theano.In(self.is_training, value=1))

        if isinstance(self.outputs, dict):
            output_vars = {'loss': self.cost}
            output_vars.update(self.outputs)
        else:
            output_vars = [self.cost]
            output_vars.extend(self.outputs)

        self.train_function = theano.function(
            input_vars,
            output_vars,
            updates=self.updates,
            on_unused_input='ignore',
        )

    return self.train_function
def __init__(self,
             tt_input,
             tt_output,
             updates=None,
             name='Unnamed Function',
             borrow_inp=False,
             borrow_out=False,
             profile_execution=False):
    """Store the description of a Theano function without compiling it.

    :param tt_input: iterable of input variables
    :param tt_output: a single output or a list/tuple of outputs; a
                      single output is wrapped in a list and remembered
                      through ``self.single_return``
    :param updates: stored as ``self.updates``
    :param name: stored as ``self.name``
    :param borrow_inp: wrap every input in ``theano.In(..., borrow=True)``
    :param borrow_out: wrap every output in
                       ``theano.Out(..., borrow=True)``
    :param profile_execution: stored as ``self.profile``
    """
    self.name = name
    self.func = None  # no compiled function yet
    self.profile = profile_execution
    self.last_exec_time = None
    self.updates = updates

    self.tt_input = (
        [theano.In(var, borrow=True) for var in tt_input]
        if borrow_inp else tt_input
    )

    is_single = not isinstance(tt_output, (list, tuple))
    self.single_return = is_single
    outputs = [tt_output] if is_single else tt_output
    if borrow_out:
        outputs = [theano.Out(var, borrow=True) for var in outputs]
    self.tt_output = outputs
def reconstruction_loss(layer_dict):
    """Compile the reconstruction training step of the auto-encoder.

    The loss is the mean squared error between the input and the
    (non-deterministic) output of ``layer_dict['AAE_Output']``, averaged
    first over axis 1 and then over the batch.  Trainable parameters are
    updated with Nesterov momentum (momentum 0.9).

    :param layer_dict: mapping containing the key ``'AAE_Output'``
    :return: compiled function ``f(batch, lr)`` returning the loss
    """
    step_size = T.scalar('lr')           # learning rate
    net_input = T.fmatrix('input_var')   # network input
    minibatch = T.fmatrix('batch')       # data fed by the caller

    output_layer = layer_dict['AAE_Output']

    # Reconstructed input from the AE
    reconstructed = ll.get_output(output_layer, net_input,
                                  deterministic=False)

    # MSE between real input and reconstructed input
    per_sample_error = T.mean(T.sqr(net_input - reconstructed), axis=1)
    loss = T.mean(per_sample_error)

    # Update trainable parameters of AE
    trainable = ll.get_all_params(output_layer, trainable=True)
    param_updates = lasagne.updates.nesterov_momentum(
        loss, trainable, learning_rate=step_size, momentum=0.9)

    # Reconstruction loss a.k.a Lrecon
    return theano.function(
        inputs=[theano.In(minibatch), step_size],
        outputs=loss,
        updates=param_updates,
        givens={net_input: minibatch},
    )
def pretraining_functions(self, train_set_x, train_set_y, batch_size, k):
    """Compile the CD-k training step and a learning-rate decay function.

    :param train_set_x: data substituted for ``self.input_context``
    :param train_set_y: data substituted for ``self.input``
    :param batch_size: number of rows per minibatch
    :param k: number of Gibbs steps passed to ``get_cost_updates``
    :return: tuple ``(fn, update_ptlr)``; ``fn(index[, lr])`` runs one
             training step and returns the cost, ``update_ptlr()``
             returns the shared learning rate and multiplies it by
             0.999, clipped to ``[0.1 / batch_size * 0.01, 1]``
    """
    index = T.lscalar('index')  # index to a minibatch
    learning_rate = T.scalar('lr')  # learning rate to use

    pt_learning_rate = theano.shared(
        value=np.asarray(0.1, dtype=theano.config.floatX),
        borrow=True,
    )

    lr_floor = 0.1 / batch_size * 0.01
    decayed_lr = T.clip(pt_learning_rate * 0.999, lr_floor, 1)
    update_ptlr = theano.function(
        inputs=[],
        outputs=pt_learning_rate,
        updates={pt_learning_rate: decayed_lr},
    )

    cost, updates = self.get_cost_updates(learning_rate,
                                          persistent=None,
                                          k=k)

    row_start = index * batch_size
    row_stop = index * batch_size + batch_size

    # NOTE(review): the default learning rate is a snapshot of the shared
    # value taken at compile time; later update_ptlr() calls do not
    # change this default.
    default_lr = pt_learning_rate.get_value()
    fn = theano.function(
        inputs=[index, theano.In(learning_rate, value=default_lr)],
        outputs=cost,
        updates=updates,
        givens={
            self.input: train_set_y[row_start:row_stop],
            self.input_context: train_set_x[row_start:row_stop],
        },
    )

    return fn, update_ptlr
def pretrain_setup(self, train_set_x, batch_size, k):
    """Return one compiled CD-k training function per RBM layer.

    Each function takes the minibatch index and an optional learning
    rate (default 0.1), applies the layer updates and returns the cost.

    :param train_set_x: dataset that is sliced with the symbolic
                        minibatch range
    :param batch_size: nominal batch size; each step actually consumes
                       ``int(batch_size / 4)`` rows (see note below)
    :param k: number of Gibbs steps passed to ``get_cost_updates``
    :return: list of compiled functions, in the order of
             ``self.rbm_layers``
    """
    index = T.lscalar('index')  # index to a minibatch
    learning_rate = T.scalar('lr')  # learning rate to use

    # NOTE(review): a step covers a quarter of batch_size rows, as in the
    # original code; the reason is not visible here.
    rows_per_step = int(batch_size / 4)

    # beginning / ending of a batch, given `index`
    batch_begin = index * rows_per_step
    batch_end = batch_begin + rows_per_step

    pretrain_fns = []
    for rbm in self.rbm_layers:
        # get the cost and the updates list
        # using CD-k here (persistent=None) for training each RBM.
        # TODO: change cost function to reconstruction error
        cost, updates = rbm.get_cost_updates(learning_rate,
                                             persistent=None,
                                             k=k)

        # compile the theano function
        fn = theano.function(
            inputs=[index, theano.In(learning_rate, value=0.1)],
            outputs=cost,
            updates=updates,
            givens={self.x: train_set_x[batch_begin:batch_end]},
        )
        pretrain_fns.append(fn)

    return pretrain_fns
def _buildOptimizationFunction(self, X, n_steps, plr):
    """Compile the functions that optimize the variational parameters.

    Three Theano functions are stored on the instance, all taking
    ``X`` plus optional ``n_steps`` / ``plr`` arguments whose defaults
    come from ``self.params['n_steps']`` and ``self.params['param_lr']``:

    * ``self.optimize_mu_logcov``: per-iteration ELBO and gradient
      norms, the number of iterations and the ELBO difference;
    * ``self.final_elbo``: first and last ELBO, number of iterations,
      last gradient norms and the ELBO difference;
    * ``self.init_final_params``: first and last mu / logcov.

    :param X: symbolic input
    :param n_steps: symbolic number of optimization steps
    :param plr: symbolic learning rate for the variational parameters
    """
    mu_init, logcov_init = self._inference(X)

    traces = {}
    _, logcov_last, elbo_final = self._optimizeVariationalParams(
        X, mu_init, logcov_init, n_steps, plr, savedict=traces)

    def function_inputs():
        # A fresh input list (with fresh In wrappers) for every function.
        return [
            X,
            theano.In(n_steps, value=self.params['n_steps'],
                      name='n_steps'),
            theano.In(plr, value=self.params['param_lr'], name='plr'),
        ]

    def elbo_difference():
        delta, _ = self._estimateELBOEntropy(traces['elbo_its'][0],
                                             traces['elbo_its'][-1],
                                             logcov_init, logcov_last)
        return delta

    diff_elbo = elbo_difference()
    self.optimize_mu_logcov = theano.function(
        function_inputs(),
        [
            traces['elbo_its'],
            traces['gradnorm_mu_its'],
            traces['gradnorm_logcov_its'],
            traces['elbo_its'].shape[0],
            diff_elbo,
        ],
        name='Optimize ELBO wrt mu/cov',
    )

    # The estimate is rebuilt for the second function, as before.
    diff_elbo = elbo_difference()
    self.final_elbo = theano.function(
        function_inputs(),
        [
            traces['elbo_its'][0],
            traces['elbo_its'][-1],
            traces['elbo_its'].shape[0],
            traces['gradnorm_mu_its'][-1],
            traces['gradnorm_logcov_its'][-1],
            diff_elbo,
        ],
        name='Optimize ELBO wrt mu/cov',
    )

    self.init_final_params = theano.function(
        function_inputs(),
        [
            traces['mu_its'][0],
            traces['logcov_its'][0],
            traces['mu_its'][-1],
            traces['logcov_its'][-1],
        ],
        name='init/final params',
    )
def __theano_build__(self):
    """Build the Theano graph of the 3-layer GRU and compile its functions.

    The following attributes are set:

    * ``self.predict(x)``: list holding the per-step output
      distributions;
    * ``self.predict_class(x)``: argmax of the outputs along axis 1;
    * ``self.ce_error(x, y)``: summed categorical cross-entropy;
    * ``self.bptt(x, y)``: gradients ``[dE, dU, dW, db, dV, dc]``;
    * ``self.sgd_step(x, y, learning_rate[, decay])``: one RMSprop step
      (decay defaults to 0.9) updating the parameters and the caches
      ``self.mE`` ... ``self.mc``.
    """
    E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

    x = T.ivector('x')
    y = T.ivector('y')

    def gru_layer(i, x_0, s_previous):
        # Layer `i` uses slices 3*i (update gate), 3*i + 1 (reset gate)
        # and 3*i + 2 (candidate state) of U, W and b.
        z = T.nnet.hard_sigmoid(U[i * 3 + 0].dot(x_0) +
                                W[i * 3 + 0].dot(s_previous) + b[i * 3])
        r = T.nnet.hard_sigmoid(U[i * 3 + 1].dot(x_0) +
                                W[i * 3 + 1].dot(s_previous) +
                                b[i * 3 + 1])
        s_candidate = T.tanh(U[i * 3 + 2].dot(x_0) +
                             W[i * 3 + 2].dot(s_previous * r) +
                             b[i * 3 + 2])
        return (T.ones_like(z) - z) * s_candidate + z * s_previous

    def forward_prop_step(x_t, s_prev1, s_prev2, s_prev3):
        # Embedding layer
        x_e = E[:, x_t]

        # Three stacked GRU layers
        s1 = gru_layer(0, x_e, s_prev1)
        s2 = gru_layer(1, s1, s_prev2)
        s3 = gru_layer(2, s2, s_prev3)

        # Final output calculation; softmax returns a matrix with one
        # row, so only that row is kept.
        o_t = T.nnet.softmax(V.dot(s3) + c)[0]
        return [o_t, s1, s2, s3]

    [o, s1, s2, s3], updates = theano.scan(
        forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[
            None,
            dict(initial=T.zeros(self.hidden_dim)),
            dict(initial=T.zeros(self.hidden_dim)),
            dict(initial=T.zeros(self.hidden_dim)),
        ])

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))

    # Total cost
    cost = o_error

    # Gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # Assign functions
    self.predict = theano.function([x], [o], allow_input_downcast=True)
    self.predict_class = theano.function([x], prediction,
                                         allow_input_downcast=True)
    self.ce_error = theano.function([x, y], cost,
                                    allow_input_downcast=True)
    self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc],
                                allow_input_downcast=True)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # rmsprop cache updates
    mE = decay * self.mE + (1 - decay) * dE**2
    mU = decay * self.mU + (1 - decay) * dU**2
    mW = decay * self.mW + (1 - decay) * dW**2
    mV = decay * self.mV + (1 - decay) * dV**2
    mb = decay * self.mb + (1 - decay) * db**2
    mc = decay * self.mc + (1 - decay) * dc**2

    # Each parameter step is scaled by the freshly updated cache.
    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                 (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                 (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                 (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                 (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                 (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                 (self.mE, mE),
                 (self.mU, mU),
                 (self.mW, mW),
                 (self.mV, mV),
                 (self.mb, mb),
                 (self.mc, mc)],
        allow_input_downcast=True)
# RMSprop running averages of the squared gradients:
#     cache <- decay * cache + (1 - decay) * grad ** 2
cache_mF = decay * mF + (1 - decay) * dF**2
cache_md = decay * md + (1 - decay) * dd**2

# RNN rmsprop cache updates.
cache_mU_1 = decay * mU_1 + (1 - decay) * dU_1**2
cache_mU_2 = decay * mU_2 + (1 - decay) * dU_2**2
cache_mW = decay * mW + (1 - decay) * dW**2
cache_mV = decay * mV + (1 - decay) * dV**2
cache_mb = decay * mb + (1 - decay) * db**2
cache_mc = decay * mc + (1 - decay) * dc**2
cache_mh0_l1 = decay * mh0_l1 + (1 - decay) * dh0_l1**2
cache_mh0_l2 = decay * mh0_l2 + (1 - decay) * dh0_l2**2

# One RMSprop step.  Each parameter step is scaled by the *updated* cache
# (cache_*), not by the value the cache had before this step: all update
# expressions are evaluated with the old shared values, so dividing by the
# old cache lagged one step behind and, with a zero-initialised cache,
# scaled the very first step by 1 / sqrt(1e-6).
sgd_step = theano.function(
    [x, sentences, y, learning_rate, theano.In(decay, value=0.9)],
    [],
    updates=[
        (U_1, U_1 - learning_rate * dU_1 / T.sqrt(cache_mU_1 + 1e-6)),
        (U_2, U_2 - learning_rate * dU_2 / T.sqrt(cache_mU_2 + 1e-6)),
        (W, W - learning_rate * dW / T.sqrt(cache_mW + 1e-6)),
        (V, V - learning_rate * dV / T.sqrt(cache_mV + 1e-6)),
        (b, b - learning_rate * db / T.sqrt(cache_mb + 1e-6)),
        (c, c - learning_rate * dc / T.sqrt(cache_mc + 1e-6)),
        (F, F - learning_rate * dF / T.sqrt(cache_mF + 1e-6)),
        (d, d - learning_rate * dd / T.sqrt(cache_md + 1e-6)),
        (h0_l1,
         h0_l1 - learning_rate * dh0_l1 / T.sqrt(cache_mh0_l1 + 1e-6)),
        (h0_l2,
         h0_l2 - learning_rate * dh0_l2 / T.sqrt(cache_mh0_l2 + 1e-6)),
        (mU_1, cache_mU_1),
        (mU_2, cache_mU_2),
        (mW, cache_mW),
        (mV, cache_mV),
        (mb, cache_mb),
        (mc, cache_mc),
        (mF, cache_mF),
        (md, cache_md),
        (mh0_l1, cache_mh0_l1),
        (mh0_l2, cache_mh0_l2),
    ])

# Run a single step on the first sample.
sgd_step(X[0], test_text, Y[0], LEARNING_RATE)
def build_train_func(self, solver_mode="sgd", cost_factors=[], use_acc_mode=False, skip_build=False):
    """Collect the training updates and compile the training functions.

    The method gathers the cost of every layer that provides one, sums
    them (weighted by ``cost_factors``) into ``self.train_cost``, derives
    the parameter updates for the selected solver and, unless
    ``skip_build`` is set, compiles Theano functions into ``self.func``:

    * split mode (some layer has ``has_split`` set): lists
      ``"train_fwd"`` and ``"train_bwd"``;
    * ``use_acc_mode``: ``"train_begin"``, ``"train_step"`` and
      ``"train_end"``, which accumulate the updated values and assign
      their average at the end;
    * otherwise a single ``"train_step"``, whose graph is also dumped to
      ``graph.txt``.

    Attributes written: ``yt``, ``cost_list``, ``cost_layers``,
    ``cost_layer_names``, ``cost_factors``, ``train_cost``,
    ``use_split_mode``, ``updates`` and, in accumulate mode,
    ``acc_params``.

    :param solver_mode: ``"adam"`` selects the Adam-style update,
        ``"torch"`` or ``"nesterov"`` the torch-style momentum update,
        any other value the plain momentum update
    :param cost_factors: one weight per cost; an empty list means a
        weight of 1.0 for every cost (the default list is never mutated
        here)
    :param use_acc_mode: build the accumulate functions (only consulted
        when split mode is off)
    :param skip_build: only collect ``self.updates`` without compiling
        any Theano function

    NOTE(review): the nesting below was reconstructed from a flattened
    source; in particular the placement of the final "Exporting graph"
    statements inside the last ``else`` branch should be confirmed.
    """
    #arguments to function
    logging.info(
        "Building training functions - solver: %s, use_acc_mode: %s" %
        (solver_mode, use_acc_mode))
    # Symbolic solver arguments; `momentum` is a vector because the Adam
    # update reads momentum[0] and momentum[1].
    iteration = tensor.fscalar()
    learn_rate = tensor.fscalar()
    momentum = tensor.fvector()
    decay = tensor.fscalar()

    #find costs
    self.yt = []
    self.cost_list = []
    self.cost_layers = []
    self.cost_layer_names = []
    for layer in self.layers:
        # Target variables are numbered by the count of costs found so far.
        yt_index = tensor.lvector("target index %i" % len(self.cost_layers))
        yt_value = tensor.fvector("target value %i" % len(self.cost_layers))
        cost = layer.cost(yt_index, yt_value)
        # Layers returning None contribute no cost and no targets.
        if not cost is None:
            self.yt += [yt_index, yt_value]
            self.cost_list.append(cost)
            self.cost_layers.append(layer)
            self.cost_layer_names.append(layer.type_name)

    self.cost_factors = [1.0] * len(self.cost_list) if len(
        cost_factors) == 0 else cost_factors
    assert len(self.cost_factors) == len(
        self.cost_list
    ), "Different number of cost factors (%i) and cost layers (%i)" % (len(
        self.cost_factors), len(self.cost_layers))
    logging.info("Found %i costs in model:" % len(self.cost_layers),
                 list(zip(self.cost_layer_names, self.cost_factors)))

    # Weighted sum of all layer costs.
    self.train_cost = tensor.as_tensor_variable(0)
    for i, cost in enumerate(self.cost_list):
        self.train_cost += self.cost_factors[i] * cost

    if self.gradient_clip > 0.0:
        logging.info("Clipping gradient to [%f,%f]" %
                     (-self.gradient_clip, self.gradient_clip))
        self.train_cost = theano.gradient.grad_clip(
            self.train_cost, -self.gradient_clip, self.gradient_clip)

    #find split points
    # split_points always starts with 0 and ends with len(self.layers);
    # the indices of layers with has_split set go in between.
    split_points = [0]
    self.use_split_mode = False
    for index, layer in enumerate(self.layers):
        if layer.has_split:
            self.use_split_mode = True
            split_points.append(index)
    split_points.append(len(self.layers))

    if self.use_split_mode:
        logging.verbose("Using split mode with split points:", split_points)
        self.func["train_fwd"] = []
        self.func["train_bwd"] = []

    self.updates = []
    # One iteration per segment between two consecutive split points.
    for sp in range(len(split_points) - 1):
        logging.info("Building training functions for layers %i-%i" %
                     (split_points[sp], split_points[sp + 1]))
        # Boundary layers of the segment: no start layer for the first
        # segment, no end layer for the last one.
        split_start = self.layers[split_points[sp]] if sp > 0 else None
        split_end = self.layers[split_points[sp + 1]] if (
            sp + 2) < len(split_points) else None
        # Only the last segment differentiates the training cost directly.
        split_cost = self.train_cost if split_end is None else None
        # Layers strictly between the two split points.
        split_layers = []
        for i, layer in enumerate(self.layers):
            if (i > split_points[sp]) and (i < split_points[sp + 1]):
                split_layers.append(layer)

        #determine known_grads provided by previous backward passes
        from collections import OrderedDict
        split_known_grads = OrderedDict()
        for i in range(sp + 1, len(split_points) - 1):
            split_known_grads.update(
                self.layers[split_points[i]].split_known_grads())

        if len(split_known_grads) == 0:
            split_known_grads = None

        # Each helper below returns the update pairs for one parameter `p`
        # with gradient `g`, creating zero-initialised shared state of the
        # parameter's shape.
        def get_sgd_updates(p, g):
            # Moving average of the gradient; the momentum coefficient is
            # forced to 0 while `iteration` is not greater than 0.
            m = theano.shared(numpy.zeros(p.shape.eval(),
                                          dtype=theano.config.floatX),
                              broadcastable=p.broadcastable,
                              borrow=True)
            rho = tensor.switch(tensor.gt(iteration, 0), momentum[0], 0.0)
            m_update = rho * m + (1.0 - rho) * g
            p_update = p - learn_rate * m_update
            return [(p, p_update), (m, m_update)]

        def get_torch_updates(p, g):
            # Accumulated momentum buffer; the step uses the gradient plus
            # momentum[0] times the updated buffer.
            m = theano.shared(numpy.zeros(p.shape.eval(),
                                          dtype=theano.config.floatX),
                              broadcastable=p.broadcastable,
                              borrow=True)
            rho = tensor.switch(tensor.gt(iteration, 0), momentum[0], 0.0)
            m_update = rho * m + g
            p_update = p - learn_rate * (g + momentum[0] * m_update)
            return [(p, p_update), (m, m_update)]

        def get_adam_updates(p, g):
            # First and second moment estimates with coefficients
            # momentum[0] and momentum[1], bias-corrected with
            # `iteration + 1`.
            eps = 1e-8
            m = theano.shared(numpy.zeros(p.shape.eval(),
                                          dtype=theano.config.floatX),
                              broadcastable=p.broadcastable,
                              borrow=True)
            v = theano.shared(numpy.zeros(p.shape.eval(),
                                          dtype=theano.config.floatX),
                              broadcastable=p.broadcastable,
                              borrow=True)
            m_update = momentum[0] * m + (1.0 - momentum[0]) * g
            v_update = momentum[1] * v + (1.0 - momentum[1]) * (g * g)
            m_hat = m_update / (1.0 - tensor.pow(momentum[0], iteration + 1))
            v_hat = v_update / (1.0 - tensor.pow(momentum[1], iteration + 1))
            p_update = p - learn_rate * m_hat / (tensor.sqrt(v_hat) + eps)
            return [(p, p_update), (m, m_update), (v, v_update)]

        #append parameter updates
        # params_decay marks weights (True) and biases (False).
        params = []
        params_decay = []
        for layer in split_layers:
            params += layer.weights()
            params_decay += [True] * len(layer.weights())
            params += layer.biases()
            params_decay += [False] * len(layer.biases())

        #build updates
        print("known grads:", split_known_grads)
        grads = tensor.grad(split_cost, params, known_grads=split_known_grads)
        solver_updates = []
        for p, g, p_decay in zip(params, grads, params_decay):
            #add L2 weight decay if needed
            # Biases are only decayed when self.bias_decay is set.
            if p_decay or self.bias_decay:
                g += decay * p

            if solver_mode == "adam":
                solver_updates += get_adam_updates(p, g)
            elif solver_mode == "torch" or solver_mode == "nesterov":
                solver_updates += get_torch_updates(p, g)
            else:
                solver_updates += get_sgd_updates(p, g)

        #append per layer updates
        local_updates = solver_updates + sum(
            [layer.updates(self.train_cost) for layer in split_layers], [])

        #all updates
        self.updates += local_updates

        #skipping actual theano function building (if you just want updates, etc)
        if skip_build:
            continue

        global debug_train
        if debug_train:
            logging.warning("WARNING: Debug mode is active!")
            from theano.compile.nanguardmode import NanGuardMode
            debug_mode = theano.compile.MonitorMode(
                post_func=debug_detect_errors)
        else:
            debug_mode = None

        if self.use_split_mode:

            # Forward function: only for segments that end at a split layer.
            if not split_end is None:
                updates = sum(
                    [layer.split_forward() for layer in split_layers], [])
                updates += split_end.split_forward()

                print("fwd updates:", updates)
                f = theano.function([self.input], [],
                                    updates=updates,
                                    givens=[(denet.layer.get_train(),
                                             tensor.cast(1, 'int8'))],
                                    on_unused_input='ignore',
                                    mode=debug_mode)
                self.func["train_fwd"].append(f)

            # Backward function: only the last segment returns the costs.
            outputs = ([self.train_cost] +
                       self.cost_list) if split_end is None else []
            updates = sum([
                layer.split_backward(split_cost, split_known_grads)
                for layer in split_layers
            ], [])
            if not split_start is None:
                updates += split_start.split_backward(
                    split_cost, split_known_grads)

            print("bwd updates:", updates)
            updates += local_updates
            f = theano.function([
                denet.layer.get_epoch(), iteration, learn_rate, momentum,
                decay, self.input
            ] + self.yt,
                                outputs,
                                updates=updates,
                                givens=[(denet.layer.get_train(),
                                         tensor.cast(1, 'int8'))],
                                on_unused_input='ignore',
                                mode=debug_mode)
            # Inserted at the front, so the list ends up in reverse
            # segment order.
            self.func["train_bwd"].insert(0, f)

        elif use_acc_mode:
            # Accumulate mode: "train_begin" zeroes the accumulators,
            # "train_step" adds each updated value and counts the step,
            # "train_end" assigns the accumulated average to the target.
            acc_counter = theano.shared(
                numpy.array(0, dtype=theano.config.floatX))
            begin_updates = [(acc_counter, tensor.zeros_like(acc_counter))]
            step_updates = [(acc_counter, acc_counter + 1)]
            end_updates = []
            self.acc_params = []
            for p_dest, p_src in self.updates:
                p_acc = theano.shared(numpy.zeros(
                    p_dest.shape.eval(), dtype=theano.config.floatX),
                                      broadcastable=p_dest.broadcastable,
                                      borrow=True)
                begin_updates.append((p_acc, tensor.zeros_like(p_acc)))
                step_updates.append((p_acc, p_acc + p_src))
                end_updates.append((p_dest, p_acc / acc_counter))
                self.acc_params.append(p_acc)

            logging.info(
                "Constructing parameter accumulate update functions (solver=%s)"
                % solver_mode)
            self.func["train_begin"] = theano.function(
                [], [], updates=begin_updates)
            self.func["train_step"] = theano.function(
                [
                    denet.layer.get_epoch(), iteration, learn_rate, momentum,
                    decay, self.input
                ] + self.yt, [self.train_cost] + self.cost_list,
                updates=step_updates,
                givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))],
                on_unused_input='ignore',
                allow_input_downcast=True,
                mode=debug_mode)
            self.func["train_end"] = theano.function([], [],
                                                     updates=end_updates)
        else:
            logging.info(
                "Constructing parameter update function (solver=%s)" %
                solver_mode)

            #making
            # The input and the targets are passed with borrow=True.
            f_input = theano.In(self.input, borrow=True)
            f_yt = [theano.In(yt, borrow=True) for yt in self.yt]
            self.func["train_step"] = theano.function(
                [
                    denet.layer.get_epoch(), iteration, learn_rate, momentum,
                    decay, f_input
                ] + f_yt, [self.train_cost] + self.cost_list,
                updates=self.updates,
                givens=[(denet.layer.get_train(), tensor.cast(1, 'int8'))],
                on_unused_input='ignore',
                allow_input_downcast=True,
                mode=debug_mode)

            # Dump the compiled graph of the training step to graph.txt.
            logging.verbose("Exporting graph...")
            with open("graph.txt", "w") as f:
                theano.printing.debugprint(self.func["train_step"],
                                           file=f,
                                           print_type=True)
def __theano_build__(self):
    """Build the Theano graph of the 2-layer LSTM and compile its functions.

    Attributes set: ``self.predict``, ``self.predict_class``,
    ``self.ce_error``, ``self.bptt`` and ``self.sgd_step``.  When
    ``self.embedded`` is true the embedding matrix ``E`` is left out of
    the gradients returned by ``bptt`` and of the RMSprop updates.
    """
    E, V, U, W, b, c, embedded = (self.E, self.V, self.U, self.W, self.b,
                                  self.c, self.embedded)

    x = T.ivector('x')
    y = T.ivector('y')

    def lstm_cell(base, cell_input, s_prev, c_prev):
        # Slices base .. base + 3 of U, W and b hold the input, forget and
        # output gates and the candidate, in that order.
        def pre_activation(offset):
            k = base + offset
            return U[k].dot(cell_input) + W[k].dot(s_prev) + b[k]

        in_gate = T.nnet.hard_sigmoid(pre_activation(0))
        forget_gate = T.nnet.hard_sigmoid(pre_activation(1))
        out_gate = T.nnet.hard_sigmoid(pre_activation(2))
        candidate = T.tanh(pre_activation(3))
        c_new = c_prev * forget_gate + candidate * in_gate
        s_new = T.tanh(c_new) * out_gate
        return s_new, c_new

    def forward_prop_step(x_t, s_t1_prev, c_t1_prev, s_t2_prev, c_t2_prev):
        # Word embedding layer
        x_e = E[:, x_t]

        # Two stacked LSTM layers
        s_t1, c_t1 = lstm_cell(0, x_e, s_t1_prev, c_t1_prev)
        s_t2, c_t2 = lstm_cell(4, s_t1, s_t2_prev, c_t2_prev)

        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need
        # the row
        o_t = T.nnet.softmax(V.dot(s_t2) + c)[0]
        return [o_t, s_t1, c_t1, s_t2, c_t2]

    zero_state = lambda: dict(initial=T.zeros(self.hidden_dim))
    [o, s1, cm1, s2, cm2], scan_updates = theano.scan(
        forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[None, zero_state(), zero_state(), zero_state(),
                      zero_state()])

    prediction = T.argmax(o, axis=1)

    # Total cost (could add regularization here)
    cost = T.sum(T.nnet.categorical_crossentropy(o, y))

    # Gradients
    dE = T.grad(cost, E)
    dU = T.grad(cost, U)
    dW = T.grad(cost, W)
    db = T.grad(cost, b)
    dV = T.grad(cost, V)
    dc = T.grad(cost, c)

    # Assign functions
    self.predict = theano.function([x], o)
    self.predict_class = theano.function([x], prediction)
    self.ce_error = theano.function([x, y], cost)

    bptt_outputs = [dE, dU, dW, db, dV, dc]
    if embedded:
        bptt_outputs = bptt_outputs[1:]
    self.bptt = theano.function([x, y], bptt_outputs)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # (parameter, gradient, rmsprop cache) in update order.
    rms_state = [
        (E, dE, self.mE),
        (U, dU, self.mU),
        (W, dW, self.mW),
        (V, dV, self.mV),
        (b, db, self.mb),
        (c, dc, self.mc),
    ]
    # rmsprop cache updates
    new_caches = [decay * cache + (1 - decay) * grad**2
                  for _, grad, cache in rms_state]

    # With a fixed embedding the first entry (E) is not trained.
    first = 1 if embedded else 0
    param_updates = [
        (param, param - learning_rate * grad / T.sqrt(new_cache + 1e-6))
        for (param, grad, _), new_cache in zip(rms_state[first:],
                                               new_caches[first:])
    ]
    cache_updates = [
        (cache, new_cache)
        for (_, _, cache), new_cache in zip(rms_state[first:],
                                            new_caches[first:])
    ]

    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=param_updates + cache_updates)
# Sanity check of the curve length against direct numerical integration.
print("curvLength test: ", curvLength(one, 0.0, 1.0))
print(quad(lambda t: fLength(one, t), 0.0, 1.0))

# Symbolic height function h(x, y) and its partial derivatives.
x = theano.tensor.dscalar()
y = theano.tensor.dscalar()
h = (
    (5000 - 0.005 * (x * x + y * y + x * y) + 12.5 * (x + y))
    * theano.tensor.exp(
        -abs(0.000001 * (x * x + y * y) - 0.0015 * (x + y) + 0.7))
)
fh = theano.function([x, y], h)
gradH = theano.gradient.grad(h, [x, y])
gradHX = theano.function([x, y], gradH[0])
gradHY = theano.function([x, y], gradH[1])
if DEBUG:
    print(fh(0, 0), gradHX(0, 0), gradHY(0, 0), x.dtype)

# Restrictions to x = 0: functions of y only, x defaults to 0.
fhX0 = theano.function([y, theano.In(x, value=0)], h)
gradHX0Y = theano.function([y, theano.In(x, value=0)], gradH[1])

eps = 1e-12

# Bisection on [0, 1600] for the sign change of dh/dy along x = 0; stops
# once the midpoint is no longer distinguishable from an end point.
left = 0.0
right = 1600.0
mid = (left + right) / 2
while mid != left and mid != right:
    if gradHX0Y(mid) > 0:
        left = mid
    else:
        right = mid
    mid = (left + right) / 2
y0 = left
def _create_iter_funcs(self, layers, objective, update, output_type):
    """Compile the train, eval and predict functions of the network.

    :param layers: mapping of layers; its ``InputLayer`` values define
                   the network inputs
    :param objective: callable building a loss from the layers and the
                      target
    :param update: callable building the parameter updates from the
                   gradients and the parameters
    :param output_type: factory of the symbolic target variable
    :return: tuple ``(train_iter, eval_iter, predict_iter)``
    """
    target = output_type('y_batch')

    objective_kw = self._get_params_for('objective')
    loss_train = objective(layers, target=target, **objective_kw)
    loss_eval = objective(layers, target=target, deterministic=True,
                          **objective_kw)

    predict_proba = get_output(self._output_layers, None,
                               deterministic=True)

    if self.regression:
        # No class accuracy for regression: report the eval loss instead.
        accuracy = loss_eval
    else:
        predicted_class = predict_proba[0].argmax(axis=1)
        accuracy = T.mean(T.eq(predicted_class, target))

    scores_train = [score[1](predict_proba, target)
                    for score in self.scores_train]
    scores_valid = [score[1](predict_proba, target)
                    for score in self.scores_valid]

    all_params = self.get_all_params(trainable=True)
    grads = theano.grad(loss_train, all_params)
    # Parameters may carry a per-parameter gradient scale in their tag.
    for position, param in enumerate(all_params):
        scale = getattr(param.tag, 'grad_scale', 1)
        if scale != 1:
            grads[position] *= scale

    update_kw = self._get_params_for('update')
    updates = update(grads, all_params, **update_kw)

    X_inputs = [
        theano.In(layer.input_var, name=layer.name)
        for layer in layers.values()
        if isinstance(layer, InputLayer)
    ]
    inputs = X_inputs + [theano.In(target, name="y")]

    train_iter = theano.function(
        inputs=inputs,
        outputs=[loss_train] + scores_train,
        updates=updates,
        allow_input_downcast=True,
    )
    eval_iter = theano.function(
        inputs=inputs,
        outputs=[loss_eval, accuracy] + scores_valid,
        allow_input_downcast=True,
    )
    predict_iter = theano.function(
        inputs=X_inputs,
        outputs=predict_proba,
        allow_input_downcast=True,
    )

    return train_iter, eval_iter, predict_iter
# A second expression for the logistic function, written with tanh.
out_2 = (1 + T.tanh(x / 2)) / 2
logistic = function(inputs=[x], outputs=out)
logistic_2 = function(inputs=[x], outputs=out_2)

# One function returning several outputs at once.
a, b = T.dmatrices('a', 'b')
diff = a - b
abs_diff = abs(diff)
diff_squared = diff**2
f = function(inputs=[a, b], outputs=[diff, diff_squared, abs_diff])

# Setting a default value for an argument; `w` can additionally be passed
# by the keyword name 'w_by_name', e.g. f(33) or
# f(33, w_by_name=10, y=2).
x, y, w = T.dscalars('x', 'y', 'w')
z = (x + y) * w
f = function(
    inputs=[
        x,
        theano.In(y, value=1),
        theano.In(w, value=2, name='w_by_name'),
    ],
    outputs=z,
)

# Using shared variables: both functions return the state and then apply
# their update to it.
state = shared(0)
inc = T.iscalar('inc')
accumulator = function(inputs=[inc], outputs=state,
                       updates=[(state, state + inc)])
decrementor = function(inputs=[inc], outputs=state,
                       updates=[(state, state - inc)])

# `givens` replaces the shared state by an ordinary input for this one
# function.
fn_of_state = state * 2 + inc
foo = T.scalar(dtype=state.dtype)
skip_shared = function(inputs=[inc, foo], outputs=fn_of_state,
                       givens=[(state, foo)])
skip_shared(1, 3)
def train(args, trial=11, no_valid=False):
    """Build the data streams, model graph and extensions, then run training.

    Many names used here (``modData``, ``modDataValid``, ``models_folder``,
    ``prefix``, ``hidden_size``, ``dropout``, ``onlyPlots``, ...) are not
    defined in this function and must exist at module level.

    :param args: object with the attributes read below (``data_name``,
        ``train_size``, ``transitions``, ``rnn_type``, ``hidden_size``,
        ``num_layers``, ``dropout``, ``no_valid``, ``truncate_gradient``,
        ``boosting``); they are used to build dataset and output paths.
    :param trial: trial number embedded in the dataset and experiment names.
    :param no_valid: not read in this body (``args.no_valid`` is used for
        the experiment name instead).
    """
    # Creating unique strings to save for experiments.
    data_valid = ("data/" + args.data_name + "_trial_" + str(trial) +
                  "_valid_size_" + str(args.train_size) +
                  "_transitions_" + str(args.transitions))
    data_test = data_valid.replace("_valid_size", "_test_size")
    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = ("data/" + args.data_name + "_trial_" + str(trial) +
                  "_train_size_" + str(args.train_size) +
                  "_transitions_" + str(args.transitions))

    subStr = ("rnn_type_" + args.rnn_type + "_trial_" + str(trial) +
              "_hiddenSize_" + str(args.hidden_size) +
              "_numLayers_" + str(args.num_layers) +
              "_dropout_" + str(args.dropout) +
              "_train_size_" + str(args.train_size) +
              "_transitions_" + str(args.transitions) +
              "_novalid_" + str(args.no_valid))

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")

        data_valid = ("data/" + args.data_name + "_m3_trial_" + str(trial) +
                      "_valid_size_" + str(args.train_size) +
                      "_transitions_" + str(args.transitions))
        data_test = ("data/" + args.data_name + "_m3_trial_" + str(trial) +
                     "_test_size_" + str(args.train_size) +
                     "_transitions_" + str(args.transitions))

    print("on test: " + subStr)
    # Perform folder prefixing
    # NOTE(review): `prefix_path` is assigned but never read, while `prefix`
    # below is not defined in this function. Confirm whether `prefix` is a
    # module-level name or was meant to be `prefix_path`.
    prefix_path = (models_folder + args.data_name + "/" + subStr +
                   "_tgrad_" + str(args.truncate_gradient) +
                   "_boost_" + bStr(args.boosting))

    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path

    plots_output2 = (plots_output + args.data_name + "/" + subStr +
                     "_tgrad_" + str(args.truncate_gradient) +
                     "_boost_" + bStr(args.boosting))

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(
        data_train, sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(
        data_valid, sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(
        data_test, sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    # Index theano variables: every model input is a column slice
    # [From:To] of the corresponding shared training array.
    x = sharedDataTrain['x'][:, batch_index_From:batch_index_To]
    x.name = 'x'

    x_mask = sharedDataTrain['x_mask'][:, batch_index_From:batch_index_To]
    x_mask.name = 'x_mask'

    x_mask_o = sharedDataTrain['x_mask_o'][:, batch_index_From:batch_index_To]
    x_mask_o.name = 'x_mask_o'

    x_mask_o_mask = sharedDataTrain[
        'x_mask_o_mask'][:, batch_index_From:batch_index_To]
    x_mask_o_mask.name = 'x_mask_o_mask'

    y = sharedDataTrain['y'][:, batch_index_From:batch_index_To]
    y.name = 'y'

    y_mask = sharedDataTrain['y_mask'][:, batch_index_From:batch_index_To]
    y_mask.name = 'y_mask'

    y_mask_o = sharedDataTrain['y_mask_o'][:, batch_index_From:batch_index_To]
    y_mask_o.name = 'y_mask_o'

    y_mask_o_mask = sharedDataTrain[
        'y_mask_o_mask'][:, batch_index_From:batch_index_To]
    y_mask_o_mask.name = 'y_mask_o_mask'

    lens = sharedDataTrain['lens'][:, batch_index_From:batch_index_To]
    lens.name = 'lens'

    # Generate temp shared vars: six floatX and three uint8 placeholders,
    # each initialised to a 2x1 array of zeros.
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        shared(np.array([[0], [0]], dtype=theano.config.floatX))
        for _ in range(6)
    ]
    tempSharedData['uint8'] = [
        shared(np.array([[0], [0]], dtype='uint8')) for _ in range(3)
    ]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x, x_mask_final, y, y_mask_final, lens, vocab_size, hidden_size,
        num_layers, rnn_type, boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))

    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations: no outputs, the results are accumulated
    # into the two shared sums.
    fRR = function(
        inputs=[
            theano.In(batch_index_From, borrow=True),
            theano.In(batch_index_To, borrow=True)
        ],
        updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                 (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)

    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs
        # (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm
    step_rules = [
        RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions

    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        # this is faster
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars,
            prefix=prefixes[0],
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        #train_monitor = DataStreamMonitoringPlot(variables=[cost],
        #                    data_stream=train_streamCopy, prefix=prefixes[0], sharedDataTrain=sharedDataTrain, sharedDataActualTest=sharedDataTrain, after_batch=True, saveEveryXIteration = saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=valid_streamCopy,
            prefix=prefixes[1],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=test_streamCopy,
            prefix=prefixes[2],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        # Entry 1 of the tracker list is dropped in plot-only mode.
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting',
                    saveFolder=plots_output2,
                    channels=[
                        'train_cross_cost', 'valid_cross_cost',
                        'test_cross_cost'
                    ],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [
            train_monitor,
            valid_monitor,
            test_monitor,
            plot,
            Printing(),
            ProgressBar(),
        ] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(
            after_epoch=True,
            before_epoch=False,
            data_stream=dev_stream,
            prefix="valid",
            fRR=fRR,
            sharedVars=sharedSUMVARs,
            sharedDataTrain=sharedDataTrain,
            sharedDataValid=sharedDataValid)
        extensions = [
            dev_monitor,
            Printing(),
            ProgressBar(),
        ] + trackbest

    if learning_rate_decay not in (0, 1):
        # Multiply the RMSProp learning rate by the decay factor after
        # every epoch.
        extensions.append(
            SharedVariableModifier(
                step_rules[0].learning_rate,
                lambda n, lr: np.cast[theano.config.floatX](
                    learning_rate_decay * lr),
                after_epoch=True,
                after_batch=False))

    # Call form with a single argument prints the same text on Python 2
    # and is valid syntax on Python 3 (the print statement was not).
    print('number of parameters in the model: ' + str(
        T.sum([p.size for p in cg.parameters]).eval()))
    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
vU2_upd = beta2 * vU2 + (1 - beta2) * dU2**2 vW1_upd = beta2 * vW1 + (1 - beta2) * dW1**2 vW2_upd = beta2 * vW2 + (1 - beta2) * dW2**2 vb1_upd = beta2 * vb1 + (1 - beta2) * db1**2 vb2_upd = beta2 * vb2 + (1 - beta2) * db2**2 vV_upd = beta2 * vV + (1 - beta2) * dV**2 vc_upd = beta2 * vc + (1 - beta2) * dc**2 learning_rate_upd = learning_rate * T.cast(T.sqrt( (1 - beta2**t_upd) / (1 - beta1**t_upd)), dtype='float32') apply_grads = theano.function( [ x, learning_rate, theano.In(beta1, value=0.9), theano.In(beta2, value=0.99), theano.In(epsilon, value=1e-16) ], [], updates=[ (U1, U1 - learning_rate_upd * mU1_upd / (T.sqrt(vU1_upd) + epsilon)), (U2, U2 - learning_rate_upd * mU2_upd / (T.sqrt(vU2_upd) + epsilon)), (W1, W1 - learning_rate_upd * mW1_upd / (T.sqrt(vW1_upd) + epsilon)), (W2, W2 - learning_rate_upd * mW2_upd / (T.sqrt(vW2_upd) + epsilon)), (b1, b1 - learning_rate_upd * mb1_upd / (T.sqrt(vb1_upd) + epsilon)), (b2, b2 - learning_rate_upd * mb2_upd / (T.sqrt(vb2_upd) + epsilon)), (V, V - learning_rate_upd * mV_upd / (T.sqrt(vV_upd) + epsilon)), (c, c - learning_rate_upd * mc_upd / (T.sqrt(vc_upd) + epsilon)), (mU1, mU1_upd), (mU2, mU2_upd), (mW1, mW1_upd), (mW2, mW2_upd), (mb1, mb1_upd), (mb2, mb2_upd), (mV, mV_upd), (mc, mc_upd), (vU1, vU1_upd), (vU2, vU2_upd), (vW1, vW1_upd), (vW2, vW2_upd),
def __theano_build(self):
    """Build the symbolic GRU graph and compile the Theano functions.

    Sets ``self.sgd_step`` (one RMSProp update), ``self.predict`` (output
    of the last scan step), ``self.loss`` (squared error of that output
    against ``y``) and ``self.cost`` (mean of ``self.loss`` over a dataset).
    """
    E, U, W, V, b, c = self.E, self.U, self.W, self.V, self.b, self.c

    x = T.fmatrix('x')
    y = T.fvector('y')

    # implementation of ReLU activator
    def ReLU(x):
        return T.switch(x < 0, 0, x)

    def forward_prop_step(x_t, s_prev):
        # Embedding Layer with ReLU non-linearity
        x_e = ReLU(E.dot(x_t))

        # GRU Layer 1
        z_t = T.nnet.hard_sigmoid(U[0].dot(x_e) + W[0].dot(s_prev) + b[0])
        r_t = T.nnet.hard_sigmoid(U[1].dot(x_e) + W[1].dot(s_prev) + b[1])
        c_t = ReLU(U[2].dot(x_e) + W[2].dot(s_prev * r_t) + b[2])
        s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_prev

        # prediction at time t+1
        o_t = V.dot(s_t) + c

        return [o_t, s_t]

    # Feed-forward for one training example: scan over the rows of `x`,
    # starting from an all-zero hidden state.
    [o, s1], updates1 = theano.scan(
        forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=[None, dict(initial=T.zeros(self.hidden_dim))])

    # Only the output of the last step is compared with the target.
    residual = o[-1] - y
    loss = T.dot(residual, residual)

    # Back-propagation through time. Truncation is handled by the
    # `truncate_gradient` argument of the scan above.
    dE = T.grad(loss, E)
    dU = T.grad(loss, U)
    dW = T.grad(loss, W)
    db = T.grad(loss, b)
    dV = T.grad(loss, V)
    dc = T.grad(loss, c)

    # sgd parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # RMSProp cache updates
    mE = decay * self.mE + (1 - decay) * dE**2
    mU = decay * self.mU + (1 - decay) * dU**2
    mW = decay * self.mW + (1 - decay) * dW**2
    mV = decay * self.mV + (1 - decay) * dV**2
    mb = decay * self.mb + (1 - decay) * db**2
    mc = decay * self.mc + (1 - decay) * dc**2

    # 1e-6 guards against division by 0.
    # Every cache, including self.mE, must be written back; otherwise the
    # running average for that parameter never accumulates.
    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        allow_input_downcast=True,
        updates=[(E, E - learning_rate * dE / T.sqrt(mE + 1e-6)),
                 (U, U - learning_rate * dU / T.sqrt(mU + 1e-6)),
                 (W, W - learning_rate * dW / T.sqrt(mW + 1e-6)),
                 (V, V - learning_rate * dV / T.sqrt(mV + 1e-6)),
                 (b, b - learning_rate * db / T.sqrt(mb + 1e-6)),
                 (c, c - learning_rate * dc / T.sqrt(mc + 1e-6)),
                 (self.mE, mE),
                 (self.mU, mU),
                 (self.mW, mW),
                 (self.mV, mV),
                 (self.mb, mb),
                 (self.mc, mc)])

    self.predict = theano.function([x], o[-1], allow_input_downcast=True)
    self.loss = theano.function([x, y], loss, allow_input_downcast=True)

    def cost(X, Y):
        # Mean per-example loss over paired sequences X and Y.
        return (np.sum([self.loss(x_i, y_i)
                        for x_i, y_i in zip(X, Y)])) / len(X)

    self.cost = cost
mV_upd = beta1 * mV + (1 - beta1) * dV mc_upd = beta1 * mc + (1 - beta1) * dc vU1_upd = beta2 * vU1 + (1 - beta2) * dU1 ** 2 vU2_upd = beta2 * vU2 + (1 - beta2) * dU2 ** 2 vW1_upd = beta2 * vW1 + (1 - beta2) * dW1 ** 2 vW2_upd = beta2 * vW2 + (1 - beta2) * dW2 ** 2 vb1_upd = beta2 * vb1 + (1 - beta2) * db1 ** 2 vb2_upd = beta2 * vb2 + (1 - beta2) * db2 ** 2 vV_upd = beta2 * vV + (1 - beta2) * dV ** 2 vc_upd = beta2 * vc + (1 - beta2) * dc ** 2 learning_rate_upd = learning_rate * T.cast(T.sqrt((1 - beta2 ** t_upd) / (1 - beta1 ** t_upd)), dtype='float32') apply_grads = theano.function( [x, learning_rate, theano.In(beta1, value= 0.9), theano.In(beta2, value= 0.99), theano.In(epsilon, value= 1e-16)], [], updates=[(U1, U1 - learning_rate_upd * mU1_upd / (T.sqrt(vU1_upd) + epsilon)), (U2, U2 - learning_rate_upd * mU2_upd / (T.sqrt(vU2_upd) + epsilon)), (W1, W1 - learning_rate_upd * mW1_upd / (T.sqrt(vW1_upd) + epsilon)), (W2, W2 - learning_rate_upd * mW2_upd / (T.sqrt(vW2_upd) + epsilon)), (b1, b1 - learning_rate_upd * mb1_upd / (T.sqrt(vb1_upd) + epsilon)), (b2, b2 - learning_rate_upd * mb2_upd / (T.sqrt(vb2_upd) + epsilon)), (V, V - learning_rate_upd * mV_upd / (T.sqrt(vV_upd) + epsilon)), (c, c - learning_rate_upd * mc_upd / (T.sqrt(vc_upd) + epsilon)), (mU1, mU1_upd), (mU2, mU2_upd), (mW1, mW1_upd), (mW2, mW2_upd), (mb1, mb1_upd),
def init_function(self):
    """Build the LSTM encoder graph and compile train/test/encode functions.

    Creates the symbolic inputs (``self.seq_idx``, ``self.solution``), the
    losses (``self.loss_sen``, ``self.loss_l2``, ``self.loss``), one shared
    gradient accumulator per parameter (``self.grad``, ``self.updates``) and
    the compiled ``self.func_train``, ``self.func_test``, ``self.func_encode``.
    """
    sigmoid, tanh = T.nnet.sigmoid, T.tanh
    float_type = theano.config.floatX

    logging.info('init function...')

    self.seq_idx = T.lvector()
    self.solution = T.matrix()
    # Rows of Vw selected by the index vector.
    self.seq_matrix = T.take(self.Vw, self.seq_idx, axis=0)

    # Initial hidden / cell state expressions, shaped like the biases.
    h = T.zeros_like(self.bf, dtype=float_type)
    c = T.zeros_like(self.bc, dtype=float_type)

    def encode(x_t, h_fore, c_fore):
        # One LSTM step on the concatenation [previous hidden, input].
        joined = T.concatenate([h_fore, x_t])
        forget_gate = sigmoid(T.dot(self.Wf, joined) + self.bf)
        input_gate = sigmoid(T.dot(self.Wi, joined) + self.bi)
        output_gate = sigmoid(T.dot(self.Wo, joined) + self.bo)
        candidate = tanh(T.dot(self.Wc, joined) + self.bc)
        c_next = forget_gate * c_fore + input_gate * candidate
        h_next = output_gate * tanh(c_next)
        return h_next, c_next

    scan_result, _ = theano.scan(fn=encode,
                                 sequences=[self.seq_matrix],
                                 outputs_info=[h, c])
    # Hidden state after the last step is the sequence embedding.
    embedding = scan_result[0][-1]

    self.use_noise = theano.shared(np.asarray(0., dtype=float_type))

    # Defaults cover the no-dropout case; overridden when dropout is on.
    embedding_for_train = embedding
    embedding_for_test = embedding
    if self.dropout == 1:
        keep_mask = self.srng.binomial(
            embedding.shape, p=0.5, n=1, dtype=embedding.dtype)
        embedding_for_train = embedding * keep_mask
        # Test time scales by the keep probability instead of sampling.
        embedding_for_test = embedding * 0.5

    self.pred_for_train = T.nnet.softmax(
        T.dot(embedding_for_train, self.Ws) + self.bs)
    self.pred_for_test = T.nnet.softmax(
        T.dot(embedding_for_test, self.Ws) + self.bs)

    # Squared norm of all parameters, minus the contribution of Vw.
    squared_norms = [T.sum(param**2) for param in self.params]
    self.l2 = sum(squared_norms) - T.sum(self.Vw**2)
    self.loss_sen = -T.tensordot(
        self.solution, T.log(self.pred_for_train), axes=2)
    self.loss_l2 = 0.5 * self.l2 * self.regular
    self.loss = self.loss_sen + self.loss_l2

    logging.info('getting grads...')
    grads = T.grad(self.loss, self.params)

    # Each call to func_train adds the new gradient to a per-parameter
    # shared accumulator.
    self.updates = collections.OrderedDict()
    self.grad = {}
    for param, grad in zip(self.params, grads):
        accumulator = theano.shared(
            np.asarray(np.zeros_like(param.get_value()), dtype=float_type))
        self.grad[param] = accumulator
        self.updates[accumulator] = accumulator + grad

    def state_inputs():
        # Fresh In wrappers per compiled function, defaulting to h0 / c0.
        return [theano.In(h, value=self.h0), theano.In(c, value=self.c0)]

    logging.info("compiling func of train...")
    self.func_train = theano.function(
        inputs=[self.seq_idx, self.solution] + state_inputs(),
        outputs=[self.loss, self.loss_sen, self.loss_l2],
        updates=self.updates,
        on_unused_input='warn')

    logging.info("compiling func of test...")
    self.func_test = theano.function(
        inputs=[self.seq_idx] + state_inputs(),
        outputs=self.pred_for_test,
        on_unused_input='warn')

    self.func_encode = theano.function(
        inputs=[self.seq_idx] + state_inputs(),
        outputs=embedding,
        on_unused_input='warn')
def build_minibatch(self, batch_size):
    '''
    Build the minibatch GRU graph and compile its Theano functions.

    The input tensor is documented by the original author as
    n_steps * batch_size * embed_dim. Sets ``self.batch_size``,
    ``self.cost``, ``self.predict``, ``self.predict_class``,
    ``self.ce_error``, ``self.bptt`` and ``self.f_update``.

    :param batch_size: number of rows of the initial hidden state.
    :return: None
    '''
    V, U, W, b, c = self.V, self.U, self.W, self.b, self.c

    x = T.tensor3('x')
    y = T.ivector('y')
    m = T.ivector('mask')
    self.batch_size = batch_size

    def forward_prop_step(x_t, s_t_prev):
        def affine(k, state):
            # Shared input/recurrent projection for gate k.
            return T.dot(x_t, U[k]) + T.dot(state, W[k]) + b[k]

        # GRU Layer
        z_t = T.nnet.hard_sigmoid(affine(0, s_t_prev))
        r_t = T.nnet.hard_sigmoid(affine(1, s_t_prev))
        c_t = T.tanh(affine(2, s_t_prev * r_t))
        s_t = (T.ones_like(z_t) - z_t) * c_t + z_t * s_t_prev
        y_t = T.nnet.softmax(T.dot(s_t, V) + c)
        return [s_t, y_t]

    initial_state = T.zeros((batch_size, self.hidden_dim))
    [s, step_probs], _ = theano.scan(
        forward_prop_step,
        sequences=[x],
        truncate_gradient=self.bptt_truncate,
        outputs_info=[dict(initial=initial_state), None])

    # Swap the first two axes, then flatten them into one so that every
    # row is the distribution for one (sample, step) pair.
    n_rows = step_probs.shape[0] * step_probs.shape[1]
    flat_probs = step_probs.dimshuffle((1, 0, 2)).reshape(
        (n_rows, step_probs.shape[2]))
    # Keep only the rows where the mask is non-zero.
    y_t1 = flat_probs[np.nonzero(m)]
    p_y = T.argmax(y_t1, axis=1)

    # Total cost (could add regularization here)
    self.cost = T.mean(T.nnet.categorical_crossentropy(y_t1, y))

    # Assign functions
    self.predict = theano.function([x, m], y_t1)
    self.predict_class = theano.function([x, m], p_y)
    self.ce_error = theano.function([x, y, m], self.cost)

    # Gradients
    dU, dW, db, dV, dc = [
        T.grad(self.cost, weight) for weight in (U, W, b, V, c)
    ]
    self.bptt = theano.function([x, y, m], [dU, dW, db, dV, dc])

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # (parameter, gradient, rmsprop cache) in the order the updates are
    # listed.
    table = [
        (U, dU, self.mU),
        (W, dW, self.mW),
        (V, dV, self.mV),
        (b, db, self.mb),
        (c, dc, self.mc),
    ]
    new_caches = [
        decay * cache + (1 - decay) * grad ** 2 for _, grad, cache in table
    ]
    param_updates = [
        (param, param - learning_rate * grad / T.sqrt(fresh + 1e-6))
        for (param, grad, _), fresh in zip(table, new_caches)
    ]
    cache_updates = [
        (cache, fresh) for (_, _, cache), fresh in zip(table, new_caches)
    ]

    self.f_update = theano.function(
        [x, y, m, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=param_updates + cache_updates)
def fit(self, X, bounds=None, constraints=None, use_gradient=True,
        optimizer=None, **kwargs):
    """Fit the distribution parameters to data by minimizing the negative
    log-likelihood of the data.

    Parameters
    ----------
    * `X` [array-like, shape=(n_samples, n_features)]:
        The samples.

    * `bounds` [list of (parameter, (low, high))]:
        The parameter bounds. Each entry is read as a mapping with the
        keys `"param"` and `"bounds"`.

    * `constraints`:
        The constraints on the parameters. Each entry is a mapping with
        the keys `"type"`, `"fun"`, `"param"` (one shared variable or a
        sequence of them) and optionally `"jac"`.

    * `use_gradient` [boolean, default=True]:
        Whether to use exact gradients (if `True`) or numerical gradients
        (if `False`).

    * `optimizer` [string]:
        The optimization method.

    * `kwargs`:
        Forwarded as keyword arguments to the compiled objective and
        gradient functions (values for `self.observeds_`).

    Returns
    -------
    * `self` [object]:
        `self`.
    """
    # Map parameters to placeholders
    param_to_placeholder = []
    param_to_index = {}

    for i, v in enumerate(self.parameters_):
        w = T.TensorVariable(v.type)
        param_to_placeholder.append((v, w))
        param_to_index[v] = i

    # Build bounds
    mapped_bounds = None

    if bounds is not None:
        mapped_bounds = [(None, None) for v in param_to_placeholder]

        for b in bounds:
            mapped_bounds[param_to_index[b["param"]]] = b["bounds"]

    # Build constraints
    def bind(func, params):
        # Freeze `func` and `params` per constraint. A lambda written in
        # the loop would look its variables up at call time and every
        # wrapper would end up evaluating the last constraint.
        def bound(x):
            return func(*[x[param_to_index[a]] for a in params])
        return bound

    mapped_constraints = None

    if constraints is not None:
        mapped_constraints = []

        for c in constraints:
            args = c["param"]
            if isinstance(args, SharedVariable):
                args = (args, )

            m_c = {"type": c["type"], "fun": bind(c["fun"], args)}

            if "jac" in c:
                m_c["jac"] = bind(c["jac"], args)

            mapped_constraints.append(m_c)

    # Derive objective and gradient
    objective_ = theano.function(
        [self.X] + [w for _, w in param_to_placeholder] +
        [theano.In(v, name=v.name) for v in self.observeds_],
        T.sum(self.nll_),
        givens=param_to_placeholder,
        allow_input_downcast=True)

    def objective(x):
        # Mean negative log-likelihood at parameter vector `x`.
        return objective_(X, *x, **kwargs) / len(X)

    if use_gradient:
        gradient_ = theano.function(
            [self.X] + [w for _, w in param_to_placeholder] +
            [theano.In(v, name=v.name) for v in self.observeds_],
            theano.grad(T.sum(self.nll_),
                        [v for v, _ in param_to_placeholder]),
            givens=param_to_placeholder,
            allow_input_downcast=True)

        def gradient(x):
            return np.array(gradient_(X, *x, **kwargs)) / len(X)

    # Solve!
    x0 = np.array([v.get_value() for v, _ in param_to_placeholder])
    r = minimize(objective,
                 jac=gradient if use_gradient else None,
                 x0=x0,
                 method=optimizer,
                 bounds=mapped_bounds,
                 constraints=mapped_constraints)

    if r.success:
        # Assign the solution
        for i, value in enumerate(r.x):
            param_to_placeholder[i][0].set_value(value)

    else:
        print("Parameter fitting failed!")
        print(r)

    return self
# 5 - theano.function """ Please note, this code is only for python 3+. If you are using python 2+, please modify the code accordingly. """ from __future__ import print_function import numpy as np import theano import theano.tensor as T # activation function example x = T.dmatrix('x') s = 1 / (1 + T.exp(-x)) # logistic or soft step logistic = theano.function([x], s) print(logistic([[0, 1], [-1, -2]])) # multiply outputs for a function a, b = T.dmatrices('a', 'b') diff = a - b abs_diff = abs(diff) diff_squared = diff**2 f = theano.function([a, b], [diff, abs_diff, diff_squared]) print(f(np.ones((2, 2)), np.arange(4).reshape((2, 2)))) # default value and name for a function x, y, w = T.dscalars('x', 'y', 'w') z = (x + y) * w f = theano.function( [x, theano.In(y, value=1), theano.In(w, value=2, name='weights')], z) print(f(23, 2, weights=4))
def fit(self, X, bounds=None, constraints=None, use_gradient=True, **kwargs):
    """Fit the parameters by minimizing the summed `self.nnlf_` over `X`.

    Parameters
    ----------
    * `X`: the samples; `len(X)` is used to average the objective.
    * `bounds`: optional list of mappings with keys `"param"` and
      `"bounds"`.
    * `constraints`: optional list of mappings with keys `"type"`, `"fun"`,
      `"param"` (one shared variable or a sequence of them) and optionally
      `"jac"`.
    * `use_gradient`: whether to pass the exact gradient to the optimizer.
    * `kwargs`: forwarded as keyword arguments to the compiled objective and
      gradient functions (values for `self.observeds_`).

    The optimization method is taken from `self.optimizer`.

    Returns
    -------
    * `self`.
    """
    # Map parameters to placeholders
    param_to_placeholder = []
    param_to_index = {}

    for i, v in enumerate(self.parameters_):
        w = T.TensorVariable(v.type)
        param_to_placeholder.append((v, w))
        param_to_index[v] = i

    # Build bounds
    mapped_bounds = None

    if bounds is not None:
        mapped_bounds = [(None, None) for v in param_to_placeholder]

        for b in bounds:
            mapped_bounds[param_to_index[b["param"]]] = b["bounds"]

    # Build constraints
    def bind(func, params):
        # Freeze `func` and `params` per constraint. A lambda written in
        # the loop would look its variables up at call time and every
        # wrapper would end up evaluating the last constraint.
        def bound(x):
            return func(*[x[param_to_index[a]] for a in params])
        return bound

    mapped_constraints = None

    if constraints is not None:
        mapped_constraints = []

        for c in constraints:
            args = c["param"]
            if isinstance(args, SharedVariable):
                args = (args, )

            m_c = {"type": c["type"], "fun": bind(c["fun"], args)}

            if "jac" in c:
                m_c["jac"] = bind(c["jac"], args)

            mapped_constraints.append(m_c)

    # Derive objective and gradient
    objective_ = theano.function(
        [self.X] + [w for _, w in param_to_placeholder] +
        [theano.In(v, name=v.name) for v in self.observeds_],
        T.sum(self.nnlf_),
        givens=param_to_placeholder,
        allow_input_downcast=True)

    def objective(x):
        # Mean objective value at parameter vector `x`.
        return objective_(X, *x, **kwargs) / len(X)

    if use_gradient:
        gradient_ = theano.function(
            [self.X] + [w for _, w in param_to_placeholder] +
            [theano.In(v, name=v.name) for v in self.observeds_],
            theano.grad(T.sum(self.nnlf_),
                        [v for v, _ in param_to_placeholder]),
            givens=param_to_placeholder,
            allow_input_downcast=True)

        def gradient(x):
            return np.array(gradient_(X, *x, **kwargs)) / len(X)

    # Solve!
    x0 = np.array([v.get_value() for v, _ in param_to_placeholder])
    r = minimize(objective,
                 jac=gradient if use_gradient else None,
                 x0=x0,
                 method=self.optimizer,
                 bounds=mapped_bounds,
                 constraints=mapped_constraints)

    if r.success:
        # Assign the solution
        for i, value in enumerate(r.x):
            param_to_placeholder[i][0].set_value(value)

    else:
        print("Parameter fitting failed!")
        print(r)

    return self
def __theano_build__(self):
    """Build the three-layer GRU graph and compile its Theano functions.

    Sets ``self.predict``, ``self.predict_class``, ``self.ce_error``,
    ``self.bptt`` and ``self.sgd_step`` (one RMSProp update).
    """
    E, V, U, W, b, c = self.E, self.V, self.U, self.W, self.b, self.c

    x = T.ivector('x')
    y = T.ivector('y')

    def gru_layer(base, layer_input, previous_state):
        # One GRU layer whose update, reset and candidate weights live at
        # indices base, base + 1 and base + 2 of U, W and b.
        update_gate = T.nnet.hard_sigmoid(
            U[base].dot(layer_input) + W[base].dot(previous_state) + b[base])
        reset_gate = T.nnet.hard_sigmoid(
            U[base + 1].dot(layer_input) + W[base + 1].dot(previous_state) +
            b[base + 1])
        candidate = T.tanh(
            U[base + 2].dot(layer_input) +
            W[base + 2].dot(previous_state * reset_gate) + b[base + 2])
        return ((T.ones_like(update_gate) - update_gate) * candidate +
                update_gate * previous_state)

    def forward_prop_step(x_t, s_t1_prev, s_t2_prev, s_t3_prev):
        # Word embedding layer: column x_t of E.
        x_e = E[:, x_t]

        # The three stacked layers use weight indices 0-2, 3-5 and 6-8.
        s_t1 = gru_layer(0, x_e, s_t1_prev)
        s_t2 = gru_layer(3, s_t1, s_t2_prev)
        s_t3 = gru_layer(6, s_t2, s_t3_prev)

        # Final output calculation
        # Theano's softmax returns a matrix with one row, we only need the row
        o_t = T.nnet.softmax(V.dot(s_t3) + c)[0]

        return [o_t, s_t1, s_t2, s_t3]

    # The output has no recurrence; each hidden state starts at zero.
    recurrent_outputs = [None] + [
        dict(initial=T.zeros(self.hidden_dim)) for _ in range(3)
    ]
    [o, s, s2, s3], updates = theano.scan(
        forward_prop_step,
        sequences=x,
        truncate_gradient=self.bptt_truncate,
        outputs_info=recurrent_outputs)

    prediction = T.argmax(o, axis=1)
    o_error = T.sum(T.nnet.categorical_crossentropy(o, y))
    # Total cost (could add regularization here); wrapped in a Print op
    # labelled 'o_error'.
    cost = printing.Print('o_error')(o_error)

    # Gradients
    dE, dU, dW, db, dV, dc = [
        T.grad(cost, weight) for weight in (E, U, W, b, V, c)
    ]

    # Assign functions
    downcast = dict(allow_input_downcast=True)
    self.predict = theano.function([x], [o], **downcast)
    self.predict_class = theano.function([x], prediction, **downcast)
    self.ce_error = theano.function([x, y], cost, **downcast)
    self.bptt = theano.function([x, y], [dE, dU, dW, db, dV, dc], **downcast)

    # SGD parameters
    learning_rate = T.scalar('learning_rate')
    decay = T.scalar('decay')

    # (parameter, gradient, rmsprop cache) in the order the updates are
    # listed.
    table = [
        (E, dE, self.mE),
        (U, dU, self.mU),
        (W, dW, self.mW),
        (V, dV, self.mV),
        (b, db, self.mb),
        (c, dc, self.mc),
    ]
    # rmsprop cache updates
    new_caches = [
        decay * cache + (1 - decay) * grad**2 for _, grad, cache in table
    ]
    param_updates = [
        (param, param - learning_rate * grad / T.sqrt(fresh + 1e-6))
        for (param, grad, _), fresh in zip(table, new_caches)
    ]
    cache_updates = [
        (cache, fresh) for (_, _, cache), fresh in zip(table, new_caches)
    ]

    self.sgd_step = theano.function(
        [x, y, learning_rate, theano.In(decay, value=0.9)],
        [],
        updates=param_updates + cache_updates,
        allow_input_downcast=True)