def test_no_complex():
    width_var = tensor.cscalar()
    freq_var = tensor.fscalar()
    signal_var = tensor.fscalar()
    stft_out = tensor.exp(width_var * freq_var) * signal_var
    theano.function([width_var, freq_var, signal_var], stft_out,
                    mode=mode_with_gpu)
def test_default_dtype(self):
    random = RandomStreams(utt.fetch_seed())
    low = tensor.dscalar()
    high = tensor.dscalar()

    # Should not silently downcast from low and high
    out0 = random.uniform(low=low, high=high, size=(42,))
    assert out0.dtype == 'float64'
    f0 = function([low, high], out0)
    val0 = f0(-2.1, 3.1)
    assert val0.dtype == 'float64'

    # Should downcast, since asked explicitly
    out1 = random.uniform(low=low, high=high, size=(42,), dtype='float32')
    assert out1.dtype == 'float32'
    f1 = function([low, high], out1)
    val1 = f1(-1.1, 1.1)
    assert val1.dtype == 'float32'

    # Should use floatX
    lowf = tensor.fscalar()
    highf = tensor.fscalar()
    outf = random.uniform(low=lowf, high=highf, size=(42,))
    assert outf.dtype == config.floatX
    ff = function([lowf, highf], outf)
    valf = ff(numpy.float32(-0.1), numpy.float32(0.3))
    assert valf.dtype == config.floatX
def create_learning_rate_func(solver_params):
    base = tt.fscalar('base')
    gamma = tt.fscalar('gamma')
    power = tt.fscalar('power')
    itrvl = tt.fscalar('itrvl')
    iter = tt.scalar('iter')

    if solver_params['lr_type'] == 'inv':
        lr_ = base * tt.pow(1 + gamma * iter, -power)
        lr = t.function(
            inputs=[iter,
                    t.Param(base, default=solver_params['base']),
                    t.Param(gamma, default=solver_params['gamma']),
                    t.Param(power, default=solver_params['power'])],
            outputs=lr_)
    elif solver_params['lr_type'] == 'fixed':
        lr_ = base
        lr = t.function(
            inputs=[iter, t.Param(base, default=solver_params['base'])],
            outputs=lr_,
            on_unused_input='ignore')
    elif solver_params['lr_type'] == 'episodic':
        lr_ = base / (tt.floor(iter / itrvl) + 1)
        lr = t.function(
            inputs=[iter,
                    t.Param(base, default=solver_params['base']),
                    t.Param(itrvl, default=solver_params['interval'])],
            outputs=lr_,
            on_unused_input='ignore')
    return lr
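# Usage sketch (not from the original source): assumes the aliases used above,
# i.e. `theano` imported as `t` and `theano.tensor` as `tt`, and a hypothetical
# solver_params dict holding the 'inv' schedule hyperparameters.
solver_params = {'lr_type': 'inv', 'base': 0.01, 'gamma': 1e-4, 'power': 0.75}
lr_func = create_learning_rate_func(solver_params)
# The returned callable takes the iteration number; the schedule constants
# fall back to the defaults baked in through t.Param above.
for it in (0., 1000., 10000.):
    print(lr_func(it))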
def __init__(self, vocabulary_size, hidden_size, output_size):
    X = tensor.ivector()
    Y = tensor.ivector()
    keep_prob = tensor.fscalar()
    learning_rate = tensor.fscalar()

    emb_layer = Embedding(vocabulary_size, hidden_size)
    lstm_layer = BiLSTM(hidden_size, hidden_size)
    dropout_layer = Dropout(keep_prob)
    fc_layer = FullConnect(2 * hidden_size, output_size)
    crf = CRF(output_size)

    # graph definition
    X_emb = emb_layer(X)
    scores = fc_layer(tensor.tanh(lstm_layer(dropout_layer(X_emb))))
    loss, predict = crf(scores, Y, isTraining=True)
    # loss, predict and accuracy
    accuracy = tensor.sum(tensor.eq(predict, Y)) * 1.0 / Y.shape[0]

    params = emb_layer.params + lstm_layer.params + fc_layer.params + crf.params
    updates = MomentumSGD(loss, params, lr=learning_rate)

    print("Compiling train function: ")
    train = theano.function(inputs=[X, Y, keep_prob, learning_rate],
                            outputs=[predict, accuracy, loss],
                            updates=updates,
                            allow_input_downcast=True)

    print("Compiling evaluate function: ")
    evaluate = theano.function(inputs=[X_emb, Y, keep_prob],
                               outputs=[predict, accuracy, loss],
                               allow_input_downcast=True)

    self.embedding_tensor = emb_layer.params[0]
    self.train = train
    self.evaluate = evaluate
    self.params = params
def __init__(self, n_comp=10, verbose=False):
    # Theano initialization
    self.T_weights = shared(np.eye(n_comp, dtype=np.float32))
    self.T_bias = shared(np.ones((n_comp, 1), dtype=np.float32))

    T_p_x_white = T.fmatrix()
    T_lrate = T.fscalar()
    T_block = T.fscalar()

    T_unmixed = T.dot(self.T_weights, T_p_x_white) + T.addbroadcast(self.T_bias, 1)
    T_logit = 1 - 2 / (1 + T.exp(-T_unmixed))

    T_out = self.T_weights + T_lrate * T.dot(
        T_block * T.identity_like(self.T_weights) +
        T.dot(T_logit, T.transpose(T_unmixed)), self.T_weights)
    T_bias_out = self.T_bias + T_lrate * T.reshape(T_logit.sum(axis=1), (-1, 1))
    T_max_w = T.max(self.T_weights)
    T_isnan = T.any(T.isnan(self.T_weights))

    self.w_up_fun = theano.function([T_p_x_white, T_lrate, T_block],
                                    [T_max_w, T_isnan],
                                    updates=[(self.T_weights, T_out),
                                             (self.T_bias, T_bias_out)],
                                    allow_input_downcast=True)

    T_matrix = T.fmatrix()
    T_cov = T.dot(T_matrix, T.transpose(T_matrix)) / T_block
    self.cov_fun = theano.function([T_matrix, T_block], T_cov,
                                   allow_input_downcast=True)

    self.loading = None
    self.sources = None
    self.weights = None
    self.n_comp = n_comp
    self.verbose = verbose
def generate_theano_functions(self, next_layer):
    '''Compile necessary theano functions'''
    exp = tensor.fmatrix('expected')
    rate = tensor.fscalar('rate')
    momentum = tensor.fscalar('momentum')

    ## Compute outputs given inputs
    self.get_output = theano.function(
        [],
        updates=[(self.outputs,
                  tensor.nnet.sigmoid(tensor.dot(self.inputs, self.weights)))],
        name='get_output')

    ## Compute error values given errors of previous layer
    if self.output:
        self.find_errors = theano.function(
            [exp],
            updates=[(self.errors,
                      self.outputs * (1 - self.outputs) * exp)],
            name='find_errors',
            allow_input_downcast=True)
    else:
        self.find_errors = theano.function(
            [],
            updates=[(self.errors,
                      self.outputs * (1 - self.outputs) *
                      tensor.dot(next_layer.errors, next_layer.weights.T))],
            name='find_errors')

    ## Compute the change to the weight vector using stochastic gradient
    ## descent with momentum
    self.train_compute = theano.function(
        [rate, momentum],
        updates=[(self.delta_weights,
                  self.delta_weights * momentum +
                  theano.tensor.dot(self.inputs.T, (rate * self.errors)))],
        name='train_compute',
        allow_input_downcast=True)

    ## Adjust weights using the delta_w computed in train_compute
    self.adjust = theano.function(
        [],
        updates=[(self.weights, self.weights + self.delta_weights)],
        name='adjust')

    ## Drop a number of nodes roughly equal to rate/output_size
    self.dropout = theano.function(
        [rate],
        updates=[(self.outputs,
                  tensor.switch(
                      self.random.binomial(size=(1, self.output_size), p=rate),
                      self.outputs / rate, 0))],
        name='dropout',
        allow_input_downcast=True)
def __build_iterative_functions(self):

    def states_dot(lambda_x, lambda_y, x_data, y_data):
        [x_dot, h_dot, y_dot] = T.grad(-self.energy_sum, self.states)
        x_dot_final = lambda_x * (x_data - self.x) + (1. - lambda_x) * x_dot
        y_dot_final = lambda_y * (y_data - self.y) + (1. - lambda_y) * y_dot
        return [x_dot_final, h_dot, y_dot_final]

    lambda_x = T.fscalar('lambda_x')
    lambda_y = T.fscalar('lambda_y')
    x_data = self.outside_world.x_data
    y_data = self.outside_world.y_data_one_hot

    states_dot = [x_dot, h_dot, y_dot] = states_dot(lambda_x, lambda_y, x_data, y_data)

    kinetic_energy = T.mean(sum([(state_dot ** 2).sum(axis=1) for state_dot in states_dot]))
    params_dot = T.grad(kinetic_energy, self.params)

    # UPDATES
    epsilon = T.fscalar('epsilon')
    alpha_W1 = T.fscalar('alpha_W1')
    alpha_W2 = T.fscalar('alpha_W2')
    learning_rates = [alpha_W1, alpha_W1, alpha_W1, alpha_W2, alpha_W2]

    Delta_states = [epsilon * state_dot for state_dot in states_dot]
    Delta_params = [alpha * param_dot for alpha, param_dot in zip(learning_rates, params_dot)]

    states_new = [state + Delta for state, Delta in zip(self.states, Delta_states)]
    params_new = [param + Delta for param, Delta in zip(self.params, Delta_params)]

    updates_states = zip(self.states, states_new)
    updates_params = zip(self.params, params_new)

    # OUTPUTS FOR MONITORING
    error_rate = T.mean(T.neq(self.prediction, self.outside_world.y_data))
    mse = T.mean(((self.y - self.outside_world.y_data_one_hot) ** 2).sum(axis=1))
    norm_grad_hy = T.sqrt((h_dot ** 2).mean(axis=0).sum() + (y_dot ** 2).mean(axis=0).sum())

    Delta_W1 = Delta_params[1]
    Delta_W2 = Delta_params[3]
    Delta_logW1 = T.sqrt((Delta_W1 ** 2).mean()) / T.sqrt((self.W1 ** 2).mean())
    Delta_logW2 = T.sqrt((Delta_W2 ** 2).mean()) / T.sqrt((self.W2 ** 2).mean())

    # THEANO FUNCTIONS
    iterative_function = theano.function(
        inputs=[lambda_x, lambda_y, epsilon, alpha_W1, alpha_W2],
        outputs=[self.energy, norm_grad_hy, self.prediction, error_rate, mse,
                 Delta_logW1, Delta_logW2],
        updates=updates_params + updates_states
    )

    relaxation_function = theano.function(
        inputs=[epsilon],
        outputs=[self.energy, norm_grad_hy, self.prediction, error_rate, mse],
        givens={lambda_y: T.constant(0.)},
        updates=updates_states[1:3]
    )

    return iterative_function, relaxation_function
def find_Y(X_shared, Y_shared, sigma_shared, N, output_dims, n_epochs,
           initial_lr, final_lr, lr_switch, init_stdev, initial_momentum,
           final_momentum, momentum_switch, metric, verbose=0):
    """Optimize cost wrt Y"""

    # Optimization hyperparameters
    initial_lr = np.array(initial_lr, dtype=floath)
    final_lr = np.array(final_lr, dtype=floath)
    initial_momentum = np.array(initial_momentum, dtype=floath)
    final_momentum = np.array(final_momentum, dtype=floath)

    lr = T.fscalar('lr')
    lr_shared = theano.shared(initial_lr)

    momentum = T.fscalar('momentum')
    momentum_shared = theano.shared(initial_momentum)

    # Y velocities
    Yv = T.fmatrix('Yv')
    Yv_shared = theano.shared(np.zeros((N, output_dims), dtype=floath))

    # Cost
    X = T.fmatrix('X')
    sigma = T.fvector('sigma')
    Y = T.fmatrix('Y')

    cost = cost_var(X, Y, sigma, metric)

    # Setting update for Y velocities
    grad_Y = T.grad(cost, Y)

    updates = [(Yv_shared, momentum * Yv - lr * grad_Y)]
    givens = {X: X_shared, sigma: sigma_shared, Y: Y_shared, Yv: Yv_shared,
              lr: lr_shared, momentum: momentum_shared}

    update_Yv = theano.function([], cost, givens=givens, updates=updates)

    # Setting update for Y
    givens = {Y: Y_shared, Yv: Yv_shared}
    updates = [(Y_shared, Y + Yv)]

    update_Y = theano.function([], [], givens=givens, updates=updates)

    # Momentum-based gradient descent
    for epoch in range(n_epochs):
        if epoch == lr_switch:
            lr_shared.set_value(final_lr)
        if epoch == momentum_switch:
            momentum_shared.set_value(final_momentum)

        c = update_Yv()
        update_Y()

        if verbose:
            print('Epoch: {0}. Cost: {1:.6f}.'.format(epoch + 1, float(c)))

    return np.array(Y_shared.get_value())
def train(self, data1, data2, similarities, miniBatchSize=20, epochs=200):
    nrMiniBatches = len(data1) / miniBatchSize
    miniBatchIndex = T.lscalar()
    momentum = T.fscalar()
    learningRate = T.fscalar()
    learningRateMiniBatch = np.float32(self.learningRate / miniBatchSize)
    print "learningRateMiniBatch in similarity net"
    print learningRateMiniBatch

    net = self._trainRBM(data1, data2)

    data1 = theano.shared(np.asarray(data1, dtype=theanoFloat))
    data2 = theano.shared(np.asarray(data2, dtype=theanoFloat))
    similarities = theano.shared(np.asarray(similarities, dtype=theanoFloat))

    # The mini-batch data is a matrix
    x = T.matrix('x', dtype=theanoFloat)
    y = T.matrix('y', dtype=theanoFloat)
    self.x = x
    self.y = y

    z = T.vector('z', dtype=theanoFloat)

    trainer = Trainer(x, y, net)
    self.trainer = trainer

    # error = T.sum(T.sqr(trainer.output - z))
    error = T.sum(T.nnet.binary_crossentropy(trainer.output, z))

    updates = self.buildUpdates(trainer, error, learningRate, momentum)

    # Now you have to define the theano function
    discriminativeTraining = theano.function(
        inputs=[miniBatchIndex, learningRate, momentum],
        outputs=[trainer.output, trainer.cos],
        updates=updates,
        givens={
            x: data1[miniBatchIndex * miniBatchSize:(miniBatchIndex + 1) * miniBatchSize],
            y: data2[miniBatchIndex * miniBatchSize:(miniBatchIndex + 1) * miniBatchSize],
            z: similarities[miniBatchIndex * miniBatchSize:(miniBatchIndex + 1) * miniBatchSize],
        })

    for epoch in xrange(epochs):
        print "epoch", epoch
        momentum = np.float32(min(np.float32(0.5) + epoch * np.float32(0.1),
                                  np.float32(0.95)))
        for miniBatch in xrange(nrMiniBatches):
            output, cos = discriminativeTraining(miniBatch,
                                                 learningRateMiniBatch,
                                                 momentum)

    print trainer.w.get_value()
    print trainer.b.get_value()
def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size):
    #print len(self.layers)
    #print [T.shape(l.W)[0] for l in self.layers]

    (train_set_x, train_set_y) = train_shared_xy
    (valid_set_x, valid_set_y) = valid_shared_xy
    #print T.shape(train_set_x), T.shape(train_set_y)

    index = T.lscalar('index')    # index to a [mini]batch
    learning_rate = T.fscalar('learning_rate')
    momentum = T.fscalar('momentum')

    # compute the gradients with respect to the model parameters
    gparams = T.grad(self.finetune_cost, self.params)

    # compute list of fine-tuning updates
    updates = collections.OrderedDict()
    for dparam, gparam in zip(self.delta_params, gparams):
        updates[dparam] = momentum * dparam - gparam * learning_rate
    for dparam, param in zip(self.delta_params, self.params):
        updates[param] = param + updates[dparam]

    if self.max_col_norm is not None:
        for i in xrange(self.hidden_layers_number):
            W = self.layers[i].W
            if W in updates:
                updated_W = updates[W]
                col_norms = T.sqrt(T.sum(T.sqr(updated_W), axis=0))
                desired_norms = T.clip(col_norms, 0, self.max_col_norm)
                updates[W] = updated_W * (desired_norms / (1e-7 + col_norms))

    train_fn = theano.function(
        inputs=[index, theano.Param(learning_rate, default=0.0001),
                theano.Param(momentum, default=0.5)],
        outputs=self.errors,
        updates=updates,
        givens={self.x: train_set_x[index * batch_size: (index + 1) * batch_size],
                self.y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    valid_fn = theano.function(
        inputs=[index],
        outputs=self.errors,
        givens={self.x: valid_set_x[index * batch_size: (index + 1) * batch_size],
                self.y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    return train_fn, valid_fn
def __init__(self, input_dim, emb_dim, n_senses, W_w_f, lambdaH, lambdaL2, adjust, lambdaF):
    super().__init__(input_dim, emb_dim, n_senses, W_w_f, lambdaF)
    self.Wb = zeros((input_dim + 1, n_senses), name="Wb")  # sense- and word-specific bias
    self.H = TT.fscalar()     # entropy
    self.L2 = TT.fscalar()
    self.lambdaH = lambdaH    # weight for entropy regularizer
    self.lambdaL2 = lambdaL2  # weight for L2 regularizer
    if lambdaL2 == 0.:
        self.L2 = 0.
    else:
        self.L2 = TT.sum(TT.sqr(self.W_w)) + TT.sum(TT.sqr(self.W_c))
    self.adjust = adjust
def __init__(self, game_params, arch_params, solver_params, trained_model, sn_dir):

    params = [None, None]

    if trained_model[0]:
        params[0] = common.load_params(trained_model[0])

    if trained_model[1]:
        params[1] = common.load_params(trained_model[1])

    self.lr_func = []
    self.lr_func.append(create_learning_rate_func(solver_params['controler_0']))
    self.lr_func.append(create_learning_rate_func(solver_params['controler_1']))

    self.x_host_0 = tt.fvector('x_host_0')
    self.v_host_0 = tt.fvector('v_host_0')
    self.x_target_0 = tt.fvector('x_target_0')
    self.v_target_0 = tt.fvector('v_target_0')
    self.x_mines_0 = tt.fmatrix('x_mines_0')
    self.mines_map = tt.fmatrix('mines_map')
    self.time_steps = tt.fvector('time_steps')
    self.force = tt.fmatrix('force')
    self.n_steps_0 = tt.iscalar('n_steps_0')
    self.n_steps_1 = tt.iscalar('n_steps_1')
    self.lr = tt.fscalar('lr')
    self.goal_1 = tt.fvector('goal_1')
    self.trnsprnt = tt.fscalar('trnsprnt')
    self.rand_goals = tt.fmatrix('rand_goals')

    self.game_params = game_params
    self.arch_params = arch_params
    self.solver_params = solver_params
    self.sn_dir = sn_dir

    self.model = CONTROLLER(self.x_host_0,
                            self.v_host_0,
                            self.x_target_0,
                            self.v_target_0,
                            self.x_mines_0,
                            self.mines_map,
                            self.time_steps,
                            self.force,
                            self.n_steps_0,
                            self.n_steps_1,
                            self.lr,
                            self.goal_1,
                            self.trnsprnt,
                            self.rand_goals,
                            self.game_params,
                            self.arch_params,
                            self.solver_params,
                            params)
def compile(self):
    """ compile theano functions """
    self.t_L1_reg = T.fscalar('L1_reg')
    self.t_L2_reg = T.fscalar('L2_reg')
    self.t_learning_rate = T.fscalar('learning_rate')

    cost = self.loss + self.t_L1_reg * self.L1 + self.t_L2_reg * self.L2_sqr

    self.parameter_updates = [(param, param - self.t_learning_rate * T.grad(cost, param))
                              for param in self.params]

    self._tf_train = theano.function(
        inputs=[self.input, self.true_output, self.t_L1_reg, self.t_L2_reg,
                self.t_learning_rate],
        outputs=[self.loss],
        allow_input_downcast=True,
        updates=self.parameter_updates)

    self._tf_infer = theano.function(
        inputs=[self.input],
        outputs=[self.output],
        allow_input_downcast=True)

    self._tf_evaluate = theano.function(
        inputs=[self.input, self.true_output],
        outputs=[self.loss],
        allow_input_downcast=True)
def compile(self, cost, error_map_pyx, add_updates=[]):
    batch_idx = T.iscalar()
    learning_rate = T.fscalar()

    updates, norm_grad = self.hp.optimizer(cost, self.params.values(), lr=learning_rate)
    updates += add_updates

    self.outidx = {'cost': 0, 'error_map_pyx': 1, 'norm_grad': 2}
    outputs = [cost, error_map_pyx]

    self.train = theano.function(
        inputs=[batch_idx, learning_rate],
        updates=updates,
        givens={self.X: self.data['tr_X'][batch_idx * self.hp.batch_size:(batch_idx + 1) * self.hp.batch_size],
                self.Y: self.data['tr_Y'][batch_idx * self.hp.batch_size:(batch_idx + 1) * self.hp.batch_size]},
        outputs=outputs + [norm_grad])

    self.validate = theano.function(
        inputs=[batch_idx],
        givens={self.X: self.data['va_X'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size],
                self.Y: self.data['va_Y'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size]},
        outputs=outputs)

    self.test = theano.function(
        inputs=[batch_idx],
        givens={self.X: self.data['te_X'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size],
                self.Y: self.data['te_Y'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size]},
        outputs=outputs)
def get_SGD_trainer(self, debug=False):
    """ Returns a plain SGD minibatch trainer with learning rate as param. """
    batch_x1 = T.fmatrix('batch_x1')
    batch_x2 = T.fmatrix('batch_x2')
    batch_y = T.ivector('batch_y')
    learning_rate = T.fscalar('lr')  # learning rate to use

    # compute the gradients with respect to the model parameters
    # using mean_cost so that the learning rate is not too dependent on the batch size
    cost = self.mean_cos_sim_cost
    gparams = T.grad(cost, self.params)

    # compute list of weights updates
    updates = OrderedDict()
    for param, gparam in zip(self.params, gparams):
        updates[param] = param - gparam * learning_rate

    outputs = cost
    if debug:
        outputs = [cost] + self.params + gparams + \
            [updates[param] for param in self.params]

    train_fn = theano.function(inputs=[theano.Param(batch_x1),
                                       theano.Param(batch_x2),
                                       theano.Param(batch_y),
                                       theano.Param(learning_rate)],
                               outputs=outputs,
                               updates=updates,
                               givens={self.x1: batch_x1,
                                       self.x2: batch_x2,
                                       self.y: batch_y})

    return train_fn
def test_allow_downcast_floatX(self):
    a = tensor.fscalar('a')
    b = tensor.fvector('b')

    f = pfunc([a, b], (a + b), allow_input_downcast=True)
    g = pfunc([a, b], (a + b), allow_input_downcast=False)
    h = pfunc([a, b], (a + b), allow_input_downcast=None)

    # If the values can be accurately represented, OK
    assert numpy.all(f(0, [0]) == 0)
    assert numpy.all(g(0, [0]) == 0)
    assert numpy.all(h(0, [0]) == 0)

    # For the vector: OK iff allow_input_downcast is True
    assert numpy.allclose(f(0, [0.1]), 0.1)
    self.assertRaises(TypeError, g, 0, [0.1])
    self.assertRaises(TypeError, h, 0, [0.1])

    # For the scalar: OK if allow_input_downcast is True,
    # or None and floatX==float32
    assert numpy.allclose(f(0.1, [0]), 0.1)
    self.assertRaises(TypeError, g, 0.1, [0])
    if config.floatX == 'float32':
        assert numpy.allclose(h(0.1, [0]), 0.1)
    else:
        self.assertRaises(TypeError, h, 0.1, [0])
def ADAMopt(self, tVars, loss, lr, momentum=0):
    i = T.iscalar('i')
    lr = T.fscalar('lr')
    grads = T.grad(loss, tVars)
    '''ADAM Code from
    https://github.com/danfischetti/deep-recurrent-attentive-writer/blob/master/DRAW/adam.py
    '''
    # ADAM hyperparameters were not defined in the original snippet; these are
    # assumed values following the conventions of that reference implementation
    # (beta1/beta2 are the 1-decay rates, l is the lambda decay, epsilon the fuzz).
    beta1, beta2, epsilon, l = 0.1, 0.001, 1e-8, 1 - 1e-8

    # The original iterated over `model.params`/`gparams`, which are undefined
    # here; the parameter list and gradients passed in are used instead.
    self.m = [theano.shared(name='m',
                            value=np.zeros(param.get_value().shape,
                                           dtype=theano.config.floatX))
              for param in tVars]
    self.v = [theano.shared(name='v',
                            value=np.zeros(param.get_value().shape,
                                           dtype=theano.config.floatX))
              for param in tVars]
    self.t = theano.shared(name='t', value=np.asarray(1).astype(theano.config.floatX))
    updates = [(self.t, self.t + 1)]

    for param, gparam, m, v in zip(tVars, grads, self.m, self.v):
        b1_t = 1 - (1 - beta1) * (l ** (self.t - 1))
        m_t = b1_t * gparam + (1 - b1_t) * m
        updates.append((m, m_t))
        v_t = beta2 * (gparam ** 2) + (1 - beta2) * v
        updates.append((v, v_t))
        m_t_bias = m_t / (1 - (1 - beta1) ** self.t)
        v_t_bias = v_t / (1 - (1 - beta2) ** self.t)
        if param.get_value().ndim == 1:
            updates.append((param, param - 5 * lr * m_t_bias / (T.sqrt(v_t_bias) + epsilon)))
        else:
            updates.append((param, param - lr * m_t_bias / (T.sqrt(v_t_bias) + epsilon)))

    # lr is a symbolic fscalar used in the updates, so it must be an input
    # of the compiled function.
    return theano.function([lr], loss, updates=updates)
def cmp(a_shp, b_shp):
    a = tensor.fmatrix()
    b = tensor.fmatrix()
    scalar = tensor.fscalar()
    av = my_rand(*a_shp)
    bv = my_rand(*b_shp)

    f = theano.function(
        [a, b],
        tensor.dot(a, b) * numpy.asarray(4, 'float32'),
        mode=mode_with_gpu)
    f2 = theano.function(
        [a, b],
        tensor.dot(a, b) * numpy.asarray(4, 'float32'))
    t = f.maker.fgraph.toposort()
    assert len(t) == 4
    assert isinstance(t[0].op, tcn.GpuFromHost)
    assert isinstance(t[1].op, tcn.GpuFromHost)
    assert isinstance(t[2].op, tcn.blas.GpuDot22Scalar)
    assert isinstance(t[3].op, tcn.HostFromGpu)
    assert numpy.allclose(f(av, bv), f2(av, bv))

    f = theano.function([a, b, scalar], tensor.dot(a, b) * scalar,
                        mode=mode_with_gpu)
    f2 = theano.function([a, b, scalar], tensor.dot(a, b) * scalar)
    t = f.maker.fgraph.toposort()
    assert len(t) == 4
    assert isinstance(t[0].op, tcn.GpuFromHost)
    assert isinstance(t[1].op, tcn.GpuFromHost)
    assert isinstance(t[2].op, tcn.blas.GpuDot22Scalar)
    assert isinstance(t[3].op, tcn.HostFromGpu)
    assert numpy.allclose(f(av, bv, 0.5), f2(av, bv, 0.5))
def get_SGD_trainer(self):
    """ Returns a plain SGD minibatch trainer with learning rate as param. """
    batch_x = T.fmatrix('batch_x')
    batch_y = T.ivector('batch_y')
    learning_rate = T.fscalar('lr')  # learning rate to use

    # compute the gradients with respect to the model parameters
    # using mean_cost so that the learning rate is not too dependent
    # on the batch size
    gparams = T.grad(self.mean_cost, self.params)

    # compute list of weights updates
    updates = OrderedDict()
    for param, gparam in zip(self.params, gparams):
        if self.max_norm:
            W = param - gparam * learning_rate
            col_norms = W.norm(2, axis=0)
            desired_norms = T.clip(col_norms, 0, self.max_norm)
            updates[param] = W * (desired_norms / (1e-6 + col_norms))
        else:
            updates[param] = param - gparam * learning_rate

    train_fn = theano.function(inputs=[theano.Param(batch_x),
                                       theano.Param(batch_y),
                                       theano.Param(learning_rate)],
                               outputs=self.mean_cost,
                               updates=updates,
                               givens={self.x: batch_x, self.y: batch_y})

    return train_fn
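# Usage sketch (not from the original source): `net` is a hypothetical model
# object exposing the get_SGD_trainer() method above; train_x / train_y are
# float32 / int32 numpy arrays. The learning rate is passed on every call
# because it is a symbolic fscalar input of the compiled trainer.
import numpy

train_fn = net.get_SGD_trainer()
batch_size, lr = 128, numpy.float32(0.01)
for epoch in range(10):
    costs = []
    for i in range(0, len(train_x), batch_size):
        costs.append(train_fn(train_x[i:i + batch_size],
                              train_y[i:i + batch_size],
                              lr))
    print('epoch %d, mean cost %f' % (epoch, numpy.mean(costs)))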
def __init__(self, name, path, learning_rate=0.001):
    self.r_symbol = T.fvector('r')
    self.gamma_symbol = T.fscalar('gamma')
    self.action_symbol = T.fmatrix('action')
    self.y_symbol = T.fvector('y')
    super(ReinforcementModel, self).__init__(
        name, path, learning_rate=learning_rate)
def __init__(self, n_in, n_classes, l2=None):
    # Model
    W = 0.01 * np.random.randn(n_in, n_classes).astype(dtype)
    b = 0.01 * np.random.randn(n_classes).astype(dtype)
    self.W = theano.shared(W, name='W')
    self.b = theano.shared(b, name='b')
    self.params = [self.W, self.b]

    self.input = T.fmatrix('input')
    self.y_true = T.ivector('y_true')
    self.y_hat = T.nnet.softmax(T.dot(self.input, self.W) + self.b)

    # Train
    self.loglikelihood = -T.log(self.y_hat[T.arange(self.y_hat.shape[0]), self.y_true])
    self.cost = T.mean(self.loglikelihood)
    if l2:
        for p in self.params:
            self.cost += l2 * T.sum(p**2)
    self.gradients = T.grad(self.cost, self.params)

    self.lr = T.fscalar('lr')
    updates = [(p, p - self.lr * g) for p, g in zip(self.params, self.gradients)]
    self.train = theano.function(inputs=[self.input, self.y_true, self.lr],
                                 outputs=self.cost,
                                 updates=updates,
                                 allow_input_downcast=True)

    # Predict
    self.y_predict = T.argmax(self.y_hat, axis=1)
    self.predict = theano.function(inputs=[self.input],
                                   outputs=self.y_predict,
                                   allow_input_downcast=True)
def get_adagrad_trainer(self):
    """ Returns an Adagrad (Duchi et al. 2010) trainer using a learning rate. """
    batch_x = T.fmatrix('batch_x')
    batch_y = T.ivector('batch_y')
    learning_rate = T.fscalar('lr')  # learning rate to use

    # compute the gradients with respect to the model parameters
    gparams = T.grad(self.mean_cost, self.params)

    # compute list of weights updates
    updates = OrderedDict()
    for accugrad, param, gparam in zip(self._accugrads, self.params, gparams):
        # c.f. Algorithm 1 in the Adadelta paper (Zeiler 2012)
        agrad = accugrad + gparam * gparam
        dx = - (learning_rate / T.sqrt(agrad + self._eps)) * gparam
        updates[param] = param + dx
        updates[accugrad] = agrad

    train_fn = theano.function(inputs=[theano.Param(batch_x),
                                       theano.Param(batch_y),
                                       theano.Param(learning_rate)],
                               outputs=self.mean_cost,
                               updates=updates,
                               givens={self.x: batch_x, self.y: batch_y})

    return train_fn
def compile(self, log_pxz, log_qpz, cost, a_pxz):
    batch_idx = T.iscalar()
    learning_rate = T.fscalar()

    updates, norm_grad = self.hp.optimizer(cost, self.params.values(), lr=learning_rate)

    self.outidx = {'cost': 0, 'cost_p': 1, 'cost_q': 2, 'norm_grad': 3}
    outputs = [cost, log_pxz, log_qpz]

    self.train = theano.function(
        inputs=[batch_idx, learning_rate],
        givens={self.X: self.data['tr_X'][batch_idx * self.hp.batch_size:(batch_idx + 1) * self.hp.batch_size]},
        outputs=outputs + [norm_grad],
        updates=updates)

    self.validate = theano.function(
        inputs=[batch_idx],
        givens={self.X: self.data['tr_X'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size]},
        outputs=outputs)

    self.test = theano.function(
        inputs=[batch_idx],
        givens={self.X: self.data['te_X'][batch_idx * self.hp.test_batch_size:(batch_idx + 1) * self.hp.test_batch_size]},
        outputs=outputs)

    n_samples = T.iscalar()
    if self.resample_z:
        self.data['ge_Z'] = srnd.normal((self.max_gen_samples, self.n_z),
                                        dtype=theano.config.floatX)
    else:
        self.data['ge_Z'] = shared(np.random.randn(self.max_gen_samples, self.n_z))

    self.decode = theano.function(
        inputs=[n_samples],
        givens={self.Z: self.data['ge_Z'][:n_samples]},
        outputs=a_pxz)
def __init__(self, t_layer_sizes, p_layer_sizes, dropout=0):
    self.t_layer_sizes = t_layer_sizes
    self.p_layer_sizes = p_layer_sizes

    # From our architecture definition, size of the notewise input
    self.t_input_size = 80

    # time network maps from notewise input size to various hidden sizes
    self.time_model = StackedCells(self.t_input_size, celltype=LSTM,
                                   layers=t_layer_sizes)
    self.time_model.layers.append(PassthroughLayer())

    # pitch network takes last layer of time model and state of last note, moving upward
    # and eventually ends with a two-element sigmoid layer
    p_input_size = t_layer_sizes[-1] + 2
    self.pitch_model = StackedCells(p_input_size, celltype=LSTM,
                                    layers=p_layer_sizes)
    self.pitch_model.layers.append(Layer(p_layer_sizes[-1], 2,
                                         activation=T.nnet.sigmoid))

    self.dropout = dropout

    self.conservativity = T.fscalar()
    self.srng = T.shared_randomstreams.RandomStreams(np.random.randint(0, 1024))

    print "model-setup::Trace-1"
    self.setup_train()
    print "model-setup::Trace-2"
    self.setup_predict()
    print "model-setup::Trace-3"
    self.setup_slow_walk()
def test_copy_delete_updates(self):
    w = T.iscalar('w')
    x = T.fscalar('x')
    # SharedVariable for tests, one of them has update
    y = theano.shared(value=1, name='y')
    z = theano.shared(value=2, name='z')

    out = x + y + z

    # Test for different linkers
    # for mode in ["FAST_RUN","FAST_COMPILE"]:
    # second_time = False
    for mode in ["FAST_RUN", "FAST_COMPILE"]:
        ori = theano.function([x], out, mode=mode, updates={z: z * 2})
        cpy = ori.copy(delete_updates=True)

        assert cpy(1)[0] == 4
        assert cpy(1)[0] == 4
        assert cpy(1)[0] == 4

    # Test if unused implicit and explicit inputs from delete_updates
    # are ignored as intended.
    for mode in ["FAST_RUN", "FAST_COMPILE"]:
        ori = theano.function([x], x, mode=mode, updates={z: z * 2})
        cpy = ori.copy(delete_updates=True)

        ori = theano.function([x, w], x, mode=mode, updates={z: z + w})
        cpy = ori.copy(delete_updates=True)
def test_copy_share_memory(self):
    x = T.fscalar('x')
    # SharedVariable for tests, one of them has update
    y = theano.shared(value=1)
    z = theano.shared(value=2)
    out = T.tanh((x + y + 2) / (x + z - 0.2)**2)

    # Test for different linkers
    for mode in ["FAST_RUN", "FAST_COMPILE"]:
        ori = theano.function([x], [out], mode=mode, updates={z: z + 1})
        cpy = ori.copy(share_memory=True)

        # Test if memories shared
        storage_map_ori = ori.fn.storage_map
        storage_map_cpy = cpy.fn.storage_map
        fgraph_cpy = cpy.maker.fgraph

        # Assert intermediate and Constants storages are shared,
        # and output storages are not shared
        i_o_variables = fgraph_cpy.inputs + fgraph_cpy.outputs
        ori_storages = storage_map_ori.values()
        l = [val for key, val in storage_map_cpy.items()
             if key not in i_o_variables or isinstance(key, theano.tensor.Constant)]
        for storage in l:
            self.assertTrue(any([storage is s for s in ori_storages]))

        # Assert storages of SharedVariable without updates are shared
        for (input, _1, _2), here, there in zip(ori.indices,
                                                ori.input_storage,
                                                cpy.input_storage):
            self.assertTrue(here.data is there.data)
def __build_backprop(self):

    y_init = self.outside_world.y_data_one_hot                      # initialize y = y_data
    h_init = my_op(2 * (T.dot(rho(y_init), self.W2.T) + self.bh))   # initialize h by backward propagation
    x_init = my_op(T.dot(rho(h_init), self.W1.T) + self.bx)         # initialize x by backward propagation

    Delta_y = y_init - self.y
    Delta_h = h_init - self.h
    Delta_x = x_init - self.x

    by_dot = T.mean(Delta_y, axis=0)
    W2_dot = T.dot(self.rho_h.T, Delta_y) / T.cast(self.x.shape[0], dtype=theano.config.floatX)
    bh_dot = T.mean(Delta_h, axis=0)
    W1_dot = T.dot(self.rho_x.T, Delta_h) / T.cast(self.x.shape[0], dtype=theano.config.floatX)
    bx_dot = T.mean(Delta_x, axis=0)

    alpha = T.fscalar('alpha')

    by_new = self.by + alpha * by_dot
    W2_new = self.W2 + alpha * W2_dot
    bh_new = self.bh + alpha * bh_dot
    W1_new = self.W1 + alpha * W1_dot
    bx_new = self.bx + alpha * bx_dot

    updates_states = [(self.x, x_init), (self.h, h_init), (self.y, y_init)]
    updates_params = [(self.by, by_new), (self.W2, W2_new), (self.bh, bh_new), (self.W1, W1_new)]

    backprop = theano.function(
        inputs=[alpha],
        outputs=[],
        updates=updates_states + updates_params
    )

    return backprop
def get_SAG_trainer(self, R=1., alpha=0., debug=False):  # alpha for reg. TODO
    batch_x = T.fmatrix('batch_x')
    batch_y = T.ivector('batch_y')
    ind_minibatch = T.iscalar('ind_minibatch')
    n_seen = T.fscalar('n_seen')

    # compute the gradients with respect to the model parameters
    cost = self.mean_cost
    gparams = T.grad(cost, self.params)
    #sparams = T.grad(cost, self.pre_activations)

    # SAG specific
    scaling = numpy.float32(1. / (R / 4. + alpha))

    updates = OrderedDict()
    for accugrad, gradient_memory, param, gparam in zip(
            self._accugrads, self._sag_gradient_memory,
            #self._accugrads, self._sag_gradient_memory[ind_minibatch.eval()],
            self.params, gparams):
        new = gparam + alpha * param
        agrad = accugrad + new - gradient_memory[ind_minibatch]
        # updates[gradient_memory[ind_minibatch]] = new
        updates[gradient_memory] = T.set_subtensor(gradient_memory[ind_minibatch], new)
        updates[param] = param - (scaling / n_seen) * agrad
        updates[accugrad] = agrad

    train_fn = theano.function(inputs=[theano.Param(batch_x),
                                       theano.Param(batch_y),
                                       theano.Param(ind_minibatch),
                                       theano.Param(n_seen)],
                               outputs=cost,
                               updates=updates,
                               givens={self.x: batch_x, self.y: batch_y})

    return train_fn
def __init__(self, input_dimensionality, output_dimensionality, params=None,
             learning_rate=0.0001, momentum=.25):
    self.input_dimensionality = input_dimensionality
    self.output_dimensionality = output_dimensionality
    self.learning_rate = learning_rate

    srng = theano.tensor.shared_randomstreams.RandomStreams(seed=1234)

    input_seq = T.fmatrix('input_seq')
    dropoutRate = T.fscalar('dropoutRate')

    if params is None:
        self.ff1 = FeedForwardLayer(input_seq, self.input_dimensionality, 2000,
                                    rng=srng, dropout_rate=dropoutRate)
        self.ff2 = FeedForwardLayer(self.ff1.output, 2000, 1000,
                                    rng=srng, dropout_rate=dropoutRate)
        self.ff3 = FeedForwardLayer(self.ff2.output, 1000, 800,
                                    rng=srng, dropout_rate=dropoutRate)
        self.rf = RecurrentLayer(self.ff3.output, 800, 500, False)  # Forward layer
        self.rb = RecurrentLayer(self.ff3.output, 800, 500, True)   # Backward layer

        # REVERSE THE BACKWARDS RECURRENT OUTPUTS IN TIME (from [T-1, 0] ===> [0, T-1])
        self.s = SoftmaxLayer(T.concatenate((self.rf.output, self.rb.output[::-1, :]), axis=1),
                              2 * 500, self.output_dimensionality)
    else:
        self.ff1 = FeedForwardLayer(input_seq, self.input_dimensionality, 2000,
                                    parameters=params[0], rng=srng, dropout_rate=dropoutRate)
        self.ff2 = FeedForwardLayer(self.ff1.output, 2000, 1000,
                                    parameters=params[1], rng=srng, dropout_rate=dropoutRate)
        self.ff3 = FeedForwardLayer(self.ff2.output, 1000, 800,
                                    parameters=params[2], rng=srng, dropout_rate=dropoutRate)
        self.rf = RecurrentLayer(self.ff3.output, 800, 500, False, parameters=params[3])  # Forward layer
        self.rb = RecurrentLayer(self.ff3.output, 800, 500, True, parameters=params[4])   # Backward layer

        # REVERSE THE BACKWARDS RECURRENT OUTPUTS IN TIME (from [T-1, 0] ===> [0, T-1])
        self.s = SoftmaxLayer(T.concatenate((self.rf.output, self.rb.output[::-1, :]), axis=1),
                              2 * 500, self.output_dimensionality, parameters=params[5])

    self.probabilities = theano.function(
        inputs=[input_seq, dropoutRate],
        outputs=[self.s.output],
        allow_input_downcast=True)
def train_linreg(X_train, y_train, eta, epochs):
    costs = []

    # Initialize arrays
    eta0 = T.fscalar('eta0')
    y = T.fvector(name='y')
    X = T.fmatrix(name='X')
    w = theano.shared(np.zeros(shape=(X_train.shape[1] + 1),
                               dtype=theano.config.floatX),
                      name='w')

    # calculate cost
    net_input = T.dot(X, w[1:]) + w[0]
    errors = y - net_input
    cost = T.sum(T.pow(errors, 2))

    # perform gradient update
    gradient = T.grad(cost, wrt=w)
    update = [(w, w - eta0 * gradient)]

    # compile model
    train = theano.function(inputs=[eta0],
                            outputs=cost,
                            updates=update,
                            givens={X: X_train, y: y_train})

    for _ in range(epochs):
        costs.append(train(eta))

    return costs, w
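# Usage sketch (not from the original source): toy float32 data so that the
# fmatrix/fvector givens above accept the arrays; assumes theano.config.floatX
# is 'float32' (w is created with floatX and scaled by eta0, an fscalar).
import numpy as np

X_toy = np.asarray([[1.0], [2.0], [3.0], [4.0]], dtype='float32')
y_toy = np.asarray([2.1, 3.9, 6.2, 8.1], dtype='float32')

costs, w = train_linreg(X_toy, y_toy, eta=0.01, epochs=20)
print(costs[-1])       # final sum-of-squared-errors cost
print(w.get_value())   # learned [bias, weight]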
def __init__(self, wordMatrix, shape, filters, rfilter, features, poolSize,
             time, categories, static, dropoutRate, learningRate, useVal, name):
    '''
    >>>initialize the model

    >>>type wordMatrix: matrix
    >>>para wordMatrix: input tensor

    >>>type shape: tuple or list of length 4
    >>>para shape: (batchSize,feature maps,sentenceLen,dimension)

    >>>type filters: tuple or list of 2-len tuple or list
    >>>para filters: the size of filters in each layer

    >>>type rfilter: tuple or list of 2-len tuple or list
    >>>para rfilter: the size of recurrent connection in each layer

    >>>type features: tuple or list of int
    >>>para features: num of feature maps in each layer

    >>>type poolSize: tuple or list of 2-len tuple or list
    >>>para poolSize: pooling size of each layer

    >>>type time: int
    >>>para time: the iteration times of recurrent connection

    >>>type categories: int
    >>>para categories: target categories

    >>>type static: boolean
    >>>para static: static wordVec or not

    >>>type dropoutRate: tuple or list of float
    >>>para dropoutRate: dropout rate of each layer

    >>>type learningRate: float
    >>>para learningRate: learning rate

    >>>type useVal: bool
    >>>para useVal: whether or not to use validation set

    >>>type name: str
    >>>para name: the name of the model
    '''
    self.learningRate = learningRate
    self.static = static
    self.name = name
    self.useVal = useVal

    self.batchSize, self.featureMaps, self.sentenceLen, self.wdim = shape
    self.categories = categories

    rng = np.random.RandomState(2011010539)

    self.x = T.matrix('x')
    self.y = T.ivector('y')
    self.lr = T.fscalar('lr')

    self.wordVec = theano.shared(wordMatrix, name='wordVec')

    input = self.wordVec[T.cast(self.x.flatten(), dtype='int32')].reshape(shape)

    self.deep = min(len(features), len(filters), len(poolSize))
    self.layers = []
    print 'This is a network of %i layer(s)' % self.deep

    for i in xrange(self.deep):
        if i == 0:
            layerSize = shape
            layerInput = input
            fmapIn = self.featureMaps
        else:
            layerSize = [
                self.batchSize, features[i - 1],
                (self.layers[-1].shape[2] - filters[i - 1][0] + 1) / poolSize[i - 1][0],
                (self.layers[-1].shape[3] - filters[i - 1][1] + 1) / poolSize[i - 1][1]
            ]
            layerInput = self.layers[-1].output
            fmapIn = features[i - 1]

        newlayer = DropoutConvPool(
            rng=rng,
            input=layerInput,
            shape=layerSize,
            filters=[features[i], fmapIn, filters[i][0], filters[i][1]],
            pool=poolSize[i],
            dropout=dropoutRate[i])
        self.layers.append(newlayer)

    classifierInputShape = [
        self.batchSize, features[self.deep - 1],
        (self.layers[-1].shape[2] - filters[self.deep - 1][0] + 1) / poolSize[self.deep - 1][0],
        (self.layers[-1].shape[3] - filters[self.deep - 1][1] + 1) / poolSize[self.deep - 1][1]
    ]
    self.classifier = LogisticRegression(
        input=self.layers[-1].output.flatten(2),
        n_in=np.prod(classifierInputShape[1:]),
        n_out=categories)

    self.params = self.classifier.param
    for i in xrange(self.deep):
        self.params += self.layers[i].param
    if static == False:
        self.params += [self.wordVec]

    weights = 0
    for item in self.classifier.param:
        weights += T.sum(T.sqr(item))

    self.cost = self.classifier.negative_log_likelyhood(self.y)
    self.errors = self.classifier.errors(self.y)

    self.sgdUpdate = sgd(self.params, self.cost, self.lr)
    self.sgdMomentumUpdate = sgdMomentum(self.params, self.cost, self.lr)
    self.adadeltaUpdate = AdadeltaUpdate(self.params, self.cost, self.lr)
    self.adadeltaMomentumUpdate = AdadeltaMomentumUpdate(
        params=self.params, cost=self.cost, stepSize=self.lr)

    self.sgdDelta = self.plotUpdate(self.sgdUpdate)
    self.sgdMomentumDelta = self.plotUpdate(self.sgdMomentumUpdate)
    self.adadeltaDelta = self.plotUpdate(self.adadeltaUpdate)
    self.adadeltaMomentumDelta = self.plotUpdate(self.adadeltaMomentumUpdate)

    print 'model %s constructed!' % name
def train_lasagne(self, learning_rate_value=0.2, learning_rate_decay=0.9999, num_epochs=4000):
    # Load the dataset
    self.saved_params = []
    print "Loading data..."

    learning_rate = T.fscalar('learning_rate')
    epoch = T.fscalar('epoch')

    # Create neural network model (depending on first command line parameter)
    print "Building model and compiling functions..."
    self.network = self.build_network(self.x)

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(self.network['prob'], deterministic=False)
    loss = lasagne.objectives.categorical_crossentropy(prediction, self.y)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    params = lasagne.layers.get_all_params(self.network['prob'], trainable=True)
    updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate, momentum=0.8)
    #updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate_value, beta1=0.9, beta2=0.999, epsilon=1e-08)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(self.network['prob'], deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, self.y)
    test_loss = test_loss.mean()

    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), self.y),
                      dtype=theano.config.floatX)

    ## Masking the gradients if masks are defined
    #if not self.mask_weights is None:
    #    for param in self.params[-4:-2]:
    #        if param.name == 'W':
    #            updates[param] *= self.mask_weights
    #        elif param.name == 'b':
    #            updates[param] *= self.mask_biases

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([self.x, self.y, learning_rate], loss,
                               updates=updates, on_unused_input='ignore')
    val_fn = theano.function([self.x, self.y], [test_loss, test_acc],
                             on_unused_input='ignore')

    # Loading the validation set
    self.prepare('valid')

    n_train_batches = self.nclasses * self.seq_per_class / self.batch_size
    n_valid_batches = self.nclasses * self.seq_per_class / self.batch_size

    # Finally, launch the training loop.
    print "Starting training..."

    # We iterate over epochs:
    self.best_validation_acc = -numpy.inf
    done_looping = False

    for epoch in range(num_epochs):
        self.prepare('train')

        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in self.iterate_minibatches(self.mocap_train, self.labels_train,
                                              self.batch_size, shuffle=True):
            inputs, targets = batch
            train_err += train_fn(inputs, targets, learning_rate=learning_rate_value)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in self.iterate_minibatches(self.mocap_valid, self.labels_valid,
                                              self.batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        this_validation_acc = val_acc
        #self._print_results(this_validation_loss, ts, iter_, learning_rate_value)

        if this_validation_acc > self.best_validation_acc:
            self.best_validation_acc = this_validation_acc
            f = open(self.filters_file, 'w')
            numpy.savez(f, *lasagne.layers.get_all_param_values(self.network['prob']))
            f.close()

        learning_rate_value = learning_rate_value * learning_rate_decay

        print "Epoch= %d Learning rate = %3.4f Validation Accuracy= %3.3f" % (
            epoch + 1, learning_rate_value, val_acc / val_batches * 100)
def test_ModelC_AllCNN(learning_rate=0.05, n_epochs=350, batch_size=200,
                       L2_reg=0.001, input_ndo_p=0.8, layer_ndo_p=0.5,
                       save_model=True, save_freq=50):
    """
    :type learning_rate: float
    :param learning_rate: learning rate used (factor for the stochastic gradient)

    :type n_epochs: int
    :param n_epochs: maximal number of epochs to run the optimizer

    :type batch_size: int
    :param batch_size: the number of training examples per batch
    """
    rng = numpy.random.RandomState(23455)

    datasets = load_data()

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training, validation and testing
    n_train_batches = train_set_x.get_value(borrow=True).shape[0]
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0]

    print 'n_train_batches: ', n_train_batches
    print 'n_valid_batches: ', n_valid_batches
    print 'n_test_batches: ', n_test_batches

    n_train_batches //= batch_size
    n_valid_batches //= batch_size
    n_test_batches //= batch_size

    learning_rate = numpy.asarray(learning_rate, dtype=numpy.float32)

    # allocate symbolic variables for the data
    index = T.lscalar()    # index to a [mini]batch
    lr = T.fscalar()
    training_enabled = T.iscalar('training_enabled')

    # start-snippet-1
    x = T.matrix('x')     # the data is presented as rasterized images
    y = T.ivector('y')    # the labels are presented as 1D vector of [int] labels

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print('... building the model')

    layer0_input = x.reshape((batch_size, 3, 32, 32))

    # drop the input only while training, don't drop while testing
    dropout_input = T.switch(T.neq(training_enabled, 0),
                             drop(layer0_input, p=input_ndo_p),
                             input_ndo_p * layer0_input)

    layer0 = myConvLayer(rng, is_train=training_enabled, input_data=dropout_input,
                         filter_shape=(96, 3, 5, 5), image_shape=(batch_size, 3, 32, 32),
                         ssample=(1, 1), bordermode='half', p=1.0)

    layer1 = myConvLayer(rng, is_train=training_enabled, input_data=layer0.output,
                         filter_shape=(96, 96, 3, 3), image_shape=(batch_size, 96, 32, 32),
                         ssample=(2, 2), bordermode='half', p=0.5)

    layer2 = myConvLayer(rng, is_train=training_enabled, input_data=layer1.output,
                         filter_shape=(192, 96, 5, 5), image_shape=(batch_size, 96, 16, 16),
                         ssample=(1, 1), bordermode='half', p=1.0)

    layer3 = myConvLayer(rng, is_train=training_enabled, input_data=layer2.output,
                         filter_shape=(192, 192, 3, 3), image_shape=(batch_size, 192, 16, 16),
                         ssample=(2, 2), bordermode='half', p=0.5)

    layer4 = myConvLayer(rng, is_train=training_enabled, input_data=layer3.output,
                         filter_shape=(192, 192, 3, 3), image_shape=(batch_size, 192, 8, 8),
                         ssample=(1, 1), bordermode='half', p=1.0)

    layer5 = myConvLayer(rng, is_train=training_enabled, input_data=layer4.output,
                         filter_shape=(192, 192, 1, 1), image_shape=(batch_size, 192, 8, 8),
                         ssample=(1, 1), bordermode='half', p=1.0)

    layer6 = myConvLayer(rng, is_train=training_enabled, input_data=layer5.output,
                         filter_shape=(10, 192, 1, 1), image_shape=(batch_size, 192, 8, 8),
                         ssample=(1, 1), bordermode='half', p=1.0)

    # make sure this is what global averaging does
    global_average = layer6.output.mean(axis=(2, 3))

    softmax_layer = SoftmaxWrapper(input_data=global_average, n_in=10, n_out=10)

    L2_sqr = ((layer0.W**2).sum() + (layer1.W**2).sum() + (layer2.W**2).sum() +
              (layer3.W**2).sum() + (layer4.W**2).sum() + (layer5.W**2).sum() +
              (layer6.W**2).sum())

    # the cost we minimize during training is the NLL of the model
    cost = (softmax_layer.negative_log_likelihood(y) + L2_reg * L2_sqr)

    # create a function to compute the mistakes that are made by the model
    test_model = theano.function(
        [index],
        softmax_layer.errors(y),
        givens={
            x: test_set_x[index * batch_size:(index + 1) * batch_size],
            y: test_set_y[index * batch_size:(index + 1) * batch_size],
            training_enabled: numpy.cast['int32'](0)
        })

    validate_model = theano.function(
        [index],
        softmax_layer.errors(y),
        givens={
            x: valid_set_x[index * batch_size:(index + 1) * batch_size],
            y: valid_set_y[index * batch_size:(index + 1) * batch_size],
            training_enabled: numpy.cast['int32'](0)
        })

    # create a list of all model parameters to be fit by gradient descent
    params = layer6.params + layer5.params + layer4.params + layer3.params + \
        layer2.params + layer1.params + layer0.params

    # train_model is a function that updates the model parameters by
    # SGD Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter. We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    momentum = theano.shared(numpy.cast[theano.config.floatX](0.9), name='momentum')
    updates = []
    for param in params:
        param_update = theano.shared(param.get_value() * numpy.cast[theano.config.floatX](0.))
        updates.append((param, param - lr * param_update))
        updates.append((param_update,
                        momentum * param_update +
                        (numpy.cast[theano.config.floatX](1.) - momentum) * T.grad(cost, param)))

    train_model = theano.function(
        [index, lr],
        cost,
        updates=updates,
        givens={
            x: train_set_x[index * batch_size:(index + 1) * batch_size],
            y: train_set_y[index * batch_size:(index + 1) * batch_size],
            training_enabled: numpy.cast['int32'](1)
        })
    # end-snippet-1

    ###############
    # TRAIN MODEL #
    ###############
    print('... training')

    # early-stopping parameters
    # patience = 10000  # look as this many examples regardless
    # patience_increase = 2  # wait this much longer when a new best is found
    # improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    # validation_frequency = min(n_train_batches, patience // 2)
    validation_frequency = n_train_batches // 2

    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    epoch = 0
    done_looping = False
    updateLRAfter = 200

    while (epoch < n_epochs) and (not done_looping):
        # shuffle data before starting the epoch
        epoch = epoch + 1
        if epoch > updateLRAfter:
            learning_rate *= 0.1
            updateLRAfter += 50

        print 'epoch: ', epoch
        print 'updateLRAfter: ', updateLRAfter
        print 'learning_rate: ', learning_rate

        for minibatch_index in range(n_train_batches):
            #print 'epoch: {0}, minibatch: {1}'.format(epoch, minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index

            if iter % 50 == 0:
                print('training @ iter = ', iter)

            cost_ij = train_model(minibatch_index, learning_rate)

            if (iter + 1) % validation_frequency == 0:
                # compute zero-one loss on validation set
                validation_losses = [validate_model(i) for i in range(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print('epoch %i, minibatch %i/%i, validation error %f %%' %
                      (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    # if this_validation_loss < best_validation_loss * improvement_threshold:
                    #     patience = max(patience, iter * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter

                    # test it on the test set
                    test_losses = [test_model(i) for i in range(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print(('     epoch %i, minibatch %i/%i, test error of '
                           'best model %f %%') %
                          (epoch, minibatch_index + 1, n_train_batches,
                           test_score * 100.))

            # if patience <= iter:
            #     done_looping = True
            #     break

        if save_model and epoch % save_freq == 0:
            # add model name to the file to differentiate different models
            with gzip.open('parameters_epoch_{0}.pklz'.format(epoch), 'wb') as fp:
                cPickle.dump([param.get_value() for param in params], fp, protocol=2)

    end_time = timeit.default_timer()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.2fm' % ((end_time - start_time) / 60.)), sys.stderr)
def __init__(self, We_initial, params): self.textfile = open(params.outfile, 'w') We = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden self.en_hidden_size = params.hidden_inf self.num_labels = params.num_labels self.de_hidden_size = params.de_hidden_size self.lstm_layers_num = 1 input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') target_var_in = T.imatrix(name='in_targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') length = T.iscalar() length0 = T.iscalar() t_t = T.fscalar() t_t0 = T.fscalar() Wyy0 = np.random.uniform( -0.02, 0.02, (self.num_labels + 1, self.num_labels + 1)).astype('float32') Wyy = theano.shared(Wyy0) l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We) else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden)) l_local = lasagne.layers.DenseLayer( l_reshape_concat, num_units=self.num_labels, nonlinearity=lasagne.nonlinearities.linear) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(Wyy) print len(network_params) f = open( 'POS_CRF_lstm_pretrain.Batchsize_10_dropout_0_LearningRate_0.1_1e-050_emb_0.pickle', 'r') data = pickle.load(f) f.close() for idx, p in enumerate(network_params): p.set_value(data[idx]) self.params = [] self.hos = [] self.Cos = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] ei, di, dt = T.imatrices(3) #place holders decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6) #### the last one is for the stary symbole self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform( self.num_labels + 1, self.de_hidden_size), borrow=True) self.linear = theano.shared( name="Linear", value=init_xavier_uniform( self.de_hidden_size + 2 * self.en_hidden_size, self.num_labels), borrow=True) self.linear_bias = theano.shared( name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, ) * 0., dtype=theano.config.floatX), borrow=True) #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True) #self.hidden_bias = theano.shared( # name="Hidden to Bias", # value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) , # borrow=True # ) input_var_shuffle = input_var.dimshuffle(1, 0) mask_var_shuffle = mask_var.dimshuffle(1, 0) target_var_in_shuffle = target_var_in.dimshuffle(1, 0) target_var_shuffle = target_var.dimshuffle(1, 0) self.params += [self.linear, self.linear_bias, self.de_lookuptable] #concatenate state_below = We[input_var_shuffle.flatten()].reshape( (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize)) enclstm_f = LSTM(embsize, self.en_hidden_size) enclstm_b = LSTM(embsize, self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle) hs_b, Cs_b = 
enclstm_b.forward(state_below, mask_var_shuffle) hs = T.concatenate([hs_f, hs_b], axis=2) Cs = T.concatenate([Cs_f, Cs_b], axis=2) hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1) Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1) #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias), #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias), self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size), self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size), Encoder = hs ei, di, dt = T.imatrices(3) #place holders em, dm, tf, di0 = T.fmatrices(4) self.encoder_function = theano.function(inputs=[ei, em], outputs=Encoder, givens={ input_var: ei, mask_var: em }) state_below = self.de_lookuptable[ target_var_in_shuffle.flatten()].reshape( (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size, self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co) decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2) linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, :] softmax_outputs, updates = theano.scan( fn=lambda x: T.nnet.softmax(x), sequences=[linear_outputs], ) def _NLL(pred, y, m): return -m * T.log(pred[T.arange(input_var.shape[0]), y]) def _step2(ctx_, state_, hs_, Cs_): hs, Cs = [], [] token_idxs = T.cast(state_.argmax(axis=-1), "int32") msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1) msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape( (1, ctx_.shape[0], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs) state_below0 = state_below0.reshape( (ctx_.shape[0], self.de_hidden_size)) state_below0 = T.concatenate([ctx_, state_below0], axis=1) newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :] state_below = T.nnet.softmax(newpred) extra_p = T.zeros_like(hs[:, :, 0]) state_below = T.concatenate([state_below, extra_p.T], axis=1) return state_below, hs, Cs ctx_0, state_0 = T.fmatrices(2) hs_0 = T.ftensor3() Cs_0 = T.ftensor3() state_below_tmp, hs_tmp, Cs_tmp = _step2(ctx_0, state_0, hs_0, Cs_0) self.f_next = theano.function([ctx_0, state_0, hs_0, Cs_0], [state_below_tmp, hs_tmp, Cs_tmp], name='f_next') hs0, Cs0 = T.as_tensor_variable( self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=input_var_shuffle.shape[0]) predy = train_outputs[0].dimshuffle(1, 0, 2) predy = predy[:, :, :-1] * mask_var[:, :, None] predy0 = predy.reshape((-1, self.num_labels)) def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy): """ :param targets_one_step: [batch_size, t] :param prev_label: [batch_size, t] :param tg_energy: [batch_size] :return: """ new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1]) new_ta_energy_t = tg_energy + T.sum( new_ta_energy * targets_one_step, axis=1) tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy) return [targets_one_step, tg_energy_t] 
local_energy = lasagne.layers.get_output(l_local, { l_in_word: input_var, l_mask_word: mask_var }) local_energy = local_energy.reshape((-1, length, self.num_labels)) local_energy = local_energy * mask_var[:, :, None] ##################### # for the end symbole of a sequence #################### end_term = Wyy[:-1, -1] local_energy = local_energy + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var}) predy_in = T.argmax(predy0, axis=1) A = T.extra_ops.to_one_hot(predy_in, self.num_labels) A = A.reshape((-1, length, self.num_labels)) #predy = predy0.reshape((-1, length, 25)) #predy = predy*mask_var[:,:,None] targets_shuffled = predy.dimshuffle(1, 0, 2) target_time0 = targets_shuffled[0] masks_shuffled = mask_var.dimshuffle(1, 0) initial_energy0 = T.dot(target_time0, Wyy[-1, :-1]) initials = [target_time0, initial_energy0] [_, target_energies], _ = theano.scan( fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]]) cost11 = target_energies[-1] + T.sum( T.sum(local_energy * predy, axis=2) * mask_var, axis=1) # compute the ground-truth energy targets_shuffled0 = A.dimshuffle(1, 0, 2) target_time00 = targets_shuffled0[0] initial_energy00 = T.dot(target_time00, Wyy[-1, :-1]) initials0 = [target_time00, initial_energy00] [_, target_energies0], _ = theano.scan( fn=inner_function, outputs_info=initials0, sequences=[targets_shuffled0[1:], masks_shuffled[1:]]) cost110 = target_energies0[-1] + T.sum( T.sum(local_energy * A, axis=2) * mask_var, axis=1) #predy_f = predy.reshape((-1, 25)) y_f = target_var.flatten() if (params.annealing == 0): lamb = params.L3 elif (params.annealing == 1): lamb = params.L3 * (1 - 0.01 * t_t) if (params.regutype == 0): ce_hinge = lasagne.objectives.categorical_crossentropy( predy0 + eps, y_f) ce_hinge = ce_hinge.reshape((-1, length)) ce_hinge = T.sum(ce_hinge * mask_var, axis=1) cost = T.mean(-cost11) + lamb * T.mean(ce_hinge) else: entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1) entropy_term = entropy_term.reshape((-1, length)) entropy_term = T.sum(entropy_term * mask_var, axis=1) cost = T.mean(-cost11) - lamb * T.mean(entropy_term) """ f = open('F0_simple.pickle') PARA = pickle.load(f) f.close() l2_term = sum(lasagne.regularization.l2(x-PARA[index]) for index, x in enumerate(a_params)) cost = T.mean(-cost11) + params.L2*l2_term """ #from adam import adam #updates_a = adam(cost, self.params, params.eta) #updates_a = lasagne.updates.sgd(cost, self.params, params.eta) #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9) from momentum import momentum updates_a = momentum(cost, self.params, params.eta, momentum=0.9) if (params.regutype == 0): self.train_fn = theano.function( inputs=[ei, dt, em, em1, length0, t_t0, di0], outputs=[cost, ce_hinge], updates=updates_a, on_unused_input='ignore', givens={ input_var: ei, target_var: dt, mask_var: em, mask_var1: em1, length: length0, t_t: t_t0, decoderInputs0: di0 }) #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates = updates_a, on_unused_input='ignore') else: self.train_fn = theano.function( inputs=[ei, dt, em, em1, length0, t_t0, di0], outputs=[cost, entropy_term], updates=updates_a, on_unused_input='ignore', givens={ input_var: ei, target_var: dt, mask_var: em, mask_var1: em1, length: length0, t_t: t_t0, decoderInputs0: di0 }) #self.train_fn = theano.function([input_var, target_var, 
mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore') prediction = T.argmax(predy, axis=2) corr = T.eq(prediction, target_var) corr_train = (corr * mask_var).sum(dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) self.eval_fn = theano.function( inputs=[ei, dt, em, em1, length0, di0], outputs=[cost11, cost110, corr_train, num_tokens, prediction], on_unused_input='ignore', givens={ input_var: ei, target_var: dt, mask_var: em, mask_var1: em1, length: length0, decoderInputs0: di0 })
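# A tiny self-contained numpy sketch (illustrative values, not from the code
# above) of the masked token accuracy that eval_fn's corr_train / num_tokens
# outputs correspond to: only positions where the mask is 1 are counted.
import numpy as np
pred = np.array([[1, 2, 0], [3, 3, 0]])
gold = np.array([[1, 2, 2], [3, 1, 0]])
mask = np.array([[1., 1., 0.], [1., 1., 0.]], dtype='float32')  # 0 marks padding
acc = ((pred == gold) * mask).sum() / mask.sum()
print(acc)  # 3 correct out of 4 real tokens -> 0.75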
def build_finetune_functions(self, train_shared_xy, valid_shared_xy, batch_size): (train_set_x, train_set_y) = train_shared_xy (valid_set_x, valid_set_y) = valid_shared_xy # compute number of minibatches for training, validation and testing n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] n_valid_batches /= batch_size index = T.lscalar('index') # index to a [mini]batch learning_rate = T.fscalar('learning_rate') momentum = T.fscalar('momentum') layer_size = len(self.params) lr_list = [] for i in xrange(layer_size): lr_list.append(learning_rate) ##top 2 layers use a smaller learning rate if layer_size > 4: for i in range(layer_size-4, layer_size): lr_list[i] = learning_rate * 0.5 # compute list of fine-tuning updates # compute the gradients with respect to the model parameters gparams = T.grad(self.finetune_cost, self.params) if self.use_rprop == 0: updates = OrderedDict() layer_index = 0 for dparam, gparam in zip(self.delta_params, gparams): updates[dparam] = momentum * dparam - gparam * lr_list[layer_index] layer_index += 1 for dparam, param in zip(self.delta_params, self.params): updates[param] = param + updates[dparam] train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001), theano.Param(momentum, default = 0.5)], outputs=self.errors, updates=updates, on_unused_input='ignore', givens={self.x: train_set_x[index * batch_size: (index + 1) * batch_size], self.y: train_set_y[index * batch_size: (index + 1) * batch_size]}) elif self.use_rprop: updates = compile_RPROP_train_function(self, gparams) ## retain learning rate and momentum to make interface backwards compatible, ## but we won't use them, means we have to use on_unused_input='warn'. ## Otherwise same function for RPROP or otherwise -- can move this block outside if clause. train_fn = theano.function(inputs=[index, theano.Param(learning_rate, default = 0.0001), theano.Param(momentum, default = 0.5)], outputs=self.errors, updates=updates, on_unused_input='warn', givens={self.x: train_set_x[index * batch_size: (index + 1) * batch_size], self.y: train_set_y[index * batch_size: (index + 1) * batch_size]}) valid_fn = theano.function([], outputs=self.errors, on_unused_input='ignore', givens={self.x: valid_set_x, self.y: valid_set_y}) valid_score_i = theano.function([index], outputs=self.errors, on_unused_input='ignore', givens={self.x: valid_set_x[index * batch_size: (index + 1) * batch_size], self.y: valid_set_y[index * batch_size: (index + 1) * batch_size]}) # Create a function that scans the entire validation set def valid_score(): return [valid_score_i(i) for i in xrange(n_valid_batches)] return train_fn, valid_fn
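# A minimal, self-contained sketch of the momentum update pattern used in
# build_finetune_functions above; the names W, delta_W, g, lr and mom are
# illustrative stand-ins, not taken from the original model.
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.zeros(3, dtype='float32'), name='W')
delta_W = theano.shared(np.zeros(3, dtype='float32'), name='delta_W')
g = T.fvector('g')    # stand-in for a gradient
lr = T.fscalar('lr')
mom = T.fscalar('mom')

new_delta = mom * delta_W - lr * g                    # velocity update
updates = [(delta_W, new_delta), (W, W + new_delta)]  # then move the weights
momentum_step = theano.function([g, lr, mom], W, updates=updates)
# each call updates the velocity and weights in place, e.g.:
# momentum_step(np.ones(3, dtype='float32'), 0.1, 0.5)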
l1_6 = ConvKeepLayer(rng, l1_5a, (2 * OUT + 8, 24 + 2, 5, 5)) l1_7 = ConvKeepLayer(rng, l1_6, (OUT + 1, 2 * OUT + 8, 9, 9), Nonlinear=False) l1_7sm = LogSoftmaxLayer(l1_7) lout = LabelLoss(l1_7sm, l1) model = Model(l1, l2, l4, l8, l1_2, l2_2, l4_2, l8_2, l1_2s, l2_2s, l4_2s, l2_2e, l4_2e, l8_2e, l1_2a, l2_2a, l4_2a, l8_2a, l1_3, l2_3, l4_3, l8_3, l1_3s, l2_3s, l4_3s, l2_3e, l4_3e, l8_3e, l1_3a, l2_3a, l4_3a, l8_3a, l1_4, l2_4, l4_4, l8_4, l1_4s, l2_4s, l4_4s, l2_4e, l4_4e, l8_4e, l1_4a, l2_4a, l4_4a, l8_4a, l1_5, l2_5, l4_5, l8_5, l8_5e, l4_5a, l4_6, l4_6e, l2_5a, l2_6, l2_6e, l1_5a, l1_6, l1_7, l1_7sm, lout) a, b = T.fscalar(), T.fscalar() obinary, _tp, _fp, _tn, _fn, _F = binaryloss_label(l1_7.output, lout.output, 0, a, b) #4.6, 1.41) cost = lout.loss params = model.params() momentums = model.pmomentum() grads = T.grad(cost, params) updates = [] updating = 0.0 for grad, momentum in zip(grads, momentums): updates.append((momentum, MOMENTUM * momentum - LEARN_RATE * grad)) updating = updating + T.sum(abs(momentum))
def __init__(self, dim_z, x_train, x_test, diff=None, magic=5000): ####################################### SETTINGS ################################### self.x_train = x_train self.x_test = x_test self.diff = diff self.batch_size = 100. self.learning_rate = theano.shared(np.float32(0.0008)) self.momentum = 0.3 self.performance = {"train": [], "test": []} self.inpt = T.ftensor4(name='input') self.df = T.fmatrix(name='differential') self.dim_z = dim_z self.generative_z = theano.shared(np.float32(np.zeros([1, dim_z]))) self.activation = relu self.generative = False self.out_distribution = False #self.y = T.matrix(name="y") self.in_filters = [5, 5, 5] self.filter_lengths = [10., 10., 10.] self.params = [] #magic = 73888. self.magic = magic self.dropout_symbolic = T.fscalar() self.dropout_prob = theano.shared(np.float32(0.0)) ####################################### LAYERS ###################################### # LAYER 1 ############################## self.conv1 = one_d_conv_layer(self.inpt, self.in_filters[0], 1, self.filter_lengths[0], param_names=["W1", 'b1']) self.params += self.conv1.params self.bn1 = batchnorm(self.conv1.output) self.nl1 = self.activation(self.bn1.X) self.maxpool1 = ds.max_pool_2d(self.nl1, [3, 1], st=[2, 1], ignore_border=False).astype( theano.config.floatX) self.layer1_out = dropout(self.maxpool1, self.dropout_symbolic) #self.layer1_out = self.maxpool1 # LAYER2 ################################ self.flattened = T.flatten(self.layer1_out, outdim=2) # Variational Layer ##################### self.latent_layer = variational_gauss_layer(self.flattened, self.magic, dim_z) self.params += self.latent_layer.params self.latent_out = self.latent_layer.output # Hidden Layer ######################### self.hidden_layer = hidden_layer(self.latent_out, dim_z, self.magic) self.params += self.hidden_layer.params self.hid_out = dropout( self.activation(self.hidden_layer.output).reshape( (self.inpt.shape[0], self.in_filters[-1], int(self.magic / self.in_filters[-1]), 1)), self.dropout_symbolic) # Devonvolutional 1 ###################### self.deconv1 = one_d_deconv_layer(self.hid_out, 1, self.in_filters[2], self.filter_lengths[2], pool=2., param_names=["W3", 'b3'], distribution=False) self.params += self.deconv1.params #self.nl_deconv1 = dropout(self.activation(self.deconv1.output),self.dropout_symbolic) self.tanh_out = self.deconv1.output self.last_layer = self.deconv1 if self.out_distribution == True: self.trunk_sigma = self.last_layer.log_sigma[:, :, :self.inpt. 
shape[2], :] self.trunc_output = self.tanh_out[:, :, :self.inpt.shape[2], :] ################################### FUNCTIONS ###################################################### self.get_latent_states = theano.function( [self.inpt], self.latent_out, givens=[[self.dropout_symbolic, self.dropout_prob]]) #self.prior_debug = theano.function([self.inpt],[self.latent_out,self.latent_layer.mu_encoder,self.latent_layer.log_sigma_encoder,self.latent_layer.prior]) #self.get_prior = theano.function([self.inpt],self.latent_layer.prior) #self.convolve1 = theano.function([self.inpt],self.layer1_out) #self.convolve2 = theano.function([self.inpt],self.layer2_out) self.output = theano.function( [self.inpt], self.trunc_output, givens=[[self.dropout_symbolic, self.dropout_prob]]) self.get_flattened = theano.function( [self.inpt], self.flattened, givens=[[self.dropout_symbolic, self.dropout_prob]]) #self.deconvolve1 = theano.function([self.inpt],self.deconv1.output) #self.deconvolve2 = theano.function([self.inpt],self.deconv2.output) #self.sig_out = theano.function([self.inpt],T.flatten(self.trunk_sigma,outdim=2)) self.output = theano.function( [self.inpt], self.trunc_output, givens=[[self.dropout_symbolic, self.dropout_prob]]) #self.generate_from_z = theano.function([self.inpt],self.trunc_output,givens = [[self.latent_out,self.generative_z]]) self.generate_from_z = theano.function( [self.inpt], self.trunc_output, givens=[[self.dropout_symbolic, self.dropout_prob], [self.latent_out, self.generative_z]]) self.cost = self.MSE() self.mse = self.MSE() #self.likelihood = self.log_px_z() #self.get_cost = theano.function([self.inpt],[self.cost,self.mse]) #self.get_likelihood = theano.function([self.layer1.inpt],[self.likelihood]) self.derivatives = T.grad(self.cost, self.params) #self.get_gradients = theano.function([self.inpt],self.derivatives) self.updates = adam(self.params, self.derivatives, self.learning_rate) #self.updates =momentum_update(self.params,self.derivatives,self.learning_rate,self.momentum) self.train_model = theano.function( inputs=[self.inpt, self.df], outputs=self.cost, updates=self.updates, givens=[[self.dropout_symbolic, self.dropout_prob]])
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import theano
import theano.tensor as T

kappa = T.fscalar()
rho = T.fscalar()
beta1 = T.fscalar()
beta2 = T.fscalar()
h = T.fscalar()
N = T.iscalar()
x = T.fvector()


def f(X):
    X_ = T.zeros_like(X)
    X_ = T.set_subtensor(X_[1], (1.0 - T.exp(X[0])) * kappa)
    X_ = T.set_subtensor(X_[2], -T.exp(X[0]) * rho * (beta2 * X[0] + X[2]))
    X_ = T.set_subtensor(
        X_[0],
        T.exp(X[0]) * ((beta1 - 1.0) * X[0] + X[1] - X[2]) + X_[1] - X_[2])
    return X_


def step(X):
    k1 = h * f(X)
    k2 = h * f(X + 0.5 * k1)
    k3 = h * f(X + 0.5 * k2)
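    # --- Not in the original: the snippet is cut off after k3. A classical
    # --- fourth-order Runge-Kutta step would presumably continue as below;
    # --- this completion is an assumption about the author's intent.
    k4 = h * f(X + k3)
    return X + (k1 + 2.0 * k2 + 2.0 * k3 + k4) / 6.0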
#!/usr/bin/python
from matplotlib import rc
from pylab import *
from theano import *
import theano.tensor as T
import numpy

rc('text', usetex=True)
rc('font', family='serif')

x = T.fvector('x')
x1 = T.fscalar('x1')
y = 1 / (1 + T.exp(-x))
y1 = 1 / (1 + T.exp(-x1))
logistic = function([x], y)
logistic1 = function([x1], y1)
grady = T.grad(y1, x1)
derivada = function([x1], grady)

a = float(input('Enter the left endpoint. \n'))
b = float(input('Enter the right endpoint. \n'))
particion = float(input('Enter the partition length of the interval. \n'))
pderiv = float(input('Enter the point at which to find the tangent line. \n'))

xval = arange(a, b, particion, dtype='float32')
z, w, w1 = T.fscalars('z', 'w', 'w1')
rectatg2 = (x - z) * w + w1
rectatg3 = function([x, Param(z, default=pderiv),
                     Param(w, default=derivada(pderiv)),
                     Param(w1, default=logistic1(pderiv))], rectatg2)
figure(1)
# tempens model variables:
z_target_var = T.matrix('z_targets')
mask_train = T.vector('mask_train')
unsup_weight_var = T.scalar('unsup_weight')
learning_rate_var = T.scalar('learning_rate')
adam_beta1_var = T.scalar('adam_beta1')
# # Left sdp length
# left_sdp_length = T.imatrix('left_sdp_length')
# # Sentence lengths
# sen_length = T.imatrix('sen_length')
# negative loss
negative_loss_alpha = T.fvector("negative_loss_alpha")
negative_loss_lamda = T.fscalar("negative_loss_lamda")

"""
2. Build GRU network (trained with ADAM)
"""
gru_network, l_in, l_mask, l_gru_forward, l_split_cnn = model.bulit_gru(input_var, mask_var)

# mask_train_input: entries with "1" are passed through (used for the
# supervised loss); entries with "0" are masked out.
mask_train_input = pro_data.mask_train_input(training_label, num_labels=model.num_labels)

# Create a loss expression for training, i.e., a scalar objective we want
# to minimize (for our multi-class problem, it is the cross-entropy loss):
prediction = lasagne.layers.get_output(gru_network)
l_gru = lasagne.layers.get_output(l_gru_forward)
def __init__(self, x_dim, hidden_dim, y_dim, w_spread, p_drop): # parameters of the model self.wx = theano.shared( name="wx", value=w_spread * np.random.uniform(-1., 1., (x_dim + hidden_dim + 1, hidden_dim)).astype( theano.config.floatX), borrow=True) self.hx_0 = theano.shared(name="hx_0", value=np.zeros(hidden_dim, dtype=theano.config.floatX), borrow=True) self.w1 = theano.shared( name="w1", value=w_spread * np.random.uniform(-1., 1., (hidden_dim + hidden_dim + 1, hidden_dim)).astype(theano.config.floatX), borrow=True) self.h1_0 = theano.shared(name="h1_0", value=np.zeros(hidden_dim, dtype=theano.config.floatX), borrow=True) self.wy = theano.shared( name="wy", value=w_spread * np.random.uniform( -1., 1., (hidden_dim + 1, y_dim)).astype(theano.config.floatX), borrow=True) # bundle #self.params = [self.wx, self.hx_0, self.w1, self.h1_0, self.wy] self.params = [self.wx, self.w1, self.wy] # define recurrent neural network # (for each input word predict all output tags) x = T.fmatrix("x") y = T.fmatrix("y") learn_rate = T.fscalar('learn_rate') activation = T.tanh #activation = T.nnet.sigmoid #activation = lambda x: x * (x > 0) # reLU #activation = lambda x: x * ((x > 0) + 0.01) #activation = lambda x: T.minimum(x * (x > 0), 6) # capped reLU def model(x, wx, hx_0, w1, h1_0, wy, p_drop): def recurrence(x_cur, hx_prev, h1_prev, masks): one = np.float32(1.) hx = activation( T.dot(T.concatenate([x_cur, hx_prev, [one]]), wx)) hx_ = dropout_apply(hx, masks[0], p_drop) h1 = activation(T.dot(T.concatenate([hx_, h1_prev, [one]]), w1)) h1_ = dropout_apply(h1, masks[1], p_drop) y_pred = activation(T.dot(T.concatenate([h1_, [one]]), wy)) return (hx, h1, y_pred) if p_drop > 0.: masks = dropout_masks(p_drop, [hx_0.shape, h1_0.shape]) else: masks = [] (_, _, y_pred), _ = theano.scan(fn=recurrence, sequences=x, non_sequences=[masks], outputs_info=[hx_0, h1_0, None], n_steps=x.shape[0]) return y_pred y_pred = model(x, self.wx, self.hx_0, self.w1, self.h1_0, self.wy, 0.) y_noise = model(x, self.wx, self.hx_0, self.w1, self.h1_0, self.wy, p_drop) #loss = lambda y_pred, y: T.mean((y_pred - y) ** 2) # MSE #loss = lambda y_pred, y: T.sum((y_pred - y) ** 16) ** (1./16) #loss = lambda y_pred, y: T.max((y_pred - y) ** 2) loss = lambda y_pred, y: T.max(abs(y - y_pred)) + T.mean( (y - y_pred)**2) #loss = lambda y_pred, y: T.sum((y_pred - y) ** 16) ** (1./16) + T.mean((y - y_pred) ** 2) l1_reg = 0.001 l1 = T.mean(self.wx) + T.mean(self.w1) + T.mean(self.wy) l2_reg = 0.001 l2 = T.mean(self.wx**2) + T.mean(self.w1**2) + T.mean(self.wy**2) # define gradients and updates cost = loss(y_noise, y) + l1_reg * l1 + l2_reg * l2 #updates = sgd(cost, self.params, learn_rate) #updates = rmsprop(cost, self.params, learn_rate) updates = adam(cost, self.params, learn_rate) # compile theano functions self.predict = theano.function(inputs=[x], outputs=y_pred) self.train = theano.function( inputs=[x, y, learn_rate], outputs=[cost, T.min(y_noise), T.max(y_noise), T.mean(y_noise)], updates=updates)
def train_validate_test(self, trainSet, validateSet, testSet, nEpoch): ''' >>>train and test the model >>>type trainSet/validateSet/testSet: dict >>>para trainSet/validateSet/testSet: train/validate/test set >>>type nEpoch: int >>>para nEpoch: maximum iteration epoches ''' trainSize = trainSet['x'].shape[0] validateSize = validateSet['x'].shape[0] testSize = testSet['x'].shape[0] trainX = theano.shared(trainSet['x'], borrow=True) trainY = theano.shared(trainSet['y'], borrow=True) trainY = T.cast(trainY, 'int32') validateX = theano.shared(validateSet['x'], borrow=True) validateY = theano.shared(validateSet['y'], borrow=True) validateY = T.cast(validateY, 'int32') testX = testSet['x'] testY = np.asarray(testSet['y'], 'int32') trainBatches = trainSize / self.batchSize validateBatches = validateSize / self.batchSize index = T.iscalar('index') learnRate = T.fscalar('lr') stepSize = T.fscalar('lr') sgdTrainModel = theano.function( [index, learnRate], [self.cost, self.sgdDelta[0], self.sgdDelta[1], self.sgdDelta[2]], updates=self.sgdUpdate, givens={ self.x: trainX[index * self.batchSize:(index + 1) * self.batchSize], self.y: trainY[index * self.batchSize:(index + 1) * self.batchSize], self.lr: learnRate }) print 'SGD TrainModel Constructed!' sgdMomentumTrainModel = theano.function( [index, learnRate], [ self.cost, self.sgdMomentumDelta[0], self.sgdMomentumDelta[1], self.sgdMomentumDelta[2] ], updates=self.sgdMomentumUpdate, givens={ self.x: trainX[index * self.batchSize:(index + 1) * self.batchSize], self.y: trainY[index * self.batchSize:(index + 1) * self.batchSize], self.lr: learnRate }) print 'SGD-Momentum TrainModel Constructed!' adadeltaTrainModel = theano.function( [index, stepSize], [ self.cost, self.adadeltaDelta[0], self.adadeltaDelta[1], self.adadeltaDelta[2] ], updates=self.adadeltaUpdate, givens={ self.x: trainX[index * self.batchSize:(index + 1) * self.batchSize], self.y: trainY[index * self.batchSize:(index + 1) * self.batchSize], self.lr: stepSize }) print 'Adadelta TrainModel Constructed!' adadeltaMomentumTrainModel = theano.function( [index, stepSize], [ self.cost, self.adadeltaMomentumDelta[0], self.adadeltaMomentumDelta[1], self.adadeltaMomentumDelta[2] ], updates=self.adadeltaMomentumUpdate, givens={ self.x: trainX[index * self.batchSize:(index + 1) * self.batchSize], self.y: trainY[index * self.batchSize:(index + 1) * self.batchSize], self.lr: stepSize }) print 'Adadelta(with momentum) TrainModel Constructed!' validateModel = theano.function( [index], self.errors, givens={ self.x: validateX[index * self.batchSize:(index + 1) * self.batchSize], self.y: validateY[index * self.batchSize:(index + 1) * self.batchSize] }) print 'Validation Model Constructed!' testTrain = theano.function( [index], [self.cost, self.errors], givens={ self.x: trainX[index * self.batchSize:(index + 1) * self.batchSize], self.y: trainY[index * self.batchSize:(index + 1) * self.batchSize] }) print 'Test Model on Training Set Constructed!' testInput = self.wordVec[T.cast(self.x.flatten(), dtype='int32')].reshape( (testSize, self.featureMaps, self.sentenceLen, self.wdim)) testOutput = 0 for i in xrange(self.deep): testOutput = self.layers[i].process(testInput, testSize) testInput = testOutput testClassifierInput = testInput.flatten(2) testPredict = self.classifier.predictInstance(testClassifierInput) testError = T.mean(T.neq(testPredict, self.y)) testModel = theano.function([self.x, self.y], [testPredict, testError]) print 'Testing Model Constructed!' 
epoch = 0 maxEpoch = 5.0 learningRate = self.learningRate steppingSize = 1.0 localOpt = 0 bestTestAcc = 0.0 bestValAcc = 0.0 finalAcc = 0.0 self.trainAccs = [] self.validateAccs = [] self.testAccs = [] self.costValues = [] self.result = {'minError': 1.00, 'finalAcc': 0.00, 'bestValAcc': 0.00} testPredict = np.zeros(shape=(testSize, ), dtype='int32') testMatrix = np.zeros(shape=(self.categories, self.categories), dtype='int32') while epoch < nEpoch: epoch += 1 num = 0 for minBatch in np.random.permutation(range(trainBatches)): cost, dmax, dmin, dmean = adadeltaTrainModel( minBatch, learningRate) #cost=sgdMomentumTrainModel(minBatch,self.learningRate) #adadeltaMomentumTrainModel(minBatch,self.learningRate) x = float(epoch) + float(num + 1) / float(trainBatches) - 1 if num % 50 == 0: trainResult = [testTrain(i) for i in xrange(trainBatches)] trainCost, trainError = np.mean(trainResult, axis=0) trainAcc = 1 - trainError self.costValues.append({'x': x, 'value': trainCost}) self.trainAccs.append({'x': x, 'acc': trainAcc}) if self.useVal: validateError = [ validateModel(i) for i in xrange(validateBatches) ] validateAcc = 1 - np.mean(validateError) self.validateAccs.append({'x': x, 'acc': validateAcc}) print 'Epoch=%i,Num=%i,TrainAcc=%f%%,ValidateAcc=%f%%' % ( epoch, num, trainAcc * 100., validateAcc * 100.) else: print 'Epoch=%i,Num=%i,TrainAcc=%f%%' % ( epoch, num, trainAcc * 100.) print 'costValue=%f, learningRate=%f' % (trainCost, self.learningRate) testPredict, testError = testModel(testX, testY) assert len(testPredict) == len(testY) testMatrix = np.zeros(shape=(self.categories, self.categories), dtype='int32') for case in xrange(len(testY)): testMatrix[testY[case], testPredict[case]] += 1 testAcc = 1 - testError self.testAccs.append({'x': x, 'acc': testAcc}) print 'TestAcc=%f%%' % (testAcc * 100.) if self.useVal and validateAcc > bestValAcc: bestValAcc = validateAcc bestTestAcc = max(bestTestAcc, testAcc) finalAcc = testAcc localOpt = 0 maxEpoch = max(maxEpoch, epoch * 1.5) self.result = { 'minError': 1 - bestTestAcc, 'finalAcc': finalAcc, 'bestValAcc': bestValAcc } elif not self.useVal: bestTestAcc = max(bestTestAcc, testAcc) finalAcc = testAcc self.result = { 'minError': 1 - bestTestAcc, 'finalAcc': finalAcc, 'bestValAcc': bestValAcc } print 'BestValAcc=%f%%,BestTestAcc=%f%%,FinalAcc=%f%%' % ( bestValAcc * 100., bestTestAcc * 100., finalAcc * 100.) num += 1 x = float(epoch) trainResult = [testTrain(i) for i in xrange(trainBatches)] trainCost, trainError = np.mean(trainResult, axis=0) trainAcc = 1 - trainError self.costValues.append({'x': x, 'value': trainCost}) self.trainAccs.append({'x': x, 'acc': trainAcc}) if self.useVal: validateError = [ validateModel(i) for i in xrange(validateBatches) ] validateAcc = 1 - np.mean(validateError) self.validateAccs.append({'x': x, 'acc': validateAcc}) print 'Epoch=%i,TrainAcc=%f%%,ValidateAcc=%f%%' % ( epoch, trainAcc * 100., validateAcc * 100.) else: print 'Epoch=%i,TrainAcc=%f%%' % (epoch, trainAcc * 100.) print 'costValue=%f, learningRate=%f' % (trainCost, self.learningRate) testPredict, testError = testModel(testX, testY) assert len(testY) == len(testPredict) testMatrix = np.zeros(shape=(self.categories, self.categories), dtype='int32') for case in xrange(len(testY)): testMatrix[testY[case], testPredict[case]] += 1 testAcc = 1 - testError self.testAccs.append({'x': x, 'acc': testAcc}) print 'TestAcc=%f%%' % (testAcc * 100.) 
if self.useVal and validateAcc > bestValAcc: bestValAcc = validateAcc bestTestAcc = max(bestTestAcc, testAcc) finalAcc = testAcc localOpt = 0 maxEpoch = max(maxEpoch, epoch * 1.5) self.result = { 'minError': 1 - bestTestAcc, 'finalAcc': finalAcc, 'bestValAcc': bestValAcc } elif not self.useVal: bestTestAcc = max(bestTestAcc, testAcc) finalAcc = testAcc self.result = { 'minError': 1 - bestTestAcc, 'finalAcc': finalAcc, 'bestValAcc': bestValAcc } print 'BestValAcc=%f%%,BestTestAcc=%f%%,FinalAcc=%f%%' % ( bestValAcc * 100., bestTestAcc * 100., finalAcc * 100.) testPredictInfo = { 'testPredict': testPredict, 'predictMatrix': testMatrix } return testPredictInfo, finalAcc
def __init__(self, params, num_lables, num_features): self.textfile = open(params.outfile, 'w') hidden1 = params.hidden1 hidden2 = params.hidden2 hidden1_a = params.hidden1_a hidden2_a = params.hidden2_a eta = params.eta L2 = params.L2 C1 = params.C1 ## for the local energy function l_in = lasagne.layers.InputLayer((None, num_features)) l_y1 = lasagne.layers.DenseLayer(l_in, hidden1) l_y2 = lasagne.layers.DenseLayer(l_y1, hidden2) l_local = lasagne.layers.DenseLayer( l_y2, num_lables, b=None, nonlinearity=lasagne.nonlinearities.linear) g1 = T.fmatrix() y1 = T.fmatrix() c_params0 = lasagne.layers.get_all_params(l_y2, trainable=True) c_params1 = lasagne.layers.get_all_params(l_local, trainable=True) f = open(params.FeatureNet, 'rb') para = pickle.load(f) f.close() for idx, p in enumerate(c_params1): if idx < (len(c_params1) - 1): p.set_value(para[idx]) else: p.set_value(-para[idx]) local_cost = lasagne.layers.get_output(l_local, {l_in: g1}) local_cost = T.sum(local_cost * y1, axis=1) ## for the global energy function l_in1 = lasagne.layers.InputLayer((None, num_lables)) l_label1 = lasagne.layers.DenseLayer( l_in1, C1, nonlinearity=lasagne.nonlinearities.softplus) l_label2 = lasagne.layers.DenseLayer( l_label1, 1, b=None, nonlinearity=lasagne.nonlinearities.linear) global_cost = lasagne.layers.get_output(l_label2, {l_in1: y1}) global_cost = T.sum(global_cost, axis=1) d_params = lasagne.layers.get_all_params(l_label2) d_params.append(l_local.W) self.d_params = d_params energy_cost = local_cost + global_cost self.cost_function = theano.function([g1, y1], energy_cost) """ for the inference network """ g2 = T.fmatrix() l_in_a = lasagne.layers.InputLayer((None, num_features)) l_y1_a = lasagne.layers.DenseLayer(l_in_a, hidden1_a) l_y2_a = lasagne.layers.DenseLayer(l_y1_a, hidden2_a) l_local_a = lasagne.layers.DenseLayer( l_y2_a, num_lables, b=None, nonlinearity=lasagne.nonlinearities.sigmoid) a_params = lasagne.layers.get_all_params(l_local_a, trainable=True) self.a_params = a_params f = open(params.infNet, 'rb') PARA = pickle.load(f) f.close() for idx, p in enumerate(a_params): p.set_value(PARA[idx]) train_y = lasagne.layers.get_output(l_local_a, {l_in_a: g2}) self.a_function = theano.function([g2], train_y) g = T.fmatrix() y = T.fmatrix() predy = lasagne.layers.get_output(l_local_a, {l_in_a: g}) local_cost = lasagne.layers.get_output(l_local, {l_in: g}) pos_local_cost = T.sum(local_cost * y, axis=1) neg_local_cost = T.sum(local_cost * predy, axis=1) pos_global_cost = lasagne.layers.get_output(l_label2, {l_in1: y}) neg_global_cost = lasagne.layers.get_output(l_label2, {l_in1: predy}) yy = T.cast(y, 'int32') delta0 = T.sum((y - predy)**2, axis=1) margin_type = params.margin_type if (margin_type == 0): hinge_cost = delta0 - (neg_local_cost + T.sum( neg_global_cost, axis=1)) + (pos_local_cost + T.sum(pos_global_cost, axis=1)) elif (margin_type == 1): hinge_cost = 1 - (neg_local_cost + T.sum( neg_global_cost, axis=1)) + (pos_local_cost + T.sum(pos_global_cost, axis=1)) elif (margin_type == 2): hinge_cost = -(neg_local_cost + T.sum(neg_global_cost, axis=1)) + ( pos_local_cost + T.sum(pos_global_cost, axis=1)) elif (margin_type == 3): hinge_cost = delta0 * ( 1 - (neg_local_cost + T.sum(neg_global_cost, axis=1)) + (pos_local_cost + T.sum(pos_global_cost, axis=1))) hinge_cost = hinge_cost * T.gt(hinge_cost, 0) d_cost = T.mean(hinge_cost) d_cost0 = d_cost margin_pred_y_loss = -T.mean(predy * T.log(predy) + (1 - predy) * T.log(1 - predy)) g_cost = -d_cost + L2 * sum( lasagne.regularization.l2(x) for x in 
a_params) + params.regu_pretrain * sum( lasagne.regularization.l2(x - PARA[index]) for index, x in enumerate(a_params)) + margin_pred_y_loss d_cost = d_cost + L2 * sum( lasagne.regularization.l2(x) for x in d_params) self.a_params = a_params updates_g = lasagne.updates.adam(g_cost, a_params, eta) self.train_g = theano.function([g, y], [g_cost, d_cost0, margin_pred_y_loss], updates=updates_g) updates_d = lasagne.updates.adam(d_cost, d_params, eta) self.train_d = theano.function([g, y], [d_cost, d_cost0], updates=updates_d) t0 = T.fscalar() g0 = T.fmatrix() y00 = T.imatrix() local_cost0 = lasagne.layers.get_output(l_local, {l_in: g0}) predy0 = lasagne.layers.get_output(l_local_a, {l_in_a: g0}, deterministic=True) pred_test = T.gt(predy0, t0) neg_local_cost0 = T.sum(local_cost0 * predy0, axis=1) neg_global_cost0 = lasagne.layers.get_output(l_label2, {l_in1: predy0}) energy_cost20 = T.mean(neg_local_cost0 + T.sum(neg_global_cost0, axis=1)) energy_cost2 = energy_cost20 - T.mean(predy0 * T.log(predy0) + (1 - predy0) * T.log(1 - predy0)) ############# ## optimizer for final returning of the inference network updates_test = lasagne.updates.adam(energy_cost2, a_params, 0.00001) y0 = T.imatrix() neg_local_cost_test = T.sum(local_cost0 * pred_test, axis=1) neg_global_cost_test = lasagne.layers.get_output( l_label2, {l_in1: pred_test}) energy_cost_test = T.mean(neg_local_cost_test + T.sum(neg_global_cost_test, axis=1)) pg = T.eq(pred_test, y0) prec = 1.0 * (T.sum(pg * y0, axis=1) + eps) / (T.sum(pred_test, axis=1) + eps) recall = 1.0 * (T.sum(pg * y0, axis=1) + eps) / (T.sum(y0, axis=1) + eps) f1 = 2 * prec * recall / (prec + recall) f1 = T.mean(f1) prec = T.mean(prec) recall = T.mean(recall) self.test = theano.function([g0], [energy_cost20, energy_cost2], updates=updates_test) self.test_a = theano.function([g0, y0, t0], [ energy_cost20, prec, recall, f1, T.sum(pred_test, axis=1), energy_cost_test ]) self.test_time = theano.function([g0], predy0)
import numpy as np
import theano
import theano.tensor as T

x = T.fscalar('x')
y = T.fscalar('y')
f = x**10 + y
f_fn = theano.function([x, y], f)
print "f(x, y) =", theano.printing.pprint(f)
print "f(2, 3) = ", f_fn(2, 3)
print "=" * 30

X = T.fmatrix('X')
y = T.fvector('y')
f = X.dot(y)
f_fn = theano.function([X, y], f)
X0 = np.array(range(90), dtype=np.float32).reshape(9, 10)
y0 = np.array(range(10), dtype=np.float32)
print "f(X, y) =", theano.printing.pprint(f)
print "X ="
print X0
print "y ="
print y0
print "Xy ="
print f_fn(X0, y0)
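# A small follow-on sketch (not part of the original): T.grad gives the
# symbolic derivative of a scalar expression; here d(a**10)/da = 10 * a**9.
a = T.fscalar('a')
g = T.grad(a ** 10, a)
g_fn = theano.function([a], g)
print "dg/da(2) =", g_fn(2)  # expected 5120.0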
def PSD_conv_linear_combination(): Decoder_size =(1, 64, 11,11) Encoder_size = ( 64,1,11,11) phi = T.tanh Data_type = 1 if Data_type == 1: print "... Loading Cat_vs_Dog data" size =70 m = size n = size Input_size = [size,size] train, valid, test = load_gray() x_test =test[0].astype('float32') y_test = test[1].astype('int32') x_valid = valid[0].astype('float32') y_valid = valid[1].astype('int32') x_train = train[0].astype('float32') y_train = train[1].astype('int32') meanstd_train = x_train.std() mean = x_train.mean(1).reshape(( x_train.shape[0],1)) var = x_train.std(1).reshape(( x_train.shape[0],1))+ 0.1 * meanstd_train mean2 = x_test.mean(1).reshape(( x_test.shape[0],1)) meanstd_test = x_test.std() var2 = x_test.std(1).reshape(( x_test.shape[0],1)) + 0.1 * meanstd_test x_train -= mean x_train /= var x_test -= mean2 x_test /= var2 train_set_x= theano.shared(np.array(x_train.reshape((x_train.shape[0],1,size,size)), dtype='float32')) train_set_y= theano.shared(np.array(y_train, dtype='int32')) test_set_y= theano.shared(np.array(y_test, dtype='int32')) test_set_x= theano.shared(np.array(x_test.reshape((x_test.shape[0],1,size,size)), dtype='float32')) train_set_x = T.reshape(lecun_lcn(train_set_x, kernel_size=7, threshold = 1e-4, use_divisor=False), ((x_train.shape[0],size*size))) test_set_x = T.reshape(lecun_lcn( test_set_x, kernel_size=7, threshold = 1e-4, use_divisor=False), ((x_test.shape[0],size*size))) train_set_x /= T.reshape( T.max(train_set_x,axis=1)*1.0,(x_train.shape[0],1)) test_set_x /=T.reshape( T.max(test_set_x , axis=1)*1.0,(x_test.shape[0],1)) dispims(np.array(f()).reshape((x_train.shape[0], size*size))[:100].transpose(), size,size, 0, layout=(10,10), name='data.png' ) batch_size = 200 Lambda = 10.0**(-3) w = Input_size[0] s = Decoder_size[2] h = Input_size[1] M = Decoder_size[1] Sparse_Matrix_size = ( batch_size, M, w+s-1,h+s-1) rng = np.random.RandomState() index = T.lscalar() x = T.cast(T.fmatrix('x'), 'float32') y = T.ivector('y') I = T.cast(x.reshape((batch_size, Input_size[0],Input_size[1])), 'float32') I = I.dimshuffle(0,'x',1, 2) fan_in = np.prod(Decoder_size[1:]) fan_out = (Decoder_size[0] * np.prod(Decoder_size[2:]) )/(np.prod((2.,2.))) D_bound = 0.01 Decoder_Matrix =theano.shared( np.asarray( rng.uniform(low=-D_bound, high=D_bound, size = Decoder_size), dtype='float32'), borrow=True) En_bound = 0.01 Encoder_Matrix =theano.shared( np.asarray( rng.uniform(low=-D_bound, high=En_bound, size = Encoder_size), dtype='float32'), borrow=True) E_bound = 0.01 Esparse_Matrix = theano.shared( np.asarray( rng.uniform(low=-E_bound, high=E_bound, size =Sparse_Matrix_size ), dtype='float32' ), borrow=True) P1 =T.reshape( theano.sandbox.cuda.dnn.dnn_conv( Esparse_Matrix, Decoder_Matrix),(batch_size, Input_size[0],Input_size[1])) Sum = T.reshape(T.mean(P1, axis=0),(1, Input_size[0],Input_size[1])) E1 =0.5*T.sum(T.mean(( I-Sum)**2, axis=0)) Encoder_mapp = T.reshape (T.tanh( theano.sandbox.cuda.dnn.dnn_conv( I, Encoder_Matrix,'full' )),Sparse_Matrix_size ) Reconstruction = theano.sandbox.cuda.dnn.dnn_conv( img= Encoder_mapp, kerns= Decoder_Matrix ) get_Reconstruction = theano.function ( [index], Reconstruction, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) P2 = T.sum(T.mean( ( Esparse_Matrix -Encoder_mapp )**2, axis=0) ) cost = E1 + P2 + Lambda*T.sum(T.mean(abs(Esparse_Matrix), axis =0)) test_model = theano.function( [index], cost, givens={ x: test_set_x[index * batch_size: (index + 1) * batch_size] } ) test_traint_model = theano.function( [index], cost, 
givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) Esparse_Matrix_grad = T.grad(cost, [Esparse_Matrix]) Esparse_lr = T.fscalar() Esparse_Update = gradient_updates_momentum(cost, params =[Esparse_Matrix] , learning_rate = Esparse_lr, momentum=0.9) train_Esparse_model = theano.function( [index, Esparse_lr ], cost, updates= Esparse_Update, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) get_Z_grad = theano.function( [index], Esparse_Matrix_grad , givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] }) U_d = [ Decoder_Matrix] U_e = [ Encoder_Matrix] U_grads_d = T.grad( cost, U_d) U_grads_e = T.grad( cost, U_e) L_rate_encoder = T.fscalar() L_rate_decoder = T.fscalar() U_updates = [ (param_i,( param_i - L_rate_decoder * grad_i)/(1E-3 + T.sqrt( T.sum(( param_i - L_rate_decoder * grad_i)**2 , axis=0)))) for param_i, grad_i in zip(U_d, U_grads_d) ] + [ (param_i, param_i - L_rate_encoder * grad_i) for param_i, grad_i in zip(U_e, U_grads_e) ] train_ED_model = theano.function( [index, L_rate_encoder,L_rate_decoder], cost, updates=U_updates, givens={ x: train_set_x[index * batch_size: (index + 1) * batch_size] } ) F = T.reshape( abs(Encoder_mapp), (batch_size, M*(( w+s-1)**2))) meanstd_F = F.std() F -= F.mean(1)[:,None] F /= F.std(1)[False:,None]+ 0.1 * meanstd_F
import theano
import theano.tensor as T

# define some symbolic variables
theano_matrix1 = T.matrix(name='theano_matrix1')
theano_matrix2 = T.matrix(name='theano_matrix2')

# define some functions
# dot product/matrix product
theano_dot = theano.function([theano_matrix1, theano_matrix2],
                             T.dot(theano_matrix1, theano_matrix2),
                             name='theano_dot')

theano_scalar = T.fscalar(name='theano_scalar')
theano_scale = theano.function([theano_matrix1, theano_scalar],
                               theano_matrix1 * theano_scalar,
                               name='scale')

# elementwise product
theano_multiply = theano.function([theano_matrix1, theano_matrix2],
                                  theano_matrix1 * theano_matrix2,
                                  name='theano_multiply')

theano_row_vector = T.row(name='theano_row_vector')
theano_col_vector = T.col(name='theano_col_vector')
theano_subtract_row = theano.function([theano_matrix1, theano_row_vector],
                                      theano_matrix1 - theano_row_vector,
                                      name='theano_subtract_row')
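# A short usage sketch (illustrative values, not from the original): the
# compiled functions take plain numpy arrays; the row vector is broadcast
# against every row of the matrix.
import numpy as np
m = np.arange(6, dtype=theano.config.floatX).reshape(2, 3)
row = np.ones((1, 3), dtype=theano.config.floatX)
print(theano_scale(m, 2.0))         # every entry doubled
print(theano_subtract_row(m, row))  # subtracts 1 from every entry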
def __init__(self, q, p, prior=None, n_batch=100, optimizer=lasagne.updates.adam, optimizer_params={}, clip_grad=None, max_norm_constraint=None, train_iw=False, test_iw=True, iw_alpha=0, seed=1234): super(VAE, self).__init__(n_batch=n_batch, seed=seed) self.q = q self.p = p if prior: self.prior = prior else: self.prior = get_prior(self.q) # set prior distribution mode if self.prior.__class__.__name__ == "MultiPriorDistributions": self.prior.prior = get_prior(self.q.distributions[-1]) self.prior_mode = "MultiPrior" else: self.prior_mode = "Normal" self.train_iw = train_iw self.test_iw = test_iw # set inputs x = self.q.inputs l = T.iscalar("l") k = T.iscalar("k") annealing_beta = T.fscalar("beta") # training if self.train_iw: inputs = x + [l, k] lower_bound, loss, params = self._vr_bound(x, l, k, iw_alpha, False) else: inputs = x + [l, annealing_beta] lower_bound, loss, params = self._elbo(x, l, annealing_beta, False) lower_bound = T.mean(lower_bound, axis=0) updates = self._get_updates(loss, params, optimizer, optimizer_params, clip_grad, max_norm_constraint) self.lower_bound_train = theano.function(inputs=inputs, outputs=lower_bound, updates=updates, on_unused_input='ignore') # test if self.test_iw: inputs = x + [l, k] lower_bound, _, _ = self._vr_bound(x, l, k, 0, True) else: inputs = x + [l] lower_bound, _, _ = self._elbo(x, l, 1, True) lower_bound = T.sum(lower_bound, axis=1) self.lower_bound_test = theano.function(inputs=inputs, outputs=lower_bound, on_unused_input='ignore')
def __init__(self, datasets, nkerns=[32, 48], batch_size=1000, normalized_width=20, distortion=0, cuda_convnet=1, params=[None, None, None, None, None, None, None, None]): """ Demonstrates Ciresan 2012 on MNIST dataset Some minor differences here: --- - Ciresan initializes Conv layers with: "uniform random distribution in the range [−0.05, 0.05]." (Ciresan IJCAI 2011) - Ciresan uses a sigma of 6 - Ciresan uses nkerns=[20, 40] which were increased here to be nkerns=[32, 48] in order to be compatible with cuda_convnet :type learning_rate: float :param learning_rate: learning rate used (factor for the stochastic gradient) :type n_epochs: int :param n_epochs: maximal number of epochs to run the optimizer :type dataset: string :param dataset: path to the dataset used for training /testing (MNIST here) :type nkerns: list of ints :param nkerns: number of kernels on each layer :type params: list of None or Numpy matricies/arrays :param params: W/b weights in the order: layer3W, layer3b, layer2W, layer2b, layer1W, layer1b, layer0W, layer0b """ layer3W, layer3b, layer2W, layer2b, layer1W, layer1b, layer0W, layer0b = params rng = numpy.random.RandomState(23455) # TODO: could make this a theano sym variable to abstract # loaded data from column instantiation train_set_x, train_set_y = datasets[0] valid_set_x, valid_set_y = datasets[1] test_set_x, test_set_y = datasets[2] # TODO: could move this to train method # compute number of minibatches for training, validation and testing self.n_train_batches = train_set_x.get_value(borrow=True).shape[0] self.n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] self.n_test_batches = test_set_x.get_value(borrow=True).shape[0] self.n_train_batches /= batch_size self.n_valid_batches /= batch_size self.n_test_batches /= batch_size # allocate symbolic variables for the data index = T.lscalar() # index to a [mini]batch learning_rate = T.fscalar() # start-snippet-1 x = T.matrix('x') # the data is presented as rasterized images y = T.ivector('y') # the labels are presented as 1D vector of # [int] labels ###################### # BUILD ACTUAL MODEL # ###################### print '... building the column' if distortion: distortion_layer = ElasticLayer(x.reshape((batch_size, 29, 29)), 29, magnitude=ALPHA, sigma=SIGMA) network_input = distortion_layer.output.reshape( (batch_size, 1, 29, 29)) else: network_input = x.reshape((batch_size, 1, 29, 29)) if cuda_convnet: layer0_input = network_input.dimshuffle(1, 2, 3, 0) else: layer0_input = network_input layer0_imageshape = (1, 29, 29, batch_size) if cuda_convnet else (batch_size, 1, 29, 29) layer0_filtershape = (1, 4, 4, nkerns[0]) if cuda_convnet else (nkerns[0], 1, 4, 4) layer0 = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=layer0_imageshape, filter_shape=layer0_filtershape, poolsize=(2, 2), cuda_convnet=cuda_convnet, W=layer0W, b=layer0b) layer1_imageshape = (nkerns[0], 13, 13, batch_size) if cuda_convnet else (batch_size, nkerns[0], 13, 13) layer1_filtershape = (nkerns[0], 5, 5, nkerns[1]) if cuda_convnet else (nkerns[1], nkerns[0], 5, 5) layer1 = LeNetConvPoolLayer(rng, input=layer0.output, image_shape=layer1_imageshape, filter_shape=layer1_filtershape, poolsize=(3, 3), cuda_convnet=cuda_convnet, W=layer1W, b=layer1b) # the HiddenLayer being fully-connected, it operates on 2D matrices of # shape (batch_size, num_pixels) (i.e matrix of rasterized images). # This will generate a matrix of shape (batch_size, nkerns[1] * 4 * 4), # or (500, 50 * 4 * 4) = (500, 800) with the default values. 
if cuda_convnet: layer2_input = layer1.output.dimshuffle(3, 0, 1, 2).flatten(2) else: layer2_input = layer1.output.flatten(2) layer2 = HiddenLayer(rng, input=layer2_input, n_in=nkerns[1] * 3 * 3, n_out=150, W=layer2W, b=layer2b, activation=T.tanh) # classify the values of the fully-connected sigmoidal layer layer3 = LogisticRegression(input=layer2.output, n_in=150, n_out=10, W=layer3W, b=layer3b) # the cost we minimize during training is the NLL of the model cost = layer3.negative_log_likelihood(y) # create a function to compute the mistakes that are made by the model self.test_model = theano.function( [index], layer3.errors(y), givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size], y: test_set_y[index * batch_size:(index + 1) * batch_size] }) # create a function to compute probabilities of all output classes self.test_output_batch = theano.function( [index], layer3.p_y_given_x, givens={ x: test_set_x[index * batch_size:(index + 1) * batch_size] }) self.validate_model = theano.function( [index], layer3.errors(y), givens={ x: valid_set_x[index * batch_size:(index + 1) * batch_size], y: valid_set_y[index * batch_size:(index + 1) * batch_size] }) # create a list of all model parameters to be fit by gradient descent self.params = layer3.params + layer2.params + layer1.params + layer0.params self.column_params = [ nkerns, batch_size, normalized_width, distortion, cuda_convnet ] # create a list of gradients for all model parameters grads = T.grad(cost, self.params) # train_model is a function that updates the model parameters by # SGD Since this model has many parameters, it would be tedious to # manually create an update rule for each model parameter. We thus # create the updates list by automatically looping over all # (params[i], grads[i]) pairs. updates = [(param_i, param_i - (learning_rate) * grad_i) for param_i, grad_i in zip(self.params, grads)] # Suggested by Alex Krizhevsky, found on: # http://yyue.blogspot.com/2015/01/a-brief-overview-of-deep-learning.html optimal_ratio = 0.001 # should show what multiple current learning rate is of optimal learning rate grads_L1 = sum([abs(grad).sum() for grad in grads]) params_L1 = sum([abs(param).sum() for param in self.params]) update_ratio = (learning_rate / (optimal_ratio)) * (grads_L1 / params_L1) self.train_model = theano.function( [index, learning_rate], [cost, update_ratio], updates=updates, givens={ x: train_set_x[index * batch_size:(index + 1) * batch_size], y: train_set_y[index * batch_size:(index + 1) * batch_size] })
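# Minimal self-contained sketch (illustrative data, not the author's) of the
# index-based `givens` minibatch pattern used by train_model / test_model
# above: the symbolic input is replaced by a slice of a shared dataset, so
# only the batch index crosses the Python/Theano boundary.
import numpy as np
import theano
import theano.tensor as T

data = theano.shared(np.arange(20, dtype=theano.config.floatX).reshape(10, 2))
index = T.lscalar('index')
batch_size = 2
xb = T.matrix('xb')
batch_mean = theano.function(
    [index], xb.mean(),
    givens={xb: data[index * batch_size:(index + 1) * batch_size]})
print(batch_mean(0))  # mean of the first two rows -> 1.5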
def find_Y(X_shared, Y_shared, sigma_shared, N, output_dims, n_epochs, initial_lr, final_lr, lr_switch, init_stdev, initial_momentum, final_momentum, momentum_switch, initial_l_kl, final_l_kl, l_kl_switch, initial_l_e, final_l_e, l_e_switch, initial_l_c, final_l_c, l_c_switch, initial_l_r, final_l_r, l_r_switch, r_eps, Adj_shared, g=None, save_every=None, output_folder=None, verbose=0): # Optimization hyperparameters initial_lr = np.array(initial_lr, dtype=floath) final_lr = np.array(final_lr, dtype=floath) initial_momentum = np.array(initial_momentum, dtype=floath) final_momentum = np.array(final_momentum, dtype=floath) # Hyperparameters used within Theano lr = T.fscalar('lr') lr_shared = theano.shared(initial_lr) momentum = T.fscalar('momentum') momentum_shared = theano.shared(initial_momentum) # Cost parameters initial_l_kl = np.array(initial_l_kl, dtype=floath) final_l_kl = np.array(final_l_kl, dtype=floath) initial_l_e = np.array(initial_l_e, dtype=floath) final_l_e = np.array(final_l_e, dtype=floath) initial_l_c = np.array(initial_l_c, dtype=floath) final_l_c = np.array(final_l_c, dtype=floath) initial_l_r = np.array(initial_l_r, dtype=floath) final_l_r = np.array(final_l_r, dtype=floath) # Cost parameters used within Theano l_kl = T.fscalar('l_kl') l_kl_shared = theano.shared(initial_l_kl) l_e = T.fscalar('l_e') l_e_shared = theano.shared(initial_l_e) l_c = T.fscalar('l_c') l_c_shared = theano.shared(initial_l_c) l_r = T.fscalar('l_r') l_r_shared = theano.shared(initial_l_r) # High-dimensional observations (connectivities of vertices) X = T.fmatrix('X') # 2D projection (coordinates of vertices) Y = T.fmatrix('Y') # Adjacency matrix Adj = T.fmatrix('Adj') # Standard deviations used for Gaussians to attain perplexity sigma = T.fvector('sigma') # Y velocities (for momentum-based descent) Yv = T.fmatrix('Yv') Yv_shared = theano.shared(np.zeros((N, output_dims), dtype=floath)) # Function for retrieving cost for all individual data points costs = cost_var(X, Y, sigma, Adj, l_kl, l_e, l_c, l_r, r_eps) # Sum of all costs (scalar) cost = T.sum(costs) # Gradient of the cost w.r.t. Y grad_Y = T.grad(cost, Y) # Update step for velocity update_Yv = theano.function( [], None, givens={ X: X_shared, sigma: sigma_shared, Y: Y_shared, Yv: Yv_shared, Adj: Adj_shared, lr: lr_shared, momentum: momentum_shared, l_kl: l_kl_shared, l_e: l_e_shared, l_c: l_c_shared, l_r: l_r_shared }, updates=[(Yv_shared, momentum * Yv - lr * grad_Y)]) # Gradient descent step update_Y = theano.function([], [], givens={ Y: Y_shared, Yv: Yv_shared }, updates=[(Y_shared, Y + Yv)]) # Build function to retrieve cost get_cost = theano.function( [], cost, givens={ X: X_shared, sigma: sigma_shared, Y: Y_shared, Adj: Adj_shared, l_kl: l_kl_shared, l_e: l_e_shared, l_c: l_c_shared, l_r: l_r_shared }) # Build function to retrieve per-vertex cost get_costs = theano.function( [], costs, givens={ X: X_shared, sigma: sigma_shared, Y: Y_shared, Adj: Adj_shared, l_kl: l_kl_shared, l_e: l_e_shared, l_c: l_c_shared, l_r: l_r_shared }) # Optimization loop for epoch in range(n_epochs): # Switch parameter if a switching point is reached. 
if epoch == lr_switch: lr_shared.set_value(final_lr) if epoch == momentum_switch: momentum_shared.set_value(final_momentum) if epoch == l_kl_switch: l_kl_shared.set_value(final_l_kl) if epoch == l_e_switch: l_e_shared.set_value(final_l_e) if epoch == l_c_switch: l_c_shared.set_value(final_l_c) if epoch == l_r_switch: l_r_shared.set_value(final_l_r) if final_l_r != 0: # Give a nudge to co-located vertices in the epoch before the # repulsion kicks in (otherwise they don't feel any). Y_shared.set_value(switch_shake(Y_shared.get_value())) # Do update step for velocity update_Yv() # Do a gradient descent step update_Y() c = get_cost() if np.isnan(float(c)): raise NaNException('Encountered NaN for cost.') if verbose: print('[tsne] Epoch: {0}. Cost: {1:.6f}.'.format( epoch + 1, float(c)), end='\r') if output_folder is not None and g is not None and save_every is not None and epoch % save_every == 0: # Get per-vertex cost for colour-coding cs = get_costs() # Save a snapshot save_drawing(output_folder, g, Y_shared.get_value().T, 'tsne_snap_' + str(epoch).zfill(5), formats=['jpg'], verbose=False, edge_colors="rgb", draw_vertices=False, opacity=0.3) # Get per-vertex cost cs = get_costs() if verbose: print('\n[tsne] Done! ') return np.array(Y_shared.get_value()), cs
def run_conv_nnet2(use_gpu): # pretend we are training LeNet for MNIST if use_gpu: shared_fn = tcn.shared_constructor else: shared_fn = shared # cumulativ rounding error affect this comparaison of result. So we lower the tolerance. # TODO: why the last two example see the error lower? We are converging? # n_train=10, n_batch=3, n_kern=1, n_kern1=1, error see of 1e-9 # n_train=10, n_batch=3, n_kern=10, n_kern1=1, error see of -1.27777e-06 # n_train=10, n_batch=3, n_kern=10, n_kern1=10, error see of -6.91377e-05 # n_train=10, n_batch=30, n_kern=10, n_kern1=10, error see of -0.00185963 # n_train=10, n_batch=60, n_kern=10, n_kern1=10, error see of -5.26905e-05 # n_train=30, n_batch=60, n_kern=10, n_kern1=10, error see of -3.8147e-06 # n_train=30, n_batch=60, n_kern=20, n_kern1=10, error see of 6.82771e-05 # n_train=30, n_batch=60, n_kern=20, n_kern1=30, error see of 0.000231534 n_batch = 60 shape_img = (n_batch, 1, 32, 32) n_kern = 20 shape_kern = (n_kern, 1, 5, 5) n_kern1 = 10 shape_kern1 = (n_kern1, n_kern, 5, 5) n_train = 30 if config.mode == 'DEBUG_MODE': n_train = 1 logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d( tuple(shape_img[2:]), tuple(shape_kern[2:]), 'valid') logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d( (logical_hid_shape[0] // 2, logical_hid_shape[1] // 2), tuple(shape_kern1[2:]), 'valid') n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1] n_out = 10 w0 = shared_fn(0.01 * (my_rand(*shape_kern) - 0.5), 'w0') b0 = shared_fn(my_zeros((n_kern, )), 'b0') w1 = shared_fn(0.01 * (my_rand(*shape_kern1) - 0.5), 'w1') b1 = shared_fn(my_zeros((n_kern1, )), 'b1') v = shared_fn(my_zeros((n_hid, n_out)), 'c') c = shared_fn(my_zeros(n_out), 'c') x = tensor.Tensor(dtype='float32', broadcastable=(0, 1, 0, 0))('x') y = tensor.fmatrix('y') lr = tensor.fscalar('lr') conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1) conv_op1 = conv.ConvOp( (n_kern, logical_hid_shape[0] // 2, logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1) hid = tensor.tanh(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x'))) hid1 = tensor.tanh( conv_op1(hid[:, :, ::2, ::2], w1) + b1.dimshuffle((0, 'x', 'x'))) hid_flat = hid1.reshape((n_batch, n_hid)) out = tensor.tanh(tensor.dot(hid_flat, v) + c) loss = tensor.sum(0.5 * (out - y)**2 * lr) # print 'loss type', loss.type params = [w0, b0, w1, b1, v, c] gparams = tensor.grad(loss, params) mode = get_mode(use_gpu) # print 'building pfunc ...' train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, g in zip(params, gparams)]) # for i, n in enumerate(train.maker.fgraph.toposort()): # print i, n xval = my_rand(*shape_img) yval = my_rand(n_batch, n_out) # int32 make all 0... lr = theano._asarray(0.01, dtype='float32') for i in xrange(n_train): rval = train(xval, yval, lr) print_mode(mode) return rval
def __init__( self, Nlayers=1, # number of layers Ndirs=1, # unidirectional or bidirectional Nx=100, # input size Nh=100, # hidden layer size Ny=100, # output size Ah="relu", # hidden unit activation (e.g. relu, tanh, lstm) Ay="linear", # output unit activation (e.g. linear, sigmoid, softmax) predictPer="frame", # frame or sequence loss=None, # loss function (e.g. mse, ce, ce_group, hinge, squared_hinge) L1reg=0.0, # L1 regularization L2reg=0.0, # L2 regularization momentum=0.0, # SGD momentum seed=15213, # random seed for initializing the weights frontEnd=None, # a lambda function for transforming the input filename=None, # initialize from file initParams=None, # initialize from given dict ): if filename is not None: # load parameters from file with smart_open(filename, "rb") as f: initParams = dill.load(f) if initParams is not None: # load parameters from given dict self.paramNames = [] self.params = [] for k, v in initParams.iteritems(): if type(v) is numpy.ndarray: self.addParam(k, v) else: setattr(self, k, v) self.paramNames.append(k) # F*ck, locals()[k] = v doesn't work; I have to do this statically Nlayers, Ndirs, Nx, Nh, Ny, Ah, Ay, predictPer, loss, L1reg, L2reg, momentum, frontEnd \ = self.Nlayers, self.Ndirs, self.Nx, self.Nh, self.Ny, self.Ah, self.Ay, self.predictPer, self.loss, self.L1reg, self.L2reg, self.momentum, self.frontEnd else: # Initialize parameters randomly # Names of parameters to save to file self.paramNames = [ "Nlayers", "Ndirs", "Nx", "Nh", "Ny", "Ah", "Ay", "predictPer", "loss", "L1reg", "L2reg", "momentum", "frontEnd" ] for name in self.paramNames: value = locals()[name] setattr(self, name, value) # Values of parameters for building the computational graph self.params = [] # Initialize random number generators global rng rng = numpy.random.RandomState(seed) # Construct parameter matrices Nlstm = 4 if Ah == 'lstm' else 1 self.addParam("Win", rand_init((Nx, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wrec", rand_init((Nlayers, Ndirs, Nh, Nh * Nlstm), Ah)) self.addParam( "Wup", rand_init((Nlayers - 1, Nh * Ndirs, Nh * Ndirs * Nlstm), Ah)) self.addParam("Wout", rand_init((Nh * Ndirs, Ny), Ay)) if Ah != "lstm": self.addParam("Bhid", zeros((Nlayers, Nh * Ndirs))) else: self.addParam( "Bhid", numpy.tile( numpy.hstack([ full((Nlayers, Nh), 1.0), zeros((Nlayers, Nh * 3)) ]), (1, Ndirs))) self.addParam("Bout", zeros(Ny)) self.addParam("h0", zeros((Nlayers, Ndirs, Nh))) if Ah == "lstm": self.addParam("c0", zeros((Nlayers, Ndirs, Nh))) # Compute total number of parameters self.nParams = sum(x.get_value().size for x in self.params) # Initialize gradient tensors when using momentum if momentum > 0: self.dparams = [ theano.shared(zeros(x.get_value().shape)) for x in self.params ] # Build computation graph input = T.ftensor3() mask = T.imatrix() mask_int = [(mask % 2).nonzero(), (mask >= 2).nonzero()] mask_float = [ T.cast((mask % 2).dimshuffle((1, 0)).reshape( (mask.shape[1], mask.shape[0], 1)), theano.config.floatX), T.cast((mask >= 2).dimshuffle((1, 0)).reshape( (mask.shape[1], mask.shape[0], 1)), theano.config.floatX) ] # mask_int = [(mask & 1).nonzero(), (mask & 2).nonzero()] # mask_float = [T.cast((mask & 1).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX), # T.cast(((mask & 2) / 2).dimshuffle((1, 0)).reshape((mask.shape[1], mask.shape[0], 1)), theano.config.floatX)] def step_rnn(x_t, mask, h_tm1, W, h0): h_tm1 = T.switch(mask, h0, h_tm1) return [ACTIVATION[Ah](x_t + h_tm1.dot(W))] def step_lstm(x_t, mask, c_tm1, h_tm1, W, c0, h0): 
c_tm1 = T.switch(mask, c0, c_tm1) h_tm1 = T.switch(mask, h0, h_tm1) a = x_t + h_tm1.dot(W) f_t = T.nnet.sigmoid(a[:, :Nh]) i_t = T.nnet.sigmoid(a[:, Nh:Nh * 2]) o_t = T.nnet.sigmoid(a[:, Nh * 2:Nh * 3]) c_t = T.tanh(a[:, Nh * 3:]) * i_t + c_tm1 * f_t h_t = T.tanh(c_t) * o_t return [c_t, h_t] x = input if frontEnd is None else frontEnd(input) for i in range(Nlayers): h = (x.dimshuffle((1, 0, 2)).dot(self.Win) if i == 0 else h.dot(self.Wup[i - 1])) + self.Bhid[i] rep = lambda x: T.extra_ops.repeat( x.reshape((1, -1)), h.shape[1], axis=0) if Ah != "lstm": h = T.concatenate([ theano.scan( fn=step_rnn, sequences=[ h[:, :, Nh * d:Nh * (d + 1)], mask_float[d] ], outputs_info=[rep(self.h0[i, d])], non_sequences=[self.Wrec[i, d], rep(self.h0[i, d])], go_backwards=(d == 1), )[0][::(1 if d == 0 else -1)] for d in range(Ndirs) ], axis=2) else: h = T.concatenate([ theano.scan( fn=step_lstm, sequences=[ h[:, :, Nh * 4 * d:Nh * 4 * (d + 1)], mask_float[d] ], outputs_info=[rep(self.c0[i, d]), rep(self.h0[i, d])], non_sequences=[ self.Wrec[i, d], rep(self.c0[i, d]), rep(self.h0[i, d]) ], go_backwards=(d == 1), )[0][1][::(1 if d == 0 else -1)] for d in range(Ndirs) ], axis=2) h = h.dimshuffle((1, 0, 2)) if predictPer == "sequence": h = T.concatenate([ h[mask_int[1 - d]][:, Nh * d:Nh * (d + 1)] for d in range(Ndirs) ], axis=1) output = ACTIVATION[Ay](h.dot(self.Wout) + self.Bout) # Compute loss function if loss is None: loss = { "linear": "mse", "sigmoid": "ce", "softmax": "ce_group" }[self.Ay] if loss == "ctc": label = T.imatrix() cost = ctc_cost(output, mask, label) else: if predictPer == "sequence": label = T.fmatrix() y = output t = label elif predictPer == "frame": label = T.ftensor3() indices = (mask >= 0).nonzero() y = output[indices] t = label[indices] cost = T.mean({ "ce": -T.mean(T.log(y) * t + T.log(1 - y) * (1 - t), axis=1), "ce_group": -T.log((y * t).sum(axis=1)), "mse": T.mean((y - t)**2, axis=1), "hinge": T.mean(relu(1 - y * (t * 2 - 1)), axis=1), "squared_hinge": T.mean(relu(1 - y * (t * 2 - 1))**2, axis=1), }[loss]) # Add regularization cost += sum(abs(x).sum() for x in self.params) / self.nParams * L1reg cost += sum(T.sqr(x).sum() for x in self.params) / self.nParams * L2reg # Compute updates for network parameters updates = [] lrate = T.fscalar() clip = T.fscalar() grad = T.grad(cost, self.params) grad_clipped = [T.maximum(T.minimum(g, clip), -clip) for g in grad] if momentum > 0: for w, d, g in zip(self.params, self.dparams, grad_clipped): updates.append( (w, w + momentum * momentum * d - (1 + momentum) * lrate * g)) updates.append((d, momentum * d - lrate * g)) else: for w, g in zip(self.params, grad_clipped): updates.append((w, w - lrate * g)) # Create functions to be called from outside self.train = theano.function( inputs=[input, mask, label, lrate, clip], outputs=cost, updates=updates, ) self.predict = theano.function(inputs=[input, mask], outputs=output)
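# Minimal sketch (illustrative) of the element-wise gradient clipping used
# above: T.maximum(T.minimum(g, clip), -clip) clamps every entry of g to the
# interval [-clip, clip] before the parameter update.
import numpy as np
import theano
import theano.tensor as T

g = T.fvector('g')
clip = T.fscalar('clip')
clip_fn = theano.function([g, clip], T.maximum(T.minimum(g, clip), -clip))
print(clip_fn(np.array([-3.0, 0.5, 7.0], dtype='float32'), 1.0))  # [-1.  0.5  1.]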
def get_options(batchsize, nepochs, plotevery, learningrate, normalizegrads, clipgrads, enabledebug, optimizer, yzeromean, yunitvar, datadir, outputdir): global batch_size batch_size = batchsize global epochs epochs = nepochs print("Changing pwd to {}".format(outputdir)) os.chdir(outputdir) mydir = os.path.join(os.getcwd(), datetime.now().strftime('%Y-%m-%d_%H-%M-%S')) os.makedirs(mydir) os.chdir(mydir) app_name = sys.argv[0] global logger logger = get_logger(app_name=app_name, logfolder=mydir) # Load dataset X, Y = load_data(datadir + os.sep + "coulomb.txt", datadir + os.sep + "energies.txt") Y, Y_mean, Y_std, Y_binarized = preprocess_targets(Y, zero_mean=yzeromean, unit_var=yunitvar) [X_train, X_test], [Y_train, Y_test], splits = get_data_splits(X, Y, splits=[90, 10]) [Y_binarized_train, Y_binarized_test] = np.split(Y_binarized, splits)[:-1] np.savez('Y_vals.npz', Y_train=Y_train, Y_test=Y_test, Y_binarized_test=Y_binarized_test, Y_binarized_train=Y_binarized_train, Y_mean=Y_mean, Y_std=Y_std) np.savez('X_vals.npz', X_train=X_train, X_test=X_test) dataDim = X.shape[1:] outputDim = Y.shape[1] datapoints = len(X_train) print("datapoints = %d" % datapoints) # # making the datapoints shared variables # X_train = make_shared(X_train) # X_test = make_shared(X_test) # Y_train = make_shared(Y_train) # Y_test = make_shared(Y_test) # Y_binarized_train = make_shared(Y_binarized_train) # Y_binarized_test = make_shared(Y_binarized_test) # TODO !!!!I am here # print("Train set size {}, Train set (labelled) size {}, Test set size {}," + # "Validation set size {}".format( # train_set[0].size,train_set_labeled[0].size, # test_set[0].size, valid_set[0].size)) eigen_value_count = 20 # Defining the model now. th_coulomb = T.ftensor4() th_energies = T.fmatrix() th_energies_bin = T.fmatrix() th_learningrate = T.fscalar() l_input = InputLayer(shape=(None, 1, 29, 29), input_var=th_coulomb, name="Input") l_input = FlattenLayer(l_input, name="FlattenInput") l_pseudo_bin = DenseLayer(l_input, num_units=2000, nonlinearity=sigmoid, name="PseudoBinarized") l_h1 = [] l_h2 = [] l_realOut = [] l_binOut = [] for branch_num in range(eigen_value_count): l_h1.append( DenseLayer(l_pseudo_bin, num_units=1000, nonlinearity=rectify, name="hidden_1_%d" % branch_num)) l_h2.append( DenseLayer(l_h1[-1], num_units=400, nonlinearity=rectify, name="hidden_2_%d" % branch_num)) l_realOut.append( DenseLayer(l_h2[-1], num_units=1, nonlinearity=linear, name="realOut_%d" % branch_num)) l_binOut.append( DenseLayer(l_h2[-1], num_units=1, nonlinearity=sigmoid, name="binOut")) l_realOut_cat = ConcatLayer(l_realOut, name="real_concat") l_binOut_cat = ConcatLayer(l_binOut, name="bin_concat") l_output = ElemwiseMergeLayer([l_binOut_cat, l_realOut_cat], T.mul, name="final_output") energy_output = get_output(l_output) binary_output = get_output(l_binOut_cat) loss_real = T.mean(abs(energy_output - th_energies)) loss_binary = T.mean(binary_crossentropy(binary_output, th_energies_bin)) loss = loss_real + loss_binary params = get_all_params(l_output) grad = T.grad(loss, params) if normalizegrads is not None: grad = lasagne.updates.total_norm_constraint(grad, max_norm=normalizegrads) if clipgrads is not None: grad = [T.clip(g, -clipgrads, clipgrads) for g in grad] optimization_algo = get_optimizer[optimizer] # updates = optimization_algo(grad, params, learning_rate=learningrate) updates = optimization_algo(grad, params, learning_rate=th_learningrate) train_fn = theano.function( [th_coulomb, th_energies, th_energies_bin, th_learningrate], [loss, 
energy_output], updates=updates, allow_input_downcast=True) get_grad = theano.function([th_coulomb, th_energies, th_energies_bin], grad) # get_updates = theano.function([th_data, th_labl], [updates.values()]) # val_fn = theano.function([th_coulomb, th_energies, th_energies_bin], [loss, energy_output], updates=updates, allow_input_downcast=True) val_fn = theano.function([th_coulomb, th_energies, th_energies_bin], [loss, energy_output], allow_input_downcast=True) datapoints = len(X_train) print("datapoints = %d" % datapoints) with open(os.path.join(mydir, "data.txt"), "w") as f: script = app_name for elem in [ "meta_seed", "dataDim", "batch_size", "epochs", "learningrate", "normalizegrads", "clipgrads", "enabledebug", "optimizer", "plotevery", "script" ]: f.write("{} : {}\n".format(elem, eval(elem))) train_loss_lowest = np.inf test_loss_lowest = np.inf for epoch in range(epochs): batch_start = 0 train_loss = [] if learningrate is None: if epoch < 50: learning_rate = 0.0001 elif epoch < 100: learning_rate = 0.00001 elif epoch < 500: learning_rate = 0.000001 else: learning_rate = 0.0000001 else: learning_rate = learningrate indices = np.random.permutation(datapoints) minibatches = int(datapoints / batch_size) for minibatch in range(minibatches): train_idxs = indices[batch_start:batch_start + batch_size] X_train_batch = X_train[train_idxs, :] Yr_train_batch = Y_train[train_idxs, :] Yb_train_batch = Y_binarized_train[train_idxs, :] train_output = train_fn(X_train_batch, Yr_train_batch, Yb_train_batch, learning_rate) batch_start = batch_start + batch_size train_loss.append(train_output[0]) if enabledebug: # Debugging information batchIdx = epoch * minibatches + minibatch fn = 'params_{:>010d}'.format(batchIdx) # saving params param_values = get_all_param_values(l_output) param_norm = np.linalg.norm( np.hstack([param.flatten() for param in param_values])) gradients = get_grad(X_train_batch, Yr_train_batch, Yb_train_batch) gradient_norm = np.linalg.norm( np.hstack([gradient.flatten() for gradient in gradients])) logger.debug(
"Epoch : {:0>4} minibatch {:0>3} Gradient Norm : {:>0.4}, Param Norm : {:>0.4} GradNorm/ParamNorm : {:>0.4} (Values from Prev. Minibatch) Train loss {}".format(epoch, minibatch, gradient_norm, param_norm, gradient_norm / param_norm, train_loss[-1])) param_names = [ param.__str__() for param in get_all_params(l_output) ] np.savez(fn + '.npz', **dict(zip(param_names, param_values))) np.savez('Y_train_pred_{}.npz'.format(batchIdx), Y_train_pred=train_output[1]) if train_loss[-1] < train_loss_lowest: train_loss_lowest = train_loss[-1] np.savez('Y_train_pred_best.npz', Y_train_pred=train_output[1]) logger.debug( "Found the best training prediction (Y_train_pred_best) at %d epoch %d minibatch" % (epoch, minibatch)) if np.isnan(gradient_norm): pdb.set_trace() if (epoch % plotevery == 0): logger.info("Epoch {} of {}".format(epoch, epochs)) fn = 'params_{:>03d}'.format(epoch) # saving params param_values = get_all_param_values(l_output) param_norm = np.linalg.norm( np.hstack([param.flatten() for param in param_values])) param_names = [ param.__str__() for param in get_all_params(l_output) ] if not enabledebug: np.savez(fn + '.npz', **dict(zip(param_names, param_values))) np.savez('Y_train_pred_{}.npz'.format(epoch), Y_train_pred=train_output[1]) mean_train_loss = np.mean(train_loss) if mean_train_loss < train_loss_lowest: train_loss_lowest = mean_train_loss np.savez('Y_train_pred_best.npz', Y_train_pred=train_output[1]) logger.info( "Found the best training prediction (Y_train_pred_best) at %d epoch" % epoch) gradients = get_grad(X_train_batch, Yr_train_batch, Yb_train_batch) gradient_norm = np.linalg.norm( np.hstack([gradient.flatten() for gradient in gradients])) logger.info( " Gradient Norm : {:>0.4}, Param Norm : {:>0.4} GradNorm/ParamNorm : {:>0.4} " .format(gradient_norm, param_norm, gradient_norm / param_norm)) logger.info(" Train loss {:>0.4}".format(np.mean(train_loss))) test_loss, test_prediction = val_fn(X_test, Y_test, Y_binarized_test) np.savez('Y_test_pred_{}.npz'.format(epoch), Y_test_pred=test_prediction) logger.info(" Test loss {}".format(test_loss)) if test_loss < test_loss_lowest: test_loss_lowest = test_loss np.savez('Y_test_pred_best.npz', Y_test_pred=test_prediction) logger.info( "Found the best test prediction (Y_test_pred_best) at %d epoch" % epoch)
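# Hedged restatement of the epoch-based learning-rate schedule used in the training loop
# above, factored into a small helper; the thresholds mirror the snippet, the function name
# is illustrative only.
def learning_rate_for(epoch, fixed_rate=None):
    """Return the fixed rate if one was given, otherwise decay the rate by epoch."""
    if fixed_rate is not None:
        return fixed_rate
    if epoch < 50:
        return 1e-4
    if epoch < 100:
        return 1e-5
    if epoch < 500:
        return 1e-6
    return 1e-7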
def __init__(self, We_initial, char_embedd_table_initial, params): We = theano.shared(We_initial) We_inf = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden hidden_inf = params.hidden_inf input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') length = T.iscalar() t_t = T.fscalar() Wyy0 = np.random.uniform( -0.02, 0.02, (params.num_labels + 1, params.num_labels)).astype('float32') Wyy = theano.shared(Wyy0) char_input_var = T.itensor3() char_embedd_dim = params.char_embedd_dim char_dic_size = len(params.char_dic) char_embedd_table = theano.shared(char_embedd_table_initial) char_embedd_table_inf = theano.shared(char_embedd_table_initial) l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We) else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters # construct convolution layer cnn_layer = lasagne.layers.Conv1DLayer( layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) _, _, pool_size = cnn_layer.output_shape # construct max pool layer pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1])) # finally, concatenate the two incoming layers together. 
l_emb_word = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2) l_lstm_wordf = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(l_emb_word, hidden, mask_input=l_mask_word, backwards=True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden)) l_local = lasagne.layers.DenseLayer( l_reshape_concat, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.linear) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(Wyy) print len(network_params) f = open( 'ccctag_BiLSTM_CNN_CRF_num_filters_30_dropout_1_LearningRate_0.01_0.0_400_emb_1_tagversoin_2.pickle', 'r') data = pickle.load(f) f.close() for idx, p in enumerate(network_params): p.set_value(data[idx]) l_in_word_a = lasagne.layers.InputLayer((None, None)) l_mask_word_a = lasagne.layers.InputLayer(shape=(None, None)) l_emb_word_a = lasagne.layers.EmbeddingLayer( l_in_word_a, input_size=We_initial.shape[0], output_size=embsize, W=We_inf, name='inf_word_embedding') layer_char_input_a = lasagne.layers.InputLayer( shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char_a = lasagne.layers.reshape(layer_char_input_a, (-1, [2])) layer_char_embedding_a = lasagne.layers.EmbeddingLayer( layer_char_a, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table_inf, name='char_embedding') layer_char_a = lasagne.layers.DimshuffleLayer(layer_char_embedding_a, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters #_, sent_length, _ = incoming2.output_shape # dropout before cnn? if params.dropout: layer_char_a = lasagne.layers.DropoutLayer(layer_char_a, p=0.5) # construct convolution layer cnn_layer_a = lasagne.layers.Conv1DLayer( layer_char_a, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) #_, _, pool_size = cnn_layer.output_shape # construct max pool layer pool_layer_a = lasagne.layers.MaxPool1DLayer(cnn_layer_a, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer_a = lasagne.layers.reshape(pool_layer_a, (-1, length, [1])) # finally, concatenate the two incoming layers together. 
l_emb_word_a = lasagne.layers.concat( [output_cnn_layer_a, l_emb_word_a], axis=2) if params.dropout: l_emb_word_a = lasagne.layers.DropoutLayer(l_emb_word_a, p=0.5) l_lstm_wordf_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a) l_lstm_wordb_a = lasagne.layers.LSTMLayer(l_emb_word_a, hidden_inf, mask_input=l_mask_word_a, backwards=True) l_reshapef_a = lasagne.layers.ReshapeLayer(l_lstm_wordf_a, (-1, hidden_inf)) l_reshapeb_a = lasagne.layers.ReshapeLayer(l_lstm_wordb_a, (-1, hidden_inf)) concat2_a = lasagne.layers.ConcatLayer([l_reshapef_a, l_reshapeb_a]) if params.dropout: concat2_a = lasagne.layers.DropoutLayer(concat2_a, p=0.5) l_local_a = lasagne.layers.DenseLayer( concat2_a, num_units=params.num_labels, nonlinearity=lasagne.nonlinearities.softmax) a_params = lasagne.layers.get_all_params(l_local_a, trainable=True) self.a_params = a_params def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy): """ :param targets_one_step: [batch_size, t] :param prev_label: [batch_size, t] :param tg_energy: [batch_size] :return: """ new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1]) new_ta_energy_t = tg_energy + T.sum( new_ta_energy * targets_one_step, axis=1) tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy) return [targets_one_step, tg_energy_t] local_energy = lasagne.layers.get_output( l_local, { l_in_word: input_var, l_mask_word: mask_var, layer_char_input_a: char_input_var }) local_energy = local_energy.reshape((-1, length, params.num_labels)) local_energy = local_energy * mask_var[:, :, None] ##################### # for the end symbole of a sequence #################### end_term = Wyy[:-1, -1] local_energy = local_energy + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] predy0 = lasagne.layers.get_output( l_local_a, { l_in_word_a: input_var, l_mask_word_a: mask_var, layer_char_input_a: char_input_var }) predy_inf = lasagne.layers.get_output( l_local_a, { l_in_word_a: input_var, l_mask_word_a: mask_var, layer_char_input_a: char_input_var }, deterministic=True) predy_inf = predy_inf.reshape((-1, length, params.num_labels)) predy_in = T.argmax(predy0, axis=1) A = T.extra_ops.to_one_hot(predy_in, params.num_labels) A = A.reshape((-1, length, params.num_labels)) predy = predy0.reshape((-1, length, params.num_labels)) predy = predy * mask_var[:, :, None] targets_shuffled = predy.dimshuffle(1, 0, 2) target_time0 = targets_shuffled[0] masks_shuffled = mask_var.dimshuffle(1, 0) initial_energy0 = T.dot(target_time0, Wyy[-1, :-1]) initials = [target_time0, initial_energy0] [_, target_energies], _ = theano.scan( fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]]) cost11 = target_energies[-1] + T.sum( T.sum(local_energy * predy, axis=2) * mask_var, axis=1) cost = T.mean(-cost11) ### compute the energy for inference step predy_inf = predy_inf * mask_var[:, :, None] targets_inf_shuffled = predy_inf.dimshuffle(1, 0, 2) target_inf_time0 = targets_inf_shuffled[0] initial_inf_energy0 = T.dot(target_inf_time0, Wyy[-1, :-1]) initials_inf = [target_inf_time0, initial_inf_energy0] [_, target_inf_energies], _ = theano.scan( fn=inner_function, outputs_info=initials_inf, sequences=[targets_inf_shuffled[1:], masks_shuffled[1:]]) cost_inf = target_inf_energies[-1] + T.sum( T.sum(local_energy * predy_inf, axis=2) * mask_var, axis=1) #from adam import adam #updates_a = adam(cost, a_params, params.eta) updates_a = lasagne.updates.sgd(cost, a_params, params.eta) updates_a = 
lasagne.updates.apply_momentum(updates_a, a_params, momentum=0.9) self.train_fn = theano.function( [input_var, char_input_var, mask_var, mask_var1, length], cost, updates=updates_a, on_unused_input='ignore') prediction = T.argmax(predy_inf, axis=2) corr = T.eq(prediction, target_var) corr_train = (corr * mask_var).sum(dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) self.eval_fn = theano.function([ input_var, char_input_var, target_var, mask_var, mask_var1, length ], [corr_train, num_tokens, prediction, -cost_inf], on_unused_input='ignore')
def train(): global logfile_path global trainfile global train0file global test1file batch_size = int(256) filter_sizes = [1,2,3] num_filters = 1000 words_num_dim = 50 embedding_size = 300 learning_rate = 0.001 n_epochs = 9050 validation_freq = 50 keep_prob_value = 0.7 margin_size = 0.05 logfile_path = os.path.join(logfile_path, 'CNN-' + GetNowTime() + '-' \ + 'batch_size-' + str(batch_size) + '-' \ + 'num_filters-' + str(num_filters) + '-' \ + 'embedding_size-' + str(embedding_size) + '-' \ + 'n_epochs-' + str(n_epochs) + '-' \ + 'freq-' + str(validation_freq) + '-' \ + '-log.txt') log("New start ...", logfile_path) log(str(time.asctime(time.localtime(time.time()))), logfile_path) log("batch_size = " + str(batch_size), logfile_path) log("filter_sizes = " + str(filter_sizes), logfile_path) log("num_filters = " + str(num_filters), logfile_path) log("embedding_size = " + str(embedding_size), logfile_path) log("learning_rate = " + str(learning_rate), logfile_path) log("n_epochs = " + str(n_epochs), logfile_path) log("margin_size = " + str(margin_size), logfile_path) log("words_num_dim = " + str(words_num_dim), logfile_path) log("validation_freq = " + str(validation_freq), logfile_path) log("keep_prob_value = " + str(keep_prob_value), logfile_path) log("train_1_file = " + str(trainfile.split('/')[-1]), logfile_path) log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path) log("test_file = " + str(test1file.split('/')[-1]), logfile_path) log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path) vocab = build_vocab() #word_embeddings is list, shape = numOfWords*100 word_embeddings = load_word_embeddings(vocab, embedding_size) trainList = load_train_list() testList, qa_raw_testList = load_test_list() train0Dict = load_train0_dict() #train_x1.shape = 256*100 #train_x1, train_x2, train_x3 = load_train_data(trainList, vocab, batch_size, words_num_dim) train_x1, train_x2, train_x3 = load_train_data_from_2files(train0Dict, trainList, vocab, batch_size, words_num_dim) x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3') keep_prob = T.fscalar('keep_prob') model = QACnn( input1=x1, input2=x2, input3=x3, keep_prob=keep_prob, word_embeddings=word_embeddings, batch_size=batch_size, sequence_len=train_x1.shape[1], embedding_size=embedding_size, filter_sizes=filter_sizes, num_filters=num_filters, margin_size=margin_size) dbg_x1 = model.dbg_x1 dbg_outputs_1 = model.dbg_outputs_1 cost, cos12, cos13 = model.cost, model.cos12, model.cos13 params, accuracy = model.params, model.accuracy grads = T.grad(cost, params) updates = [ (param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads) ] p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3') prob = T.fscalar('prob') train_model = theano.function( [p1, p2, p3, prob], [cost, accuracy, dbg_x1, dbg_outputs_1], updates=updates, givens={ x1: p1, x2: p2, x3: p3, keep_prob: prob } ) v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3') validate_model = theano.function( inputs=[v1, v2, v3, prob], outputs=[cos12, cos13], #updates=updates, givens={ x1: v1, x2: v2, x3: v3, keep_prob: prob } ) epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 #train_x1, train_x2, train_x3 = load_train_data(trainList, vocab, batch_size) train_x1, train_x2, train_x3 = load_train_data_from_2files(train0Dict, trainList, vocab, batch_size, words_num_dim) #print train_x3.shape cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(train_x1, train_x2, train_x3, keep_prob_value) log('load 
data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc), logfile_path) if epoch % validation_freq == 0: log('Evaluation ......', logfile_path) validation(validate_model, testList, vocab, batch_size, words_num_dim, qa_raw_testList)
def build_conv_nnet2_classif(use_gpu, isize, ksize, n_batch, downsample_ops=True, verbose=0, version=-1, check_isfinite=True): if use_gpu: shared_fn = tcn.shared_constructor else: shared_fn = shared isize1 = isize isize2 = isize if isinstance(isize, (tuple, )): isize1 = isize[0] isize2 = isize[1] shape_img = (n_batch, 1, isize1, isize2) n_kern = 20 # 6 were used in LeNet5 shape_kern = (n_kern, 1, ksize, ksize) n_kern1 = 30 # 16 were used in LeNet5 shape_kern1 = (n_kern1, n_kern, ksize, ksize) logical_hid_shape = tcn.blas.GpuConv.logical_output_shape_2d( (isize1, isize2), (ksize, ksize), 'valid') logical_hid_shape1 = tcn.blas.GpuConv.logical_output_shape_2d( (logical_hid_shape[0] // 2, logical_hid_shape[1] // 2), (ksize, ksize), 'valid') n_hid = n_kern1 * logical_hid_shape1[0] * logical_hid_shape1[1] n_out = 10 w0 = shared_fn(0.01 * (my_rand(*shape_kern) - 0.5), 'w0') b0 = shared_fn(my_zeros((n_kern, )), 'b0') w1 = shared_fn(0.01 * (my_rand(*shape_kern1) - 0.5), 'w1') b1 = shared_fn(my_zeros((n_kern1, )), 'b1') v = shared_fn(0.01 * my_randn(n_hid, n_out), 'v') c = shared_fn(my_zeros(n_out), 'c') # print 'ALLOCATING ARCH: w0 shape', w0.get_value(borrow=True).shape # print 'ALLOCATING ARCH: w1 shape', w1.get_value(borrow=True).shape # print 'ALLOCATING ARCH: v shape', v.get_value(borrow=True).shape x = tensor.Tensor(dtype='float32', broadcastable=(0, 1, 0, 0))('x') y = tensor.fmatrix('y') lr = tensor.fscalar('lr') conv_op = conv.ConvOp(shape_img[2:], shape_kern[2:], n_kern, n_batch, 1, 1, verbose=verbose, version=version) conv_op1 = conv.ConvOp( (n_kern, logical_hid_shape[0] // 2, logical_hid_shape[1] // 2), shape_kern1[2:], n_kern1, n_batch, 1, 1, verbose=verbose, version=version) ds_op = downsample.DownsampleFactorMax((2, 2), ignore_border=False) if downsample_ops: hid = tensor.tanh(ds_op(conv_op(x, w0) + b0.dimshuffle((0, 'x', 'x')))) else: hid = tensor.tanh((conv_op(x, w0) + b0.dimshuffle( (0, 'x', 'x')))[:, :, ::2, ::2]) hid1 = tensor.tanh(conv_op1(hid, w1) + b1.dimshuffle((0, 'x', 'x'))) hid_flat = hid1.reshape((n_batch, n_hid)) out = tensor.nnet.softmax(tensor.dot(hid_flat, v) + c) loss = tensor.sum( tensor.nnet.crossentropy_categorical_1hot(out, tensor.argmax( y, axis=1)) * lr) # print 'loss type', loss.type params = [w0, b0, w1, b1, v, c] gparams = tensor.grad(loss, params) mode = get_mode(use_gpu, check_isfinite) # print 'building pfunc ...' train = pfunc([x, y, lr], [loss], mode=mode, updates=[(p, p - g) for p, g in zip(params, gparams)]) if verbose: theano.printing.debugprint(train) if use_gpu: # Check that GpuConv is used topo = train.maker.fgraph.toposort() conv_ops = (tcn.blas.GpuConv, tcn.dnn.GpuDnnConv, tcn.dnn.GpuDnnConvGradI, tcn.dnn.GpuDnnConvGradW, tcn.blas.BaseGpuCorrMM) assert len([n for n in topo if isinstance(n.op, conv_ops)]) > 0 shape_target = (n_batch, n_out) return train, params, shape_img, shape_target, mode
def train(): batch_size = int(256) filter_sizes = [2, 3, 5] num_filters = 500 embedding_size = 100 learning_rate = 0.001 n_epochs = 2000000 validation_freq = 1000 keep_prob_value = 0.25 vocab = build_vocab() word_embeddings = load_word_embeddings(vocab, embedding_size) trainList = load_train_list() testList = load_test_list() train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size) x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3') keep_prob = T.fscalar('keep_prob') model = QACnn(input1=x1, input2=x2, input3=x3, keep_prob=keep_prob, word_embeddings=word_embeddings, batch_size=batch_size, sequence_len=train_x1.shape[1], embedding_size=embedding_size, filter_sizes=filter_sizes, num_filters=num_filters) dbg_x1 = model.dbg_x1 dbg_outputs_1 = model.dbg_outputs_1 cost, cos12, cos13 = model.cost, model.cos12, model.cos13 print 'cost' print cost params, accuracy = model.params, model.accuracy grads = T.grad(cost, params) updates = [(param_i, param_i - learning_rate * grad_i) for param_i, grad_i in zip(params, grads)] p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3') prob = T.fscalar('prob') train_model = theano.function([p1, p2, p3, prob], [cost, accuracy, dbg_x1, dbg_outputs_1], updates=updates, givens={ x1: p1, x2: p2, x3: p3, keep_prob: prob }) v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3') validate_model = theano.function( inputs=[v1, v2, v3, prob], outputs=[cos12, cos13], #updates=updates, givens={ x1: v1, x2: v2, x3: v3, keep_prob: prob }) epoch = 0 done_looping = False while (epoch < n_epochs) and (not done_looping): epoch = epoch + 1 train_x1, train_x2, train_x3 = load_data(trainList, vocab, batch_size) #print train_x3.shape cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model( train_x1, train_x2, train_x3, keep_prob_value) print 'load data done ...... epoch:' + str(epoch) + ' cost:' + str( cost_ij) + ', acc:' + str(acc) if epoch % validation_freq == 0: print 'Evaluation ......' validation(validate_model, testList, vocab, batch_size)
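# Minimal, hedged sketch of the `givens` idiom used by train_model/validate_model above:
# the cost graph is defined over x1 and keep_prob, and fresh input variables are substituted
# at compile time. Variable names are illustrative, not taken from QACnn.
import theano
import theano.tensor as T

x1 = T.matrix('x1')
keep_prob = T.fscalar('keep_prob')
score = T.sum(x1) * keep_prob  # stand-in for the model's cost graph

p1 = T.matrix('p1')
prob = T.fscalar('prob')
eval_fn = theano.function(inputs=[p1, prob], outputs=score,
                          givens={x1: p1, keep_prob: prob})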
def __init__(self, We_initial, char_embedd_table_initial, params): self.textfile = open(params.outfile, 'w') We = theano.shared(We_initial) # initial embedding for the InfNet We_inf = theano.shared(We_initial) embsize = We_initial.shape[1] hidden = params.hidden self.en_hidden_size = params.hidden_inf self.num_labels = 17 self.de_hidden_size = params.de_hidden_size char_embedd_dim = params.char_embedd_dim char_dic_size = len(params.char_dic) char_embedd_table = theano.shared(char_embedd_table_initial) char_embedd_table_inf = theano.shared(char_embedd_table_initial) input_var = T.imatrix(name='inputs') target_var = T.imatrix(name='targets') target_var_in = T.imatrix(name='targets') mask_var = T.fmatrix(name='masks') mask_var1 = T.fmatrix(name='masks1') char_input_var = T.itensor3(name='char-inputs') length = T.iscalar() length0 = T.iscalar() t_t = T.fscalar() t_t0 = T.fscalar() use_dropout = T.fscalar() use_dropout0 = T.fscalar() Wyy0 = np.random.uniform( -0.02, 0.02, (self.num_labels + 1, self.num_labels + 1)).astype('float32') Wyy = theano.shared(Wyy0) l_in_word = lasagne.layers.InputLayer((None, None)) l_mask_word = lasagne.layers.InputLayer(shape=(None, None)) if params.emb == 1: l_emb_word = lasagne.layers.EmbeddingLayer( l_in_word, input_size=We_initial.shape[0], output_size=embsize, W=We, name='word_embedding') else: l_emb_word = lasagne_embedding_layer_2(l_in_word, embsize, We) layer_char_input = lasagne.layers.InputLayer(shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char = lasagne.layers.reshape(layer_char_input, (-1, [2])) layer_char_embedding = lasagne.layers.EmbeddingLayer( layer_char, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table, name='char_embedding') layer_char = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # first get some necessary dimensions or parameters conv_window = 3 num_filters = params.num_filters # construct convolution layer cnn_layer = lasagne.layers.Conv1DLayer( layer_char, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn') # infer the pool size for pooling (pool size should go through all time step of cnn) _, _, pool_size = cnn_layer.output_shape # construct max pool layer pool_layer = lasagne.layers.MaxPool1DLayer(cnn_layer, pool_size=pool_size) # reshape the layer to match lstm incoming layer [batch * sent_length, num_filters, 1] --> [batch, sent_length, num_filters] output_cnn_layer = lasagne.layers.reshape(pool_layer, (-1, length, [1])) # finally, concatenate the two incoming layers together. 
incoming = lasagne.layers.concat([output_cnn_layer, l_emb_word], axis=2) l_lstm_wordf = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word) l_lstm_wordb = lasagne.layers.LSTMLayer(incoming, hidden, mask_input=l_mask_word, backwards=True) concat = lasagne.layers.concat([l_lstm_wordf, l_lstm_wordb], axis=2) l_reshape_concat = lasagne.layers.ReshapeLayer(concat, (-1, 2 * hidden)) l_local = lasagne.layers.DenseLayer( l_reshape_concat, num_units=self.num_labels + 1, nonlinearity=lasagne.nonlinearities.linear) network_params = lasagne.layers.get_all_params(l_local, trainable=True) network_params.append(Wyy) print len(network_params) f = open( 'NER_BiLSTM_CNN_CRF_.Batchsize_10_dropout_1_LearningRate_0.005_0.0_50_hidden_200.pickle', 'r') data = pickle.load(f) f.close() for idx, p in enumerate(network_params): p.set_value(data[idx]) self.params = [] self.hos = [] self.Cos = [] self.encoder_lstm_layers = [] self.decoder_lstm_layers = [] self.lstm_layers_num = 1 ei, di, dt = T.imatrices(3) #place holders decoderInputs0, em, em1, dm, tf, di0 = T.fmatrices(6) ci = T.itensor3() #### the last one is for the stary symbole self.de_lookuptable = theano.shared(name="Decoder LookUpTable", value=init_xavier_uniform( self.num_labels + 1, self.de_hidden_size), borrow=True) self.linear = theano.shared( name="Linear", value=init_xavier_uniform( self.de_hidden_size + 2 * self.en_hidden_size, self.num_labels), borrow=True) self.linear_bias = theano.shared( name="Hidden to Bias", value=np.asarray(np.random.randn(self.num_labels, ) * 0., dtype=theano.config.floatX), borrow=True) #self.hidden_decode = theano.shared(name="Hidden to Decode", value= init_xavier_uniform(2*hidden, self.de_hidden_size), borrow = True) #self.hidden_bias = theano.shared( # name="Hidden to Bias", # value=np.asarray(np.random.randn(self.de_hidden_size, )*0., dtype=theano.config.floatX) , # borrow=True # ) input_var_shuffle = input_var.dimshuffle(1, 0) mask_var_shuffle = mask_var.dimshuffle(1, 0) target_var_in_shuffle = target_var_in.dimshuffle(1, 0) target_var_shuffle = target_var.dimshuffle(1, 0) self.params += [ We_inf, self.linear, self.de_lookuptable, self.linear_bias ] ######[batch, sent_length, embsize] state_below = We_inf[input_var_shuffle.flatten()].reshape( (input_var_shuffle.shape[0], input_var_shuffle.shape[1], embsize)) ###### character word embedding layer_char_input_inf = lasagne.layers.InputLayer( shape=(None, None, Max_Char_Length), input_var=char_input_var, name='char-input') layer_char_inf = lasagne.layers.reshape(layer_char_input_inf, (-1, [2])) layer_char_embedding_inf = lasagne.layers.EmbeddingLayer( layer_char_inf, input_size=char_dic_size, output_size=char_embedd_dim, W=char_embedd_table_inf, name='char_embedding_inf') layer_char_inf = lasagne.layers.DimshuffleLayer( layer_char_embedding_inf, pattern=(0, 2, 1)) #layer_char_inf = lasagne.layers.DropoutLayer(layer_char_inf, p=0.5) cnn_layer_inf = lasagne.layers.Conv1DLayer( layer_char_inf, num_filters=num_filters, filter_size=conv_window, pad='full', nonlinearity=lasagne.nonlinearities.tanh, name='cnn_inf') pool_layer_inf = lasagne.layers.MaxPool1DLayer(cnn_layer_inf, pool_size=pool_size) output_cnn_layer_inf = lasagne.layers.reshape(pool_layer_inf, (-1, length, [1])) char_params = lasagne.layers.get_all_params(output_cnn_layer_inf, trainable=True) self.params += char_params ###### [batch, sent_length, num_filters] #char_state_below = lasagne.layers.get_output(output_cnn_layer_inf, {layer_char_input_inf:char_input_var}) char_state_below = 
lasagne.layers.get_output(output_cnn_layer_inf) char_state_below = dropout_layer(char_state_below, use_dropout, trng) char_state_shuff = char_state_below.dimshuffle(1, 0, 2) state_below = T.concatenate([state_below, char_state_shuff], axis=2) state_below = dropout_layer(state_below, use_dropout, trng) enclstm_f = LSTM(embsize + num_filters, self.en_hidden_size) enclstm_b = LSTM(embsize + num_filters, self.en_hidden_size, True) self.encoder_lstm_layers.append(enclstm_f) #append self.encoder_lstm_layers.append(enclstm_b) #append self.params += enclstm_f.params + enclstm_b.params #concatenate hs_f, Cs_f = enclstm_f.forward(state_below, mask_var_shuffle) hs_b, Cs_b = enclstm_b.forward(state_below, mask_var_shuffle) hs = T.concatenate([hs_f, hs_b], axis=2) Cs = T.concatenate([Cs_f, Cs_b], axis=2) hs0 = T.concatenate([hs_f[-1], hs_b[0]], axis=1) Cs0 = T.concatenate([Cs_f[-1], Cs_b[0]], axis=1) #self.hos += T.tanh(tensor.dot(hs0, self.hidden_decode) + self.hidden_bias), #self.Cos += T.tanh(tensor.dot(Cs0, self.hidden_decode) + self.hidden_bias), self.hos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size), self.Cos += T.alloc(np.asarray(0., dtype=theano.config.floatX), input_var_shuffle.shape[1], self.de_hidden_size), Encoder = hs state_below = self.de_lookuptable[ target_var_in_shuffle.flatten()].reshape( (target_var_in_shuffle.shape[0], target_var_in_shuffle.shape[1], self.de_hidden_size)) for i in range(self.lstm_layers_num): declstm = LSTM(self.de_hidden_size, self.de_hidden_size) self.decoder_lstm_layers += declstm, #append self.params += declstm.params #concatenate ho, Co = self.hos[i], self.Cos[i] state_below, Cs = declstm.forward(state_below, mask_var_shuffle, ho, Co) decoder_lstm_outputs = T.concatenate([state_below, Encoder], axis=2) linear_outputs = T.dot(decoder_lstm_outputs, self.linear) + self.linear_bias[None, None, :] softmax_outputs, updates = theano.scan( fn=lambda x: T.nnet.softmax(x), sequences=[linear_outputs], ) def _NLL(pred, y, m): return -m * T.log(pred[T.arange(input_var.shape[0]), y]) """ costs, _ = theano.scan(fn=_NLL, sequences=[softmax_outputs, target_var_shuffle, mask_var_shuffle]) #loss = costs.sum() / mask_var.sum() + params.L2*sum(lasagne.regularization.l2(x) for x in self.params) loss = costs.sum() / mask_var.sum() updates = lasagne.updates.sgd(loss, self.params, self.eta) updates = lasagne.updates.apply_momentum(updates, self.params, momentum=0.9) ################################################### #### using the ground truth when training ################################################## self._train = theano.function( inputs=[ei, em, di, dm, dt], outputs=[loss, softmax_outputs], updates=updates, givens={input_var:ei, mask_var:em, target_var_in:di, decoderMask:dm, target_var:dt} ) """ def _step2(ctx_, state_, hs_, Cs_): hs, Cs = [], [] token_idxs = T.cast(state_.argmax(axis=-1), "int32") msk_ = T.fill((T.zeros_like(token_idxs, dtype="float32")), 1.) 
msk_ = msk_.dimshuffle('x', 0) state_below0 = self.de_lookuptable[token_idxs].reshape( (1, ctx_.shape[0], self.de_hidden_size)) for i, lstm in enumerate(self.decoder_lstm_layers): h, C = lstm.forward(state_below0, msk_, hs_[i], Cs_[i]) #mind msk hs += h[-1], Cs += C[-1], state_below0 = h hs, Cs = T.as_tensor_variable(hs), T.as_tensor_variable(Cs) state_below0 = state_below0.reshape( (ctx_.shape[0], self.de_hidden_size)) state_below0 = T.concatenate([ctx_, state_below0], axis=1) newpred = T.dot(state_below0, self.linear) + self.linear_bias[None, :] state_below = T.nnet.softmax(newpred) ##### the beging symbole probablity is 0 extra_p = T.zeros_like(hs[:, :, 0]) state_below = T.concatenate([state_below, extra_p.T], axis=1) return state_below, hs, Cs hs0, Cs0 = T.as_tensor_variable( self.hos, name="hs0"), T.as_tensor_variable(self.Cos, name="Cs0") train_outputs, _ = theano.scan(fn=_step2, sequences=[Encoder], outputs_info=[decoderInputs0, hs0, Cs0], n_steps=input_var_shuffle.shape[0]) predy = train_outputs[0].dimshuffle(1, 0, 2) predy = predy[:, :, :-1] * mask_var[:, :, None] predy0 = predy.reshape((-1, 17)) def inner_function(targets_one_step, mask_one_step, prev_label, tg_energy): """ :param targets_one_step: [batch_size, t] :param prev_label: [batch_size, t] :param tg_energy: [batch_size] :return: """ new_ta_energy = T.dot(prev_label, Wyy[:-1, :-1]) new_ta_energy_t = tg_energy + T.sum( new_ta_energy * targets_one_step, axis=1) tg_energy_t = T.switch(mask_one_step, new_ta_energy_t, tg_energy) return [targets_one_step, tg_energy_t] local_energy = lasagne.layers.get_output( l_local, { l_in_word: input_var, l_mask_word: mask_var, layer_char_input: char_input_var }) local_energy = local_energy.reshape((-1, length, 17)) local_energy = local_energy * mask_var[:, :, None] ##################### # for the end symbole of a sequence #################### end_term = Wyy[:-1, -1] local_energy = local_energy + end_term.dimshuffle( 'x', 'x', 0) * mask_var1[:, :, None] #predy0 = lasagne.layers.get_output(l_local_a, {l_in_word_a:input_var, l_mask_word_a:mask_var}) predy_in = T.argmax(predy0, axis=1) A = T.extra_ops.to_one_hot(predy_in, 17) A = A.reshape((-1, length, 17)) #predy = predy0.reshape((-1, length, 25)) #predy = predy*mask_var[:,:,None] targets_shuffled = predy.dimshuffle(1, 0, 2) target_time0 = targets_shuffled[0] masks_shuffled = mask_var.dimshuffle(1, 0) initial_energy0 = T.dot(target_time0, Wyy[-1, :-1]) initials = [target_time0, initial_energy0] [_, target_energies], _ = theano.scan( fn=inner_function, outputs_info=initials, sequences=[targets_shuffled[1:], masks_shuffled[1:]]) cost11 = target_energies[-1] + T.sum( T.sum(local_energy * predy, axis=2) * mask_var, axis=1) # compute the ground-truth energy targets_shuffled0 = A.dimshuffle(1, 0, 2) target_time00 = targets_shuffled0[0] initial_energy00 = T.dot(target_time00, Wyy[-1, :-1]) initials0 = [target_time00, initial_energy00] [_, target_energies0], _ = theano.scan( fn=inner_function, outputs_info=initials0, sequences=[targets_shuffled0[1:], masks_shuffled[1:]]) cost110 = target_energies0[-1] + T.sum( T.sum(local_energy * A, axis=2) * mask_var, axis=1) #predy_f = predy.reshape((-1, 25)) y_f = target_var.flatten() if (params.annealing == 0): lamb = params.L3 elif (params.annealing == 1): lamb = params.L3 * (1 - 0.01 * t_t) if (params.regutype == 0): ce_hinge = lasagne.objectives.categorical_crossentropy( predy0 + eps, y_f) ce_hinge = ce_hinge.reshape((-1, length)) ce_hinge = T.sum(ce_hinge * mask_var, axis=1) cost = T.mean(-cost11) + lamb * 
T.mean(ce_hinge) else: entropy_term = -T.sum(predy0 * T.log(predy0 + eps), axis=1) entropy_term = entropy_term.reshape((-1, length)) entropy_term = T.sum(entropy_term * mask_var, axis=1) cost = T.mean(-cost11) - lamb * T.mean(entropy_term) ##from adam import adam ##updates_a = adam(cost, self.params, params.eta) #updates_a = lasagne.updates.sgd(cost, self.params, params.eta) #updates_a = lasagne.updates.apply_momentum(updates_a, self.params, momentum=0.9) #norm = T.sqrt(sum(T.sum(updates_a[tensor]**2) for tensor in self.params)) #target_norm = T.clip(norm, 0, 10.0) #multiplier = target_norm / (1e-8 + norm) from momentum import momentum updates_a = momentum(cost, self.params, params.eta, momentum=0.9) if (params.regutype == 0): self.train_fn = theano.function( inputs=[ei, ci, dt, em, em1, length0, t_t0, di0, use_dropout0], outputs=[cost, ce_hinge], updates=updates_a, on_unused_input='ignore', givens={ input_var: ei, char_input_var: ci, target_var: dt, mask_var: em, mask_var1: em1, length: length0, t_t: t_t0, decoderInputs0: di0, use_dropout: use_dropout0 }) #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, ce_hinge], updates = updates_a, on_unused_input='ignore') else: self.train_fn = theano.function( inputs=[ei, ci, dt, em, em1, length0, t_t0, di0, use_dropout0], outputs=[cost, entropy_term], updates=updates_a, on_unused_input='ignore', givens={ input_var: ei, char_input_var: ci, target_var: dt, mask_var: em, mask_var1: em1, length: length0, t_t: t_t0, decoderInputs0: di0, use_dropout: use_dropout0 }) #self.train_fn = theano.function([input_var, target_var, mask_var, mask_var1, length, t_t], [cost, entropy_term], updates = updates_a, on_unused_input='ignore') prediction = T.argmax(predy, axis=2) corr = T.eq(prediction, target_var) corr_train = (corr * mask_var).sum(dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) self.eval_fn = theano.function( inputs=[ei, ci, dt, em, em1, length0, di0, use_dropout0], outputs=[cost11, cost110, corr_train, num_tokens, prediction], on_unused_input='ignore', givens={ input_var: ei, char_input_var: ci, target_var: dt, mask_var: em, mask_var1: em1, length: length0, decoderInputs0: di0, use_dropout: use_dropout0 })
cost = T.sqr(pred_freq - target_freq).mean() #lib.load_params('iter_latest_wavenet.p') # cost = T.nnet.categorical_crossentropy( # predicted_sequences, # target_sequences.flatten() # ).mean() # By default we report cross-entropy cost in bits. # Switch to nats by commenting out this line: #cost = cost * lib.floatX(1.44269504089) params = lib.search(cost, lambda x: hasattr(x, 'param')) lib.print_params_info(cost, params) #updates = lib.optimizers.Adam(cost, params, 1e-3,gradClip=True,value=GRAD_CLIP) grads = T.grad(cost, wrt=params) lr = T.fscalar() updates = lasagne.updates.adam(grads, params, learning_rate=lr) print "Gradients Computed" train_fn = theano.function([sequences, lr], [cost, pred_freq], updates=updates, on_unused_input='warn') print "Training!" DATA_PATH = "/data/lisatmp3/kumarrit/blizzard" for epoch in xrange(NB_EPOCH): costs = [] times = [] #data_feeder = list(dataset.feed_epoch(DATA_PATH, N_FILES, BATCH_SIZE, SEQ_LEN, FRAME_SIZE, Q_LEVELS, Q_ZERO,RF)) data_feeder = list(
def __init__(self, x_dim, hidden_dim, y_dim, w_spread, p_drop): # parameters of the model self.wfx = theano.shared( name="wfx", value=w_spread * np.random.uniform(-1., 1., (x_dim + hidden_dim + 1, hidden_dim)).astype( theano.config.floatX), borrow=True) self.wbx = theano.shared( name="wbx", value=w_spread * np.random.uniform(-1., 1., (x_dim + hidden_dim + 1, hidden_dim)).astype( theano.config.floatX), borrow=True) self.wf1 = theano.shared( name="wf1", value=w_spread * np.random.uniform(-1., 1., (2 * hidden_dim + hidden_dim + 1, hidden_dim)).astype(theano.config.floatX), borrow=True) self.wb1 = theano.shared( name="wb1", value=w_spread * np.random.uniform(-1., 1., (2 * hidden_dim + hidden_dim + 1, hidden_dim)).astype(theano.config.floatX), borrow=True) self.wy = theano.shared( name="wy", value=w_spread * np.random.uniform( -1., 1., (2 * hidden_dim + 1, y_dim)).astype(theano.config.floatX), borrow=True) h_zeros = theano.shared(name="hfx_0", value=np.zeros(hidden_dim, dtype=theano.config.floatX), borrow=True) # bundle self.params = [self.wfx, self.wbx, self.wf1, self.wb1, self.wy] # define recurrent neural network # (for each input word predict all output tags) x = T.fmatrix("x") y = T.fmatrix("y") learn_rate = T.fscalar('learn_rate') activation = T.tanh #activation = T.nnet.sigmoid #activation = lambda x: x * (x > 0) # reLU #activation = lambda x: x * ((x > 0) + 0.01) #activation = lambda x: T.minimum(x * (x > 0), 6) # capped reLU def model(x, wfx, hfx_0, wbx, hbx_0, wf1, hf1_0, wb1, hb1_0, wy, p_drop): def recurrence_x(x_cur, h_prev, w, mask): h = activation(T.dot(T.concatenate([x_cur, h_prev, [one]]), w)) h_ = dropout_apply(h, mask, p_drop) return h_ def recurrence_h(f_cur, b_cur, h_prev, w, mask): h = activation( T.dot(T.concatenate([f_cur, b_cur, h_prev, [one]]), w)) h_ = dropout_apply(h, mask, p_drop) return h_ def recurrence_y(f_cur, b_cur, w): y = activation(T.dot(T.concatenate([f_cur, b_cur, [one]]), w)) return y one = np.float32(1.) if p_drop > 0.: masks = dropout_masks( p_drop, [hfx_0.shape, hbx_0.shape, hf1_0.shape, hb1_0.shape]) else: masks = [[]] * 4 hfx, _ = theano.scan(fn=recurrence_x, sequences=x, non_sequences=[wfx, masks[0]], outputs_info=[hfx_0], n_steps=x.shape[0]) hbx_rev, _ = theano.scan(fn=recurrence_x, sequences=x, non_sequences=[wbx, masks[1]], outputs_info=[hbx_0], n_steps=x.shape[0], go_backwards=True) hbx, _ = theano.scan(fn=lambda x: x, sequences=hbx_rev, n_steps=x.shape[0], go_backwards=True) hf1, _ = theano.scan(fn=recurrence_h, sequences=[hfx, hbx], non_sequences=[wf1, masks[2]], outputs_info=[hf1_0], n_steps=x.shape[0]) hb1_rev, _ = theano.scan(fn=recurrence_h, sequences=[hfx, hbx], non_sequences=[wb1, masks[3]], outputs_info=[hb1_0], n_steps=x.shape[0], go_backwards=True) hb1, _ = theano.scan(fn=lambda x: x, sequences=hb1_rev, n_steps=x.shape[0], go_backwards=True) y, _ = theano.scan(fn=recurrence_y, sequences=[hf1, hb1], non_sequences=[wy], outputs_info=[None], n_steps=x.shape[0]) return y y_pred = model(x, self.wfx, h_zeros, self.wbx, h_zeros, self.wf1, h_zeros, self.wb1, h_zeros, self.wy, 0.) 
y_noise = model(x, self.wfx, h_zeros, self.wbx, h_zeros, self.wf1, h_zeros, self.wb1, h_zeros, self.wy, p_drop) #loss = lambda y_pred, y: T.mean((y_pred - y) ** 2) # MSE #loss = lambda y_pred, y: T.sum((y_pred - y) ** 16) ** (1./16) #loss = lambda y_pred, y: T.max((y_pred - y) ** 2) loss = lambda y_pred, y: T.max(abs(y - y_pred)) + T.mean( (y - y_pred)**2) #loss = lambda y_pred, y: T.sum((y_pred - y) ** 16) ** (1./16) + T.mean((y - y_pred) ** 2) l1_reg = 0.001 l1 = sum([ T.mean(abs(w)) for w in [self.wfx, self.wbx, self.wf1, self.wb1, self.wy] ]) l2_reg = 0.001 l2 = sum([ T.mean(w**2) for w in [self.wfx, self.wbx, self.wf1, self.wb1, self.wy] ]) # define gradients and updates cost = loss(y_noise, y) + l1_reg * l1 + l2_reg * l2 #updates = sgd(cost, self.params, learn_rate) #updates = rmsprop(cost, self.params, learn_rate) updates = adam(cost, self.params, learn_rate) # compile theano functions self.predict = theano.function(inputs=[x], outputs=y_pred) self.train = theano.function( inputs=[x, y, learn_rate], outputs=[cost, T.min(y_noise), T.max(y_noise), T.mean(y_noise)], updates=updates)
args = setup() print('all arguments: ', args) temp_lambda = None loss_change = [] tmp_weights = None random_seed(args.seed) if args.model == 'convnet': x = T.ftensor4('x') elif args.model == 'mlp': x = T.matrix('x') else: raise AttributeError('unsupported model type: %s' % args.model) y = T.matrix('y') lr_ele = T.fscalar('lr_ele') mom = args.momEle # momentum lr_hyper = T.fscalar('lr_hyper') grad_valid_weight = T.tensor4('grad_valid_weight') model = DenseNet(x=x, y=y, args=args) velocities = [theano.shared(np.asarray(param.get_value(borrow=True)*0., dtype=theano.config.floatX), broadcastable=param.broadcastable, name=param.name+'_vel') for param in model.params_theta] # extra per-parameter learning-rate variables log_learning_rates = [theano.shared(np.full_like(param.get_value(borrow=True), np.log(args.lrEle), dtype=theano.config.floatX), broadcastable=param.broadcastable, name=param.name+'_llr') for param in model.params_theta] temp_llrs = None
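# Hedged sketch of the per-parameter learning-rate idea above: one shared log-learning-rate
# tensor per weight, exponentiated inside the update rule so each entry gets its own step
# size. Shapes and names here are illustrative only.
import numpy as np
import theano
import theano.tensor as T

w = theano.shared(np.zeros((3, 3), dtype=theano.config.floatX), name='w')
log_lr = theano.shared(np.full((3, 3), np.log(1e-2), dtype=theano.config.floatX),
                       name='w_llr')
x = T.matrix('x')
cost = T.sum((T.dot(x, w) - 1.0) ** 2)
g = T.grad(cost, w)
step = theano.function([x], cost, updates=[(w, w - T.exp(log_lr) * g)])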