def get_params(self):
    '''returns a list of the trainable parameters, that is, the query and
    context embeddings. (similar to layer.get_all_params.)'''
    return (get_all_params(self.l_embed_query, trainable=True) +
            get_all_params(self.l_embed_context, trainable=True))
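# Editor's sketch (not part of the original snippet): get_all_params also accepts a list
# of output layers and de-duplicates shared variables, so the two calls above could be
# collapsed into one; the method name below is only illustrative.
def get_params_single_call(self):
    return get_all_params([self.l_embed_query, self.l_embed_context], trainable=True)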
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer((None, seq_len, input_dim),
                      input_var=theano.shared(np.random.normal(size=[batch_size, seq_len, input_dim])),
                      name='input seq')
    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')
    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell, s_prev_hid, s_in, name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({s_lstm_cell: s_prev_cell,
                                                  s_lstm_hid: s_prev_hid,
                                                  s_gru: s_prev_gru}),
                     input_sequences={s_in: l_in},
                     unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print(old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print(old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
def _compile(self):
    rc = self.rc

    # actor gradient step
    O = self.net.O
    V = ll.get_output(self.net.critic)
    params = self.net.actor_params
    regl_params = ll.get_all_params(self.net.actor, regularizable=True)
    regl = 0.5*rc['l2_actor']*tt.sum([tt.sum(p**2) for p in regl_params])
    updates = rc['gradient_updates'](V.mean()+regl, params, learning_rate=rc['lr_actor'])
    self.update_actor = th.function([O], [V.mean()], updates=updates)

    # critic bellman error (test version, doesn't update parameters)
    U = tt.matrix()
    Q = ll.get_output(self.net.critic, inputs={self.net.actor: U})
    Y = tt.matrix()
    J = 0.5*tt.mean((Y-Q)**2)
    self.J = th.function([O, U, Y], J)

    # critic bellman error (train version, does update parameters)
    regl_params = [p for p in ll.get_all_params(self.net.critic, regularizable=True)
                   if p not in ll.get_all_params(self.net.actor)]
    regl = 0.5*rc['l2_critic']*tt.sum([tt.sum(p**2) for p in regl_params])
    params = self.net.critic_params
    updates = rc['gradient_updates'](J+regl, params, learning_rate=rc['lr_critic'])
    self.update_critic = th.function([O, U, Y], J, updates=updates)

    # target network update
    updates = []
    tau = rc['tau']
    for p, tgt_p in zip(self.net.all_params, self.target_net.all_params):
        updates.append((tgt_p, tau*p + (1-tau)*tgt_p))
    self.update_target = th.function([], [], updates=updates)

    # build cost function
    # TODO: handle this better through rc
    x = tt.vector()
    u = tt.vector()
    site_xpos = tt.matrix()

    # L2 costs
    c = 0.5*rc['l2_q']*tt.sum(x[:self.model['nq']]**2)
    c += 0.5*rc['l2_v']*tt.sum(x[-self.model['nv']:]**2)
    c += 0.5*rc['l2_u']*tt.sum(u**2)

    # Huber costs
    if rc['huber_site'] is not None:
        a = rc['huber_alpha']
        d = site_xpos[0] - site_xpos[1]
        c += rc['huber_site']*(tt.sqrt(tt.sum(d**2) + a**2) - a)

    # compile cost function
    # TODO: remove need for 'on_unused_input'
    self.cost = th.function([x, u, site_xpos], c, on_unused_input='ignore')
def set_decoder_weights(decoder_1step):
    """
    set 1step weights equal to training decoder/probas_predictor weights
    """
    params_1step = get_all_params(decoder_1step)
    params_full = get_all_params(self.net['l_dist'])
    params_full_dict = {p.name: p for p in params_full}

    for param_1step in params_1step:
        # use Theano .get_value() and .set_value() methods, applied to the shared variables
        param_1step.set_value(params_full_dict[param_1step.name].get_value())
def test_get_all_params(self):
    from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
    l1 = InputLayer((10, 20))
    l2 = DenseLayer(l1, 30)
    l3 = DenseLayer(l2, 40)

    assert get_all_params(l3) == l2.get_params() + l3.get_params()
    assert (get_all_params(l3, regularizable=False) ==
            (l2.get_params(regularizable=False) +
             l3.get_params(regularizable=False)))
    assert (get_all_params(l3, regularizable=True) ==
            (l2.get_params(regularizable=True) +
             l3.get_params(regularizable=True)))
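# Small illustrative sketch (assumes only Lasagne's standard parameter tags): the keyword
# arguments of get_all_params filter by the tags attached via add_param, e.g. a
# DenseLayer's W is tagged trainable and regularizable while b is only trainable.
from lasagne.layers import InputLayer, DenseLayer, get_all_params
_net = DenseLayer(InputLayer((None, 20)), num_units=30)
assert get_all_params(_net, trainable=True) == [_net.W, _net.b]
assert get_all_params(_net, regularizable=True) == [_net.W]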
def build_optimizer(network, placeholders, optimization, learning_rate):

    # build loss function
    if optimization['objective'] == 'lower_bound':
        if 'binary' in optimization:
            binary = optimization['binary']
        else:
            binary = False
        loss, prediction = variational_lower_bound(network, placeholders['inputs'],
                                                   deterministic=False, binary=binary)
        # regularize parameters
        loss += regularization(network['X'], optimization)
        params = layers.get_all_params(network['X'], trainable=True)
    else:
        prediction = layers.get_output(network['output'], deterministic=False)
        loss = build_loss(placeholders['targets'], prediction, optimization)
        # regularize parameters
        loss += regularization(network['output'], optimization)
        params = layers.get_all_params(network['output'], trainable=True)

    # calculate and clip gradients
    if "weight_norm" in optimization:
        weight_norm = optimization['weight_norm']
    else:
        weight_norm = None
    grad = calculate_gradient(loss, params, weight_norm=weight_norm)

    # setup parameter updates
    update_op = build_updates(grad, params, optimization, learning_rate)

    # test/validation set
    if optimization['objective'] == 'lower_bound':
        test_loss, test_prediction = variational_lower_bound(network, placeholders['inputs'],
                                                             deterministic=False, binary=binary)
    else:
        test_prediction = layers.get_output(network['output'], deterministic=True)
        test_loss = build_loss(placeholders['targets'], test_prediction, optimization)

    # create theano functions
    train_fun = theano.function(list(placeholders.values()), [loss, prediction], updates=update_op)
    test_fun = theano.function(list(placeholders.values()), [test_loss, test_prediction])

    return train_fun, test_fun
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )

    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        # raising a bare string is not valid Python; raise a proper exception instead
        raise ValueError("Need to set optimizer correctly")

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )
    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
def test_maxpool_layer():
    l_in1 = InputLayer((None, 2))
    l_in2 = InputLayer((None, 20))
    l_hid = DenseLayer(l_in2, num_units=30, nonlinearity=rectify)
    l_pool = MaxpoolLayer([l_in1, l_hid])
    l_out = DenseLayer(l_pool, num_units=1, nonlinearity=sigmoid)

    bounds = theano.tensor.lmatrix('bounds')
    data = theano.tensor.matrix('data')
    targets = theano.tensor.matrix('targets')

    predictions = get_output(l_out, {l_in1: bounds, l_in2: data})
    loss = categorical_crossentropy(predictions, targets)
    loss = aggregate(loss, mode='mean')
    params = get_all_params(l_out)
    updates_sgd = sgd(loss, params, learning_rate=0.0001)

    train_function = theano.function([bounds, data, targets], updates=updates_sgd,
                                     allow_input_downcast=True)

    test_bounds = np.array([[0, 3], [3, 5], [5, 7]])
    test_X = np.random.randn(10, 20)
    test_Y = np.array([[0], [1], [0]])
    train_function(test_bounds, test_X, test_Y)
def init_model(self):
    print('Initializing model...')
    ra_input_var = T.tensor3('raw_audio_input')
    mc_input_var = T.tensor3('melody_contour_input')
    target_var = T.imatrix('targets')
    network = self.build_network(ra_input_var, mc_input_var)

    prediction = layers.get_output(network)
    prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

    test_prediction = layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                      dtype=theano.config.floatX)

    print('Building functions...')
    self.train_fn = theano.function([ra_input_var, mc_input_var, target_var],
                                    [loss, prediction],
                                    updates=updates,
                                    on_unused_input='ignore')
    self.val_fn = theano.function([ra_input_var, mc_input_var, target_var],
                                  [test_loss, test_acc, test_prediction],
                                  on_unused_input='ignore')
    self.run_fn = theano.function([ra_input_var, mc_input_var],
                                  [prediction],
                                  on_unused_input='ignore')
def __init__(self, dims, nonlinearities=None, dropouts=None, update_fn=None,
             batch_norm=False, loss_type='cosine_margin', margin=0.8):
    """Initialize a Siamese neural network

    Parameters:
    -----------
    update_fn: theano function with 2 arguments (loss, params)
        Update scheme, default to adadelta
    batch_norm: bool
        Do batch normalisation on first layer, default to false
    """
    assert len(dims) >= 3, 'Not enough dimensions'
    if dropouts != None:
        dropouts = copy.copy(dropouts)
        assert len(dropouts) == len(dims) - 1
        dropouts.append(0)
    else:
        dropouts = [0] * len(dims)
    if nonlinearities == None:
        nonlinearities = [nl.sigmoid] * (len(dims) - 1)
    else:
        assert len(nonlinearities) == len(dims) - 1
    if update_fn == None:
        update_fn = lasagne.updates.adadelta

    self.input_var1 = T.matrix('inputs1')
    self.input_var2 = T.matrix('inputs2')
    self.target_var = T.ivector('targets')

    # input layer
    network1 = layers.InputLayer((None, dims[0]), input_var=self.input_var1)
    network2 = layers.InputLayer((None, dims[0]), input_var=self.input_var2)
    if dropouts[0]:
        network1 = layers.DropoutLayer(network1, p=dropouts[0])
        network2 = layers.DropoutLayer(network2, p=dropouts[0])

    # hidden layers
    for dim, dropout, nonlin in zip(dims[1:], dropouts[1:], nonlinearities):
        network1 = layers.DenseLayer(network1, num_units=dim,
                                     W=lasagne.init.GlorotUniform(),
                                     nonlinearity=nonlin)
        network2 = layers.DenseLayer(network2, num_units=dim,
                                     W=network1.W, b=network1.b,
                                     nonlinearity=nonlin)
        if batch_norm:
            network1 = layers.batch_norm(network1)
            network2 = layers.batch_norm(network2)
        if dropout:
            network1 = layers.DropoutLayer(network1, p=dropout)
            network2 = layers.DropoutLayer(network2, p=dropout)

    self.network = [network1, network2]
    self.params = layers.get_all_params(network1, trainable=True)

    # util functions, completely stolen from Lasagne example
    self.prediction1 = layers.get_output(network1)
    self.prediction2 = layers.get_output(network2)
    # if non-deterministic:
    self.test_prediction1 = layers.get_output(network1, deterministic=True)
    self.test_prediction2 = layers.get_output(network2, deterministic=True)

    self.change_loss(loss_type, margin)
    self.change_update(update_fn)
def parameter_analysis(layer):
    all_params = ll.get_all_param_values(layer, trainable=True)
    param_names = [p.name for p in ll.get_all_params(layer, trainable=True)]
    print_gradinfo(param_names,
                   {'nneg': [np.count_nonzero(p < 0) / np.product(p.shape) for p in all_params],
                    'norm': [np.linalg.norm(p) for p in all_params],
                    'shape': [p.shape for p in all_params]})
def build_model(self, train_x, test_x, valid_x, update, update_args):
    self.train_x = train_x
    self.test_x = test_x
    self.validation_x = valid_x
    self.update = update
    self.update_args = update_args
    self.index = T.iscalar('index')
    self.batch_slice = slice(self.index * self.batch_size,
                             (self.index + 1) * self.batch_size)

    x = self.srng.binomial(size=self.x.shape, n=1, p=self.x)
    log_pz, log_qz_given_x, log_px_given_z = self.model.get_log_distributions(self.x)
    loss_eval = (log_pz + log_px_given_z - log_qz_given_x).sum()
    loss_eval /= self.batch_size

    all_params = get_all_params(self.model)
    updates = self.update(-loss_eval, all_params, *self.update_args)

    train_model = theano.function([self.index], loss_eval, updates=updates,
                                  givens={self.x: self.train_x[self.batch_slice]})
    test_model = theano.function([self.index], loss_eval,
                                 givens={self.x: self.test_x[self.batch_slice]})
    validate_model = theano.function([self.index], loss_eval,
                                     givens={self.x: self.validation_x[self.batch_slice]})

    return train_model, test_model, validate_model
def makeRegressionNetwork(self, n_in, n_hidden, n_out, learning_rate=0.001):
    """ build a feedforward neural network with regression output """
    # network input
    input_ = T.matrix('input_')    # matrix of shape batch size times number of input variables
    target_ = T.matrix('target_')  # matrix of shape batch size times number of output variables

    # network
    l_input = layers.InputLayer((None, n_in))
    l_hid = layers.DenseLayer(l_input, num_units=n_hidden)
    self.l_out = layers.DenseLayer(l_hid, num_units=n_out, nonlinearity=None)

    # network output
    l_outvalue = layers.get_output(self.l_out, input_)
    self.predict = theano.function([input_], l_outvalue, allow_input_downcast=True)

    # loss/cost function
    loss = T.mean(lasagne.objectives.squared_error(l_outvalue, target_))

    # calculate the updates
    params = layers.get_all_params(self.l_out)
    updates = lasagne.updates.nesterov_momentum(loss, params,
                                                learning_rate=learning_rate, momentum=0.9)

    # update the weights from a given input and target
    self.train_function = theano.function([input_, target_], loss, updates=updates,
                                          allow_input_downcast=True)
def _get_train_fun(self):
    output_probs = get_output(self.net['l_dist'])  # "long" 2d matrix with prob distribution

    input_ids = T.imatrix()
    # cut off the first ids from every id sequence: they correspond to START_TOKEN, that we are not predicting
    target_ids = input_ids[:, 1:]
    target_ids_flattened = target_ids.flatten()  # "long" vector with target ids

    cost = categorical_crossentropy(
        predictions=output_probs,
        targets=target_ids_flattened
    ).mean()

    all_params = get_all_params(self.net['l_dist'], trainable=True)

    print("Computing train updates...")
    updates = lasagne.updates.adadelta(
        loss_or_grads=cost,
        params=all_params,
        learning_rate=LEARNING_RATE
    )

    print("Compiling train function...")
    train_fun = theano.function(
        inputs=[self.net['l_in_x'].input_var, self.net['l_in_y'].input_var, input_ids],
        outputs=cost,
        updates=updates
    )

    return train_fun
def test_get_all_params_with_unwrap_shared(self):
    from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
    import theano.tensor as T
    from lasagne.utils import floatX

    l1 = InputLayer((10, 20))
    l2 = DenseLayer(l1, 30)
    W1 = theano.shared(floatX(numpy.zeros((30, 2))))
    W2 = theano.shared(floatX(numpy.zeros((2, 40))))
    W_expr = T.dot(W1, W2)
    l3 = DenseLayer(l2, 40, W=W_expr, b=None)

    l2_params = get_all_params(l2)
    assert get_all_params(l3) == l2_params + [W1, W2]
    assert get_all_params(l3, unwrap_shared=False) == l2_params + [W_expr]
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer = InputLayer(shape=(None, 12, 64, 64), input_var=input_var)  # InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    layer = DimshuffleLayer(layer, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = Conv3DDNNLayer(incoming=layer, num_filters=1, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=sigmoid)
    layer_prediction = layer

    # Loss
    prediction = get_output(layer_prediction)
    loss = binary_crossentropy(prediction[:, 0, :, :, :], target_var).mean()

    # Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True)
    test_loss = binary_crossentropy(test_prediction[:, 0, :, :, :], target_var).mean()

    return test_prediction, prediction, loss, params
def __init__(self, sound_shape, num_units, main_layer_class, loss_func, updates_func):
    # input tensor (number of batches, number of recordings, time, frequency)
    input_X = T.tensor4("X")

    # network
    input_layer = InputLayer(shape=(None, 3) + sound_shape, input_var=input_X.swapaxes(2, 3))
    all_output = main_layer_class(input_layer, sound_shape, num_units)  # for loss
    vector_output = ReshapeLayer(all_output, (-1, 1, num_units))        # for use

    # network predictions
    all_predicted = get_output(all_output)        # for loss
    vector_predicted = get_output(vector_output)  # for use

    # loss function
    loss = loss_func(all_predicted)

    # compute updated weights with one gradient step
    trainable_weights = get_all_params(all_output, trainable=True)
    updates_sgd = updates_func(loss, trainable_weights)

    # function that trains the network for one step and returns the loss value
    self.fit = theano.function([input_X], loss, updates=updates_sgd)
    # function that returns the voice vector
    self.predict = theano.function([input_X], vector_predicted)

    self.all_output = all_output
    self.vector_output = vector_output
    self.all_predicted = all_predicted
    self.vector_predicted = vector_predicted
def create_encoder_decoder_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # reconstruction loss
    encoder_decoder_loss = T.mean(
        T.mean(T.sqr(X - X_hat), axis=1)
    )

    if apply_updates:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_params = get_all_params(
            layers['l_decoder_out'], trainable=True)

        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss, encoder_decoder_params, 0.01, 0.9)
    else:
        encoder_decoder_updates = None

    encoder_decoder_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={
            X: X_batch,
        },
    )

    return encoder_decoder_func
def create_iter_funcs_train(l_out, lr, mntm, wd):
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    y_hat = layers.get_output(l_out, X, deterministic=False)

    # softmax loss
    train_loss = T.mean(
        T.nnet.categorical_crossentropy(y_hat, y))

    # L2 regularization
    train_loss += wd * regularize_network_params(l_out, l2)

    train_acc = T.mean(
        T.eq(y_hat.argmax(axis=1), y))

    all_params = layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        train_loss, all_params, lr, mntm)

    train_iter = theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[train_loss, train_acc],
        updates=updates,
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return train_iter
def _create_nnet(input_dims, output_dims, learning_rate, num_hidden_units=15,
                 batch_size=32, max_train_epochs=1,
                 hidden_nonlinearity=nonlinearities.rectify,
                 output_nonlinearity=None, update_method=updates.sgd):
    """
    A subclass may override this if a different sort of network is desired.
    """
    nnlayers = []
    nnlayers.append(layers.InputLayer(shape=(None, input_dims)))
    nnlayers.append(layers.DenseLayer(nnlayers[-1], num_hidden_units,
                                      nonlinearity=hidden_nonlinearity))
    nnlayers.append(layers.DenseLayer(nnlayers[-1], output_dims,
                                      nonlinearity=output_nonlinearity))

    prediction = layers.get_output(nnlayers[-1])

    input_var = nnlayers[0].input_var
    target = T.matrix(name="target", dtype=floatX)

    loss = objectives.squared_error(prediction, target).mean()

    params = layers.get_all_params(nnlayers[-1], trainable=True)

    updates = update_method(loss, params, learning_rate)

    fit = theano.function([input_var, target], loss, updates=updates)

    predict = theano.function([input_var], prediction)

    nnet = Mock(
        fit=fit,
        predict=predict,
    )
    return nnet
def get_model(input_images, input_position, input_mult, target_var):

    # number of SAX and distance between SAX slices
    # indexes = []
    # for i in range(input_position.shape[0]):
    #     indexes.append(numpy.where(input_position[i][:,0] == 0.)[0][0])

    # input layer with unspecified batch size
    layer = InputLayer(shape=(None, 22, 30, 64, 64), input_var=input_images)  # InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)

    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    shortcut = layer
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = ElemwiseSumLayer([layer, shortcut])
    layer = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=rectify))
    layer = Conv3DDNNLayer(incoming=layer, num_filters=22, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=sigmoid)

    layer_max = ExpressionLayer(layer, lambda X: X.max(1), output_shape='auto')
    layer_min = ExpressionLayer(layer, lambda X: X.min(1), output_shape='auto')
    layer_prediction = layer

    # image prediction
    prediction = get_output(layer_prediction)
    loss = binary_crossentropy(prediction, target_var).mean()

    # Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True)
    test_loss = binary_crossentropy(test_prediction, target_var).mean()

    return test_prediction, prediction, loss, params
def load_weights(layer, filename):
    with open(filename, 'rb') as f:
        src_params_list = pickle.load(f)

    dst_params_list = get_all_params(layer)
    # assign the parameter values stored on disk to the model
    for src_params, dst_params in zip(src_params_list, dst_params_list):
        dst_params.set_value(src_params)
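# Alternative sketch (assuming the pickled list was produced by
# lasagne.layers.get_all_param_values): set_all_param_values restores every value in one
# call and raises if the number or shapes of the parameters do not match.
import pickle
from lasagne.layers import set_all_param_values

def load_weights_alt(layer, filename):
    with open(filename, 'rb') as f:
        values = pickle.load(f)
    set_all_param_values(layer, values)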
def init_weights(l_out, init_file):
    print('loading weights from %s' % (init_file))
    with open(init_file, 'rb') as ifile:
        src_layers = pickle.load(ifile)
    dst_layers = layers.get_all_params(l_out)
    for i, (src_weights, dst_layer) in enumerate(
            zip(src_layers, dst_layers)):
        print('loading pretrained weights for %s' % (dst_layer.name))
        dst_layer.set_value(src_weights)
def build_treatment_model(self, n_vars, **kwargs):

    input_vars = TT.matrix()
    instrument_vars = TT.matrix()
    targets = TT.vector()

    inputs = layers.InputLayer((None, n_vars), input_vars)
    inputs = layers.DropoutLayer(inputs, p=0.2)

    dense_layer = layers.DenseLayer(inputs, 2 * kwargs['dense_size'],
                                    nonlinearity=nonlinearities.rectify)
    dense_layer = layers.batch_norm(dense_layer)
    dense_layer = layers.DropoutLayer(dense_layer, p=0.2)

    for _ in xrange(kwargs['n_dense_layers'] - 1):
        dense_layer = layers.DenseLayer(dense_layer, kwargs['dense_size'],
                                        nonlinearity=nonlinearities.rectify)
        dense_layer = layers.batch_norm(dense_layer)

    self.treatment_output = layers.DenseLayer(dense_layer, 1,
                                              nonlinearity=nonlinearities.linear)
    init_params = layers.get_all_param_values(self.treatment_output)

    prediction = layers.get_output(self.treatment_output, deterministic=False)
    test_prediction = layers.get_output(self.treatment_output, deterministic=True)

    l2_cost = regularization.regularize_network_params(self.treatment_output,
                                                       regularization.l2)
    loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

    params = layers.get_all_params(self.treatment_output, trainable=True)
    param_updates = updates.adadelta(loss, params)

    self._train_fn = theano.function(
        [
            input_vars,
            targets,
            instrument_vars,
        ],
        loss,
        updates=param_updates
    )

    self._loss_fn = theano.function(
        [
            input_vars,
            targets,
            instrument_vars,
        ],
        loss,
    )

    self._output_fn = theano.function(
        [
            input_vars,
        ],
        test_prediction,
    )

    return init_params
def create_discriminator_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    pz = T.fmatrix('pz')
    X_batch = T.fmatrix('X_batch')
    pz_batch = T.fmatrix('pz_batch')

    # the discriminator receives samples from q(z|x) and p(z)
    # and should predict to which distribution each sample belongs
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={
            layers['l_prior_in']: pz,
            layers['l_encoder_in']: X,
        },
        deterministic=False,
    )

    # label samples from q(z|x) as 1 and samples from p(z) as 0
    discriminator_targets = T.vertical_stack(
        T.ones((X_batch.shape[0], 1)),
        T.zeros((pz_batch.shape[0], 1))
    )

    discriminator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs,
            discriminator_targets,
        )
    )

    if apply_updates:
        # only layers that are part of the discriminator should be updated
        discriminator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, discriminator=True)

        discriminator_updates = nesterov_momentum(
            discriminator_loss, discriminator_params, 0.1, 0.0)
    else:
        discriminator_updates = None

    discriminator_func = theano.function(
        inputs=[
            theano.In(X_batch),
            theano.In(pz_batch),
        ],
        outputs=discriminator_loss,
        updates=discriminator_updates,
        givens={
            X: X_batch,
            pz: pz_batch,
        },
    )

    return discriminator_func
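# Side note (hedged, not from the original snippet): discriminator=True is not a built-in
# Lasagne tag; filtering on it only works if the discriminator layers attached that tag
# themselves, either by passing it to add_param(..., discriminator=True) when the layer
# was defined, or by adding it to the existing tag sets afterwards, e.g.:
# for p in some_layer.params:              # Layer.params maps shared var -> set of tag strings
#     some_layer.params[p].add('discriminator')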
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer_input = InputLayer(shape=(None, 30, 80, 80), input_var=input_var)  # InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    layer_0 = DimshuffleLayer(layer_input, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_1 = batch_norm(Conv3DDNNLayer(incoming=layer_0, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_2 = batch_norm(Conv3DDNNLayer(incoming=layer_1, num_filters=16, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_3 = MaxPool3DDNNLayer(layer_2, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_4 = DropoutLayer(layer_3, p=0.25)

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_5 = batch_norm(Conv3DDNNLayer(incoming=layer_4, num_filters=32, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_6 = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=32, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_7 = MaxPool3DDNNLayer(layer_6, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_8 = DropoutLayer(layer_7, p=0.25)

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_5 = batch_norm(Conv3DDNNLayer(incoming=layer_8, num_filters=64, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_6 = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=64, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_7 = batch_norm(Conv3DDNNLayer(incoming=layer_6, num_filters=64, filter_size=(3, 3, 3), stride=(1, 1, 1), pad='same', nonlinearity=leaky_rectify))
    layer_8 = MaxPool3DDNNLayer(layer_7, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_9 = DropoutLayer(layer_8, p=0.25)

    # LSTM
    layer = DimshuffleLayer(layer_9, (0, 2, 1, 3, 4))
    # layer_prediction = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True, cell=Gate(linear))
    layer = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True)
    layer_prediction = DenseLayer(layer, 2, nonlinearity=linear)

    # Output Layer
    # layer_hidden = DenseLayer(layer_flatten, 500, nonlinearity=linear)
    # layer_prediction = DenseLayer(layer_hidden, 2, nonlinearity=linear)

    # Loss
    prediction = get_output(layer_prediction) / multiply_var**2
    loss = T.abs_(prediction - target_var)
    loss = loss.mean()

    # Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction = get_output(layer_prediction, deterministic=True) / multiply_var**2
    test_loss = T.abs_(test_prediction - target_var)
    test_loss = test_loss.mean()

    # crps estimate
    crps = T.abs_(test_prediction - target_var).mean() / 600

    return test_prediction, crps, loss, params
def create_network(available_actions_num):
    # Creates the input variables
    s1 = tensor.tensor4("States")
    a = tensor.vector("Actions", dtype="int32")
    q2 = tensor.vector("Next State best Q-Value")
    r = tensor.vector("Rewards")
    nonterminal = tensor.vector("Nonterminal", dtype="int8")

    # Creates the input layer of the network.
    dqn = InputLayer(shape=[None, 1, downsampled_y, downsampled_x], input_var=s1)

    # Adds 3 convolutional layers, each followed by a max pooling layer.
    dqn = Conv2DLayer(dqn, num_filters=32, filter_size=[8, 8],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[4, 4],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[3, 3],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])

    # Adds a single fully connected layer.
    dqn = DenseLayer(dqn, num_units=512, nonlinearity=rectify,
                     W=GlorotUniform("relu"), b=Constant(.1))

    # Adds a single fully connected layer which is the output layer.
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_num, nonlinearity=None)

    # Theano stuff
    q = get_output(dqn)
    # Only q for the chosen actions is updated more or less according to following formula:
    # target Q(s,a,t) = r + gamma * max Q(s2,_,t+1)
    target_q = tensor.set_subtensor(q[tensor.arange(q.shape[0]), a],
                                    r + discount_factor * nonterminal * q2)
    loss = squared_error(q, target_q).mean()

    # Updates the parameters according to the computed gradient using rmsprop.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compiles theano functions
    print "Compiling the network ..."
    function_learn = theano.function([s1, q2, a, r, nonterminal], loss,
                                     updates=updates, name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1], tensor.argmax(q), name="test_fn")
    print "Network compiled."

    # Returns Theano objects for the net and functions.
    # We wouldn't need the net anymore but it is nice to save your model.
    return dqn, function_learn, function_get_q_values, function_get_best_action
def build_model0(input_var, target_var, regularW=0, params_load=None):
    network = layers.InputLayer(shape=(None, 3, 256, 256), input_var=input_var)

    # size 256*256
    network = layers.Pool2DLayer(network, pool_size=(2, 2), stride=2, pad=0, mode='average_inc_pad')
    # size 128*128
    network = layers.Pool2DLayer(network, pool_size=(2, 2), stride=2, pad=0, mode='average_inc_pad')
    # size 64*64
    network = layers.Conv2DLayer(network, num_filters=32, filter_size=(5, 5),
                                 nonlinearity=nonLinear.leaky_rectify,
                                 W=init.GlorotUniform(gain='relu'), pad='same')
    network = layers.MaxPool2DLayer(network, pool_size=(2, 2))
    network = layers.DropoutLayer(network, p=0.15)
    # size 32*32
    network = layers.Conv2DLayer(network, num_filters=64, filter_size=(5, 5),
                                 nonlinearity=nonLinear.leaky_rectify,
                                 W=init.GlorotUniform(gain='relu'), pad='same')
    network = layers.MaxPool2DLayer(network, pool_size=(2, 2))
    network = layers.DropoutLayer(network, p=0.2)
    # size 16*16
    network = layers.Conv2DLayer(network, num_filters=128, filter_size=(5, 5),
                                 nonlinearity=nonLinear.leaky_rectify,
                                 W=init.GlorotUniform(gain='relu'), pad='same')
    network = layers.MaxPool2DLayer(network, pool_size=(2, 2))
    network = layers.DropoutLayer(network, p=0.3)
    # size 8*8
    network = layers.Conv2DLayer(network, num_filters=256, filter_size=(5, 5),
                                 nonlinearity=nonLinear.leaky_rectify,
                                 W=init.GlorotUniform(gain='relu'), pad='same')
    network = layers.MaxPool2DLayer(network, pool_size=(2, 2))
    network = layers.DropoutLayer(network, p=0.4)
    # size 4*4
    network = layers.GlobalPoolLayer(network)
    network = layers.DenseLayer(network, num_units=1000,
                                nonlinearity=nonLinear.leaky_rectify,
                                W=init.GlorotUniform(gain='relu'))
    network = layers.DenseLayer(network, num_units=2,
                                nonlinearity=nonLinear.softmax)

    prediction = layers.get_output(network)
    loss = objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()

    params = layers.get_all_params(network, trainable=True)
    if params_load != None:
        [p.set_value(pval) for (p, pval) in zip(params, params_load)]

    return network, loss, params
def triplet_loss_iter(embedder, update_params={}):
    X_triplets = {
        'anchor': T.tensor4(),
        'positive': T.tensor4(),
        'negative': T.tensor4(),
    }  # each will be a batch of images

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k: ll.get_output(embedder, X)[-1] for k, X in X_triplets.items()}
    predicted_embeds_valid = {k: ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_triplets.items()}
    # each output should be batch_size x embed_size

    # should give us a vector of batch_size of distances btw anchor and positive
    alpha = 0.2  # FaceNet alpha
    triplet_pos = lambda pred: (pred['anchor'] - pred['positive']).norm(2, axis=1)
    triplet_neg = lambda pred: (pred['anchor'] - pred['negative']).norm(2, axis=1)
    triplet_distances = lambda pred: (triplet_pos(pred) - triplet_neg(pred) + alpha).clip(0, np.inf)
    triplet_failed = lambda pred: T.mean(triplet_distances(pred) > alpha)
    triplet_loss = lambda pred: T.sum(triplet_distances(pred))

    decay = 0.001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: triplet_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'TL'  # for the names
    # all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True)  # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    # updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']],
                                 [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    # theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']],
                                 [triplet_loss(predicted_embeds_valid),
                                  losses_reg(predicted_embeds_valid),
                                  triplet_failed(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train': train_iter, 'valid': valid_iter, 'gradnames': [g.name for g in grads]}
def regularization(network, optimization):
    all_params = layers.get_all_params(network, regularizable=True)

    # weight-decay regularization
    loss = 0
    if "l1" in optimization:
        l1_penalty = apply_penalty(all_params, l1) * optimization["l1"]
        loss += l1_penalty
    if "l2" in optimization:
        l2_penalty = apply_penalty(all_params, l2) * optimization["l2"]
        loss += l2_penalty
    return loss
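# Hypothetical usage sketch (variable names assumed, not part of the original snippet):
# the returned penalty term is simply added to the task loss before building updates.
penalty = regularization(network, {'l2': 1e-4})
total_loss = categorical_crossentropy(prediction, target_var).mean() + penalty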
def init_mdn(self, svi=False, n_components=1, rank=None, mdn_actfun=lnl.tanh,
             homoscedastic=False, min_precisions=None, **unused_kwargs):
    """
    :param svi: bool
        Whether to use SVI version or not
    :param n_components: int
    :param rank: int
    :param homoscedastic: bool
    :param unused_kwargs: dict
    :param mdn_actfun: lasagne nonlinearity activation function for hidden units
    :param min_precisions: minimum values for diagonal elements of precision matrix
        for all components (usually taken to be prior precisions)
    :return: None
    """
    self.svi, self.n_components, self.rank, self.mdn_actfun, \
        self.homoscedastic, self.min_precisions = \
        svi, n_components, rank, mdn_actfun, homoscedastic, min_precisions

    for key in unused_kwargs.keys():
        print("MDN ignoring unused input {0}".format(key))

    # hidden layers
    for l in range(len(self.n_hiddens)):
        self.layer['hidden_' + str(l + 1)] = dl.FullyConnectedLayer(
            last(self.layer), n_units=self.n_hiddens[l], actfun=self.mdn_actfun,
            svi=self.svi, name='h' + str(l + 1))

    last_hidden = last(self.layer)

    # mixture layers
    self.layer['mixture_weights'] = dl.MixtureWeightsLayer(
        last_hidden, n_units=self.n_components, actfun=lnl.softmax,
        svi=self.svi, name='weights')
    self.layer['mixture_means'] = dl.MixtureMeansLayer(
        last_hidden, n_components=self.n_components, n_dim=self.n_outputs,
        svi=self.svi, name='means')
    if self.homoscedastic:
        PrecisionsLayer = dl.MixtureHomoscedasticPrecisionsLayer
    else:
        PrecisionsLayer = dl.MixturePrecisionsLayer
    # why is homoscedastic an input to the layer init?
    self.layer['mixture_precisions'] = PrecisionsLayer(
        last_hidden, n_components=self.n_components, n_dim=self.n_outputs,
        svi=self.svi, name='precisions', rank=self.rank,
        homoscedastic=self.homoscedastic, min_precisions=min_precisions)

    last_mog = [self.layer['mixture_weights'],
                self.layer['mixture_means'],
                self.layer['mixture_precisions']]

    # mixture parameters
    # a : weights, matrix with shape (batch, n_components)
    # ms : means, list of len n_components with (batch, n_dim, n_dim)
    # Us : precision factors, n_components list with (batch, n_dim, n_dim)
    # ldetUs : log determinants of precisions, n_comp list with (batch, )
    self.a, self.ms, precision_out = ll.get_output(last_mog, deterministic=False)
    self.Us = precision_out['Us']
    self.ldetUs = precision_out['ldetUs']

    self.comps = {
        **{'a': self.a},
        **{'m' + str(i): self.ms[i] for i in range(self.n_components)},
        **{'U' + str(i): self.Us[i] for i in range(self.n_components)}}

    # log probability of y given the mixture distribution
    # lprobs_comps : log probs per component, list of len n_components with (batch, )
    # probs : log probs of mixture, (batch, )
    self.lprobs_comps = [-0.5 * tt.sum(tt.sum((self.params - m).dimshuffle(
        [0, 'x', 1]) * U, axis=2)**2, axis=1) + ldetU
        for m, U, ldetU in zip(self.ms, self.Us, self.ldetUs)]
    self.lprobs = (MyLogSumExp(tt.stack(self.lprobs_comps, axis=1) + tt.log(self.a), axis=1)
                   - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze()

    # the quantities from above again, but with deterministic=True
    # --- in the svi case, this will disable injection of randomness;
    # the mean of weights is used instead
    self.da, self.dms, dprecision_out = ll.get_output(last_mog, deterministic=True)
    self.dUs = dprecision_out['Us']
    self.dldetUs = dprecision_out['ldetUs']

    self.dcomps = {
        **{'a': self.da},
        **{'m' + str(i): self.dms[i] for i in range(self.n_components)},
        **{'U' + str(i): self.dUs[i] for i in range(self.n_components)}}

    self.dlprobs_comps = [-0.5 * tt.sum(tt.sum((self.params - m).dimshuffle(
        [0, 'x', 1]) * U, axis=2)**2, axis=1) + ldetU
        for m, U, ldetU in zip(self.dms, self.dUs, self.dldetUs)]
    self.dlprobs = (MyLogSumExp(tt.stack(self.dlprobs_comps, axis=1) + tt.log(self.da), axis=1)
                    - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze()

    # parameters of network
    self.aps = ll.get_all_params(last_mog)           # all parameters
    self.mps = ll.get_all_params(last_mog, mp=True)  # means
    self.sps = ll.get_all_params(last_mog, sp=True)  # log stds

    # weight and bias parameter sets as separate lists
    self.mps_wp = ll.get_all_params(last_mog, mp=True, wp=True)
    self.sps_wp = ll.get_all_params(last_mog, sp=True, wp=True)
    self.mps_bp = ll.get_all_params(last_mog, mp=True, bp=True)
    self.sps_bp = ll.get_all_params(last_mog, sp=True, bp=True)
def build_model(self, train_set, test_set, validation_set=None):
    super(CNN, self).build_model(train_set, test_set, validation_set)

    epsilon = 1e-8
    y_train = T.clip(get_output(self.model, self.sym_x), epsilon, 1)
    loss_cc = aggregate(categorical_crossentropy(y_train, self.sym_t), mode='mean')
    loss_train_acc = categorical_accuracy(y_train, self.sym_t).mean()

    y = T.clip(get_output(self.model, self.sym_x, deterministic=True), epsilon, 1)
    loss_eval = aggregate(categorical_crossentropy(y, self.sym_t), mode='mean')
    loss_acc = categorical_accuracy(y, self.sym_t).mean()

    all_params = get_all_params(self.model, trainable=True)
    sym_beta1 = T.scalar('beta1')
    sym_beta2 = T.scalar('beta2')
    grads = T.grad(loss_cc, all_params)
    grads = [T.clip(g, -5, 5) for g in grads]
    updates = rmsprop(grads, all_params, self.sym_lr, sym_beta1, sym_beta2)

    inputs = [self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1, sym_beta2]
    f_train = theano.function(
        inputs, [loss_cc, loss_train_acc],
        updates=updates,
        givens={
            self.sym_x: self.sh_train_x[self.batch_slice],
            self.sym_t: self.sh_train_t[self.batch_slice],
        },
    )

    f_test = theano.function(
        [self.sym_index, self.sym_batchsize], [loss_eval, loss_acc],
        givens={
            self.sym_x: self.sh_test_x[self.batch_slice],
            self.sym_t: self.sh_test_t[self.batch_slice],
        },
    )

    f_validate = None
    if validation_set is not None:
        f_validate = theano.function(
            [self.sym_index, self.sym_batchsize], [loss_eval, loss_acc],
            givens={
                self.sym_x: self.sh_valid_x[self.batch_slice],
                self.sym_t: self.sh_valid_t[self.batch_slice],
            },
        )

    self.train_args['inputs']['batchsize'] = 128
    self.train_args['inputs']['learningrate'] = 1e-3
    self.train_args['inputs']['beta1'] = 0.9
    self.train_args['inputs']['beta2'] = 0.999
    self.train_args['outputs']['loss_cc'] = '%0.6f'
    self.train_args['outputs']['loss_train_acc'] = '%0.6f'

    self.test_args['inputs']['batchsize'] = 128
    self.test_args['outputs']['loss_eval'] = '%0.6f'
    self.test_args['outputs']['loss_acc'] = '%0.6f'

    self.validate_args['inputs']['batchsize'] = 128
    # self.validate_args['outputs']['loss_eval'] = '%0.6f'
    # self.validate_args['outputs']['loss_acc'] = '%0.6f'

    return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
def __init__(
        self,
        disc_window,
        disc_joints_dim,
        iteration,
        a_max=0.7,
        a_min=0.0,
        batch_size=64,
        iter_per_train=10,
        decent_portion=0.8,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=NL.tanh,
        disc_network=None,
):
    self.batch_size = 64
    self.iter_per_train = 10
    self.disc_window = disc_window
    self.disc_joints_dim = disc_joints_dim
    self.disc_dim = self.disc_window*self.disc_joints_dim
    self.end_iter = int(iteration*decent_portion)
    self.iter_count = 0
    out_dim = 1
    target_var = TT.ivector('targets')

    # create network
    if disc_network is None:
        disc_network = MLP(
            input_shape=(self.disc_dim,),
            output_dim=out_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
        )

    self._disc_network = disc_network

    disc_reward = disc_network.output_layer
    obs_var = disc_network.input_layer.input_var

    disc_var, = L.get_output([disc_reward])
    self._disc_var = disc_var

    LasagnePowered.__init__(self, [disc_reward])
    self._f_disc = ext.compile_function(
        inputs=[obs_var],
        outputs=[disc_var],
        log_name="f_discriminate_forward",
    )

    params = L.get_all_params(disc_network, trainable=True)
    loss = lasagne.objectives.categorical_crossentropy(disc_var, target_var).mean()
    updates = lasagne.updates.adam(loss, params, learning_rate=0.01)
    self._f_disc_train = ext.compile_function(
        inputs=[obs_var, target_var],
        outputs=[loss],
        updates=updates,
        log_name="f_discriminate_train"
    )

    self.data = self.load_data()
    self.a = np.linspace(a_min, a_max, self.end_iter)
def __init__(self, K, vocab_size, num_chars, W_init, regularizer, rlambda,
             nhidden, embed_dim, dropout, train_emb, subsample, char_dim, use_feat):
    self.nhidden = nhidden
    self.embed_dim = embed_dim
    self.dropout = dropout
    self.train_emb = train_emb
    self.subsample = subsample
    self.char_dim = char_dim
    self.learning_rate = LEARNING_RATE
    self.num_chars = num_chars
    self.use_feat = use_feat
    norm = lasagne.regularization.l2 if regularizer == 'l2' else lasagne.regularization.l1
    self.use_chars = self.char_dim != 0
    if W_init is None:
        W_init = lasagne.init.GlorotNormal().sample((vocab_size, self.embed_dim))

    doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3('quer'), \
        T.wtensor3('cand')
    docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
        T.bmatrix('c_mask')
    target_var = T.ivector('ans')
    feat_var = T.imatrix('feat')
    doc_toks, qry_toks = T.imatrix('dchars'), T.imatrix('qchars')
    tok_var, tok_mask = T.imatrix('tok'), T.bmatrix('tok_mask')
    cloze_var = T.ivector('cloze')
    self.inps = [doc_var, doc_toks, query_var, qry_toks, cand_var, target_var,
                 docmask_var, qmask_var, tok_var, tok_mask, candmask_var,
                 feat_var, cloze_var]

    if rlambda > 0.:
        W_pert = W_init + lasagne.init.GlorotNormal().sample(W_init.shape)
    else:
        W_pert = W_init
    self.predicted_probs, predicted_probs_val, self.doc_net, self.q_net, W_emb = (
        self.build_network(K, vocab_size, W_pert))

    self.loss_fn = T.nnet.categorical_crossentropy(self.predicted_probs, target_var).mean() + \
        rlambda*norm(W_emb-W_init)
    self.eval_fn = lasagne.objectives.categorical_accuracy(
        self.predicted_probs, target_var).mean()

    loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val, target_var).mean() + \
        rlambda*norm(W_emb-W_init)
    eval_fn_val = lasagne.objectives.categorical_accuracy(
        predicted_probs_val, target_var).mean()

    self.params = L.get_all_params([self.doc_net] + self.q_net, trainable=True)

    updates = lasagne.updates.adam(self.loss_fn, self.params,
                                   learning_rate=self.learning_rate)

    self.train_fn = theano.function(
        self.inps,
        [self.loss_fn, self.eval_fn, self.predicted_probs],
        updates=updates,
        on_unused_input='warn')
    self.validate_fn = theano.function(
        self.inps,
        [loss_fn_val, eval_fn_val, predicted_probs_val],
        on_unused_input='warn')
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(layers.Conv2DLayer(layer, 100, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 120, filter_size=(5, 5), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 240, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Conv2DLayer(layer, 320, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer, pool_size=(2, 2), stride=2, mode='average_inc_pad')
    layer = batch_norm(layers.Conv2DLayer(layer, 640, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    prely = batch_norm(layers.Conv2DLayer(layer, 1024, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))

    featm = batch_norm(layers.Conv2DLayer(prely, 640, filter_size=(1, 1), nonlinearity=leaky_rectify))
    feat_map = batch_norm(layers.Conv2DLayer(featm, 100, filter_size=(1, 1), nonlinearity=rectify, name="feat_map"))
    maskm = batch_norm(layers.Conv2DLayer(prely, 100, filter_size=(1, 1), nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm, 1, filter_size=(1, 1), nonlinearity=None), beta=None, gamma=None)
    mask_map = SoftThresPerc(mask_rep, perc=0.0, alpha=0.1, beta=init.Constant(0.5), tight=100.0, bias=-10, name="mask_map")
    enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(layers.Deconv2DLayer(enlyr, 1024, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 640, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 320, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 240, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 120, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = batch_norm(layers.Deconv2DLayer(layer, 100, filter_size=(5, 5), stride=1, crop='same', nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    glblf = batch_norm(layers.Conv2DLayer(prely, 128, filter_size=(1, 1), nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf, pool_size=(5, 5), stride=5, mode='average_inc_pad')
    glblf = batch_norm(layers.Conv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, pad='same', nonlinearity=leaky_rectify))
    gllyr = batch_norm(layers.Conv2DLayer(glblf, 5, filter_size=(1, 1), nonlinearity=rectify), name="global_feature")

    glblf = batch_norm(layers.Deconv2DLayer(gllyr, 256, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(9, 9), stride=5, crop=(2, 2), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 128, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 64, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(4, 4), stride=2, crop=(1, 1), nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = batch_norm(layers.Deconv2DLayer(glblf, 32, filter_size=(3, 3), stride=1, crop='same', nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf, 3, filter_size=(1, 1), stride=1, crop='same', nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])

    network = ReshapeLayer(layer, ([0], -1))
    layers.set_all_param_values(network, pickle.load(open(filename_model_ae, 'rb')))
    mask_map.beta.set_value(np.float32(-10.0 * mask_map.beta.get_value()))
    old_params = layers.get_all_params(network, trainable=True)

    # Adding more layers
    aug_var = T.matrix('aug_var')
    target_var = T.imatrix('targets')
    add_a = batch_norm(layers.Conv2DLayer(enlyr, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_b = batch_norm(layers.Conv2DLayer(add_a, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_c = batch_norm(layers.Conv2DLayer(add_b, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_d = batch_norm(layers.Conv2DLayer(add_c, 320, filter_size=(1, 1), nonlinearity=leaky_rectify))
    add_0 = layers.Pool2DLayer(add_d, pool_size=(15, 15), stride=15, mode='average_inc_pad')
    add_1 = batch_norm(layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify))

    add_2 = batch_norm(layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify))
    add_3 = batch_norm(layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify))
    add_4 = batch_norm(layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify))

    aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var)

    cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1)

    hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify)
    network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid)

    all_params = layers.get_all_params(network, trainable=True)
    new_params = [x for x in all_params if x not in old_params]

    return network, new_params, input_var, aug_var, target_var
def get_all_params(self):
    return L.get_all_params(self.network, trainable=True)
    return XX, XY, label, dx, dy


frame, targets = T.tensor4(), T.tensor4()

net = ll.InputLayer((None, 2, 100, 100), input_var=frame)
net = ll.Conv2DLayer(net, 32, (5, 5), b=None, pad='same')
net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
net = ll.Conv2DLayer(net, 8, (3, 3), b=None, pad='same', nonlinearity=l.nonlinearities.LeakyRectify(0.1))
net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
net = ll.DenseLayer(net, 625, b=None, nonlinearity=None)
net = ll.ReshapeLayer(net, ([0], 1, 25, 25))

predict = ll.get_output(net)
targets_pool = pool_2d(targets, ds=(4, 4), mode='average_inc_pad')
loss = T.mean((predict - targets_pool)**2)
params = ll.get_all_params(net, trainable=True)
updates = l.updates.adam(loss, params, 0.01)
train_f = theano.function([frame, targets], [loss, predict], updates=updates)

data = premnist()
errlist = []
for i in range(6000):
    x, y, move, label = mnist_data(data, (32, 1, 100, 100), noise=None, heatmap=True, down=1)
    xx, xy = fftprocess(x, y)
    err, result = train_f(np.concatenate((xx, xy), axis=1), label)
    errlist.append(err)
    if (i+1) % 10 == 0:
        print i+1, err

np.savez('toymodel.npz', *ll.get_all_param_values(net))
target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network_output, cond_layer_list = deep_projection_cond_ln_model( input_var=input_data, mask_var=input_mask, num_inputs=input_dim, num_outputs=output_dim, num_layers=args.num_layers, num_conds=args.num_conds, num_factors=args.num_factors, num_units=args.num_units, grad_clipping=args.grad_clipping, dropout=args.dropout) network = network_output network_params = get_all_params(network, trainable=True) param_count = count_params(network, trainable=True) print('Number of parameters of the network: {:.2f}M'.format( float(param_count) / 1000000)) ###################### # reload model param # ###################### if args.reload_model: print('Loading model: {}'.format(args.reload_model)) with open(args.reload_model, 'rb') as f: [ pretrain_network_params_val, pretrain_update_params_val, pretrain_total_epoch_cnt ] = pickle.load(f) set_model_param_value(network_params, pretrain_network_params_val)
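# set_model_param_value is a project-specific helper that is not shown here;
# one plausible minimal implementation, assuming the pickled list is ordered
# exactly like get_all_params(network, trainable=True), would be:
def set_model_param_value(params, param_values):
    """Copy a list of numpy arrays into the matching Theano shared variables."""
    if len(params) != len(param_values):
        raise ValueError('expected %d parameter arrays, got %d'
                         % (len(params), len(param_values)))
    for p, v in zip(params, param_values):
        if tuple(p.get_value().shape) != tuple(v.shape):
            raise ValueError('shape mismatch for %s: %s vs %s'
                             % (p.name, p.get_value().shape, v.shape))
        p.set_value(v)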
def __init__(self, n_in, n_filters, filter_sizes, n_out, pool_sizes=None, n_hidden=(512), ccf=False, trans_func=rectify, out_func=softmax, dense_dropout=0.0, stats=2, input_noise=0.0, batch_norm=False, conv_dropout=0.0): super(CNN, self).__init__(n_in, n_hidden, n_out, trans_func) self.outf = out_func self.log = "" # Define model using lasagne framework dropout = True if not dense_dropout == 0.0 else False # Overwrite input layer sequence_length, n_features = n_in self.l_in = InputLayer(shape=(None, sequence_length, n_features)) l_prev = self.l_in # Separate into raw values and statistics sequence_length -= stats stats_layer = SliceLayer(l_prev, indices=slice(sequence_length, None), axis=1) stats_layer = ReshapeLayer(stats_layer, (-1, stats * n_features)) print('Stats layer shape', stats_layer.output_shape) l_prev = SliceLayer(l_prev, indices=slice(0, sequence_length), axis=1) print('Conv input layer shape', l_prev.output_shape) # Apply input noise l_prev = GaussianNoiseLayer(l_prev, sigma=input_noise) if ccf: self.log += "\nAdding cross-channel feature layer" l_prev = ReshapeLayer(l_prev, (-1, 1, sequence_length, n_features)) l_prev = Conv2DLayer(l_prev, num_filters=4 * n_features, filter_size=(1, n_features), nonlinearity=None) n_features *= 4 if batch_norm: l_prev = batch_norm_layer(l_prev) l_prev = ReshapeLayer(l_prev, (-1, n_features, sequence_length)) l_prev = DimshuffleLayer(l_prev, (0, 2, 1)) # 2D Convolutional layers l_prev = ReshapeLayer(l_prev, (-1, 1, sequence_length, n_features)) l_prev = DimshuffleLayer(l_prev, (0, 3, 2, 1)) # Add the convolutional filters for n_filter, filter_size, pool_size in zip(n_filters, filter_sizes, pool_sizes): self.log += "\nAdding 2D conv layer: %d x %d" % (n_filter, filter_size) l_prev = Conv2DLayer(l_prev, num_filters=n_filter, filter_size=(filter_size, 1), nonlinearity=self.transf, pad=filter_size // 2) if batch_norm: l_prev = batch_norm_layer(l_prev) if pool_size > 1: self.log += "\nAdding max pooling layer: %d" % pool_size l_prev = Pool2DLayer(l_prev, pool_size=(pool_size, 1)) self.log += "\nAdding dropout layer: %.2f" % conv_dropout l_prev = TiedDropoutLayer(l_prev, p=conv_dropout) print("Conv out shape", get_output_shape(l_prev)) # Global pooling layer l_prev = GlobalPoolLayer(l_prev, pool_function=T.mean, name='Global Mean Pool') print("GlobalPoolLayer out shape", get_output_shape(l_prev)) # Concatenate stats l_prev = ConcatLayer((l_prev, stats_layer), axis=1) for n_hid in n_hidden: self.log += "\nAdding dense layer with %d units" % n_hid print("Dense input shape", get_output_shape(l_prev)) l_prev = DenseLayer(l_prev, n_hid, init.GlorotNormal(), init.Normal(1e-3), self.transf) if batch_norm: l_prev = batch_norm_layer(l_prev) if dropout: self.log += "\nAdding dense dropout with probability: %.2f" % dense_dropout l_prev = DropoutLayer(l_prev, p=dense_dropout) if batch_norm: self.log += "\nUsing batch normalization" self.model = DenseLayer(l_prev, num_units=n_out, nonlinearity=out_func) self.model_params = get_all_params(self.model) self.sym_x = T.tensor3('x') self.sym_t = T.matrix('t')
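# The constructor above splits the trailing `stats` rows of each input sequence
# off with SliceLayer, processes the raw signal separately, and concatenates the
# statistics back in before the dense layers. A small self-contained sketch of
# that split-and-merge pattern (toy shapes, not the actual model):
import numpy as np
import theano
import theano.tensor as T
from lasagne.layers import (InputLayer, SliceLayer, ReshapeLayer, DimshuffleLayer,
                            GlobalPoolLayer, ConcatLayer, get_output)

seq_len, n_feat, stats = 50, 6, 2
x = T.tensor3('x')
l_in = InputLayer((None, seq_len + stats, n_feat), input_var=x)
# trailing `stats` rows hold summary statistics, the rest is the raw signal
l_stats = ReshapeLayer(SliceLayer(l_in, indices=slice(seq_len, None), axis=1),
                       (-1, stats * n_feat))
l_raw = SliceLayer(l_in, indices=slice(0, seq_len), axis=1)
# pool the raw part over time and glue the statistics back on
l_pooled = GlobalPoolLayer(DimshuffleLayer(l_raw, (0, 2, 1)), pool_function=T.mean)
l_merged = ConcatLayer((l_pooled, l_stats), axis=1)

f = theano.function([x], get_output(l_merged))
out = f(np.zeros((4, seq_len + stats, n_feat), dtype=theano.config.floatX))
print(out.shape)  # (4, n_feat + stats * n_feat)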
def multi_task_classifier(args, input_var, target_var, wordEmbeddings, seqlen, num_feats, lambda_val=0.5 * 1e-4): print("Building multi task model with 1D Convolution") vocab_size = wordEmbeddings.shape[1] wordDim = wordEmbeddings.shape[0] kw = 2 num_filters = seqlen - kw + 1 stride = 1 filter_size = wordDim pool_size = num_filters input = InputLayer((None, seqlen, num_feats), input_var=input_var) batchsize, _, _ = input.input_var.shape #span emb1 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape1 = ReshapeLayer(emb1, (batchsize, seqlen, num_feats * wordDim)) conv1d_1 = DimshuffleLayer( Conv1DLayer(reshape1, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size) hid_1 = DenseLayer(maxpool_1, num_units=args.hiddenDim, nonlinearity=sigmoid) network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax) """ #DocTimeRel emb2 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape2 = ReshapeLayer(emb2, (batchsize, seqlen, num_feats*wordDim)) conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape2, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size) hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid) network_2 = DenseLayer(hid_2, num_units=5, nonlinearity=softmax) """ #Type emb3 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape3 = ReshapeLayer(emb3, (batchsize, seqlen, num_feats * wordDim)) conv1d_3 = DimshuffleLayer( Conv1DLayer(reshape3, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size) hid_3 = DenseLayer(maxpool_3, num_units=args.hiddenDim, nonlinearity=sigmoid) network_3 = DenseLayer(hid_3, num_units=4, nonlinearity=softmax) #Degree emb4 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape4 = ReshapeLayer(emb4, (batchsize, seqlen, num_feats * wordDim)) conv1d_4 = DimshuffleLayer( Conv1DLayer(reshape4, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size) hid_4 = DenseLayer(maxpool_4, num_units=args.hiddenDim, nonlinearity=sigmoid) network_4 = DenseLayer(hid_4, num_units=4, nonlinearity=softmax) #Polarity emb5 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape5 = ReshapeLayer(emb5, (batchsize, seqlen, num_feats * wordDim)) conv1d_5 = DimshuffleLayer( Conv1DLayer(reshape5, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size) hid_5 = DenseLayer(maxpool_5, num_units=args.hiddenDim, nonlinearity=sigmoid) network_5 = DenseLayer(hid_5, num_units=3, nonlinearity=softmax) #ContextualModality emb6 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape6 = ReshapeLayer(emb6, (batchsize, seqlen, num_feats * wordDim)) conv1d_6 = DimshuffleLayer( Conv1DLayer(reshape6, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh, W=GlorotUniform()), (0, 2, 1)) maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size) hid_6 = DenseLayer(maxpool_6, num_units=args.hiddenDim, 
nonlinearity=sigmoid) network_6 = DenseLayer(hid_6, num_units=5, nonlinearity=softmax) """ #ContextualAspect emb7 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape7 = ReshapeLayer(emb7, (batchsize, seqlen, num_feats*wordDim)) conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape7, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size) hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid) network_7 = DenseLayer(hid_7, num_units=4, nonlinearity=softmax) """ """ #Permanence emb8 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T) reshape8 = ReshapeLayer(emb8, (batchsize, seqlen, num_feats*wordDim)) conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape8, num_filters=num_filters, filter_size=wordDim, stride=1, nonlinearity=tanh,W=GlorotUniform()), (0,2,1)) maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size) hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid) network_8 = DenseLayer(hid_8, num_units=4, nonlinearity=softmax) """ # Is this important? """ network_1_out, network_2_out, network_3_out, network_4_out, \ network_5_out, network_6_out, network_7_out, network_8_out = \ get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8]) """ network_1_out = get_output(network_1) network_3_out = get_output(network_3) network_4_out = get_output(network_4) network_5_out = get_output(network_5) network_6_out = get_output(network_6) loss_1 = T.mean(binary_crossentropy( network_1_out, target_var)) + regularize_layer_params_weighted( { emb1: lambda_val, conv1d_1: lambda_val, hid_1: lambda_val, network_1: lambda_val }, l2) updates_1 = adagrad(loss_1, get_all_params(network_1, trainable=True), learning_rate=args.step) train_fn_1 = theano.function([input_var, target_var], loss_1, updates=updates_1, allow_input_downcast=True) val_acc_1 = T.mean( binary_accuracy(get_output(network_1, deterministic=True), target_var)) val_fn_1 = theano.function([input_var, target_var], val_acc_1, allow_input_downcast=True) """ loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb2:lambda_val, conv1d_2:lambda_val, hid_2:lambda_val, network_2:lambda_val} , l2) updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step) train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True) val_acc_2 = T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var)) val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True) """ loss_3 = T.mean(categorical_crossentropy( network_3_out, target_var)) + regularize_layer_params_weighted( { emb3: lambda_val, conv1d_3: lambda_val, hid_3: lambda_val, network_3: lambda_val }, l2) updates_3 = adagrad(loss_3, get_all_params(network_3, trainable=True), learning_rate=args.step) train_fn_3 = theano.function([input_var, target_var], loss_3, updates=updates_3, allow_input_downcast=True) val_acc_3 = T.mean( categorical_accuracy(get_output(network_3, deterministic=True), target_var)) val_fn_3 = theano.function([input_var, target_var], val_acc_3, allow_input_downcast=True) loss_4 = T.mean(categorical_crossentropy( network_4_out, target_var)) + regularize_layer_params_weighted( { emb4: lambda_val, conv1d_4: lambda_val, hid_4: lambda_val, network_4: lambda_val }, 
l2) updates_4 = adagrad(loss_4, get_all_params(network_4, trainable=True), learning_rate=args.step) train_fn_4 = theano.function([input_var, target_var], loss_4, updates=updates_4, allow_input_downcast=True) val_acc_4 = T.mean( categorical_accuracy(get_output(network_4, deterministic=True), target_var)) val_fn_4 = theano.function([input_var, target_var], val_acc_4, allow_input_downcast=True) loss_5 = T.mean(categorical_crossentropy( network_5_out, target_var)) + regularize_layer_params_weighted( { emb5: lambda_val, conv1d_5: lambda_val, hid_5: lambda_val, network_5: lambda_val }, l2) updates_5 = adagrad(loss_5, get_all_params(network_5, trainable=True), learning_rate=args.step) train_fn_5 = theano.function([input_var, target_var], loss_5, updates=updates_5, allow_input_downcast=True) val_acc_5 = T.mean( categorical_accuracy(get_output(network_5, deterministic=True), target_var)) val_fn_5 = theano.function([input_var, target_var], val_acc_5, allow_input_downcast=True) loss_6 = T.mean(categorical_crossentropy( network_6_out, target_var)) + regularize_layer_params_weighted( { emb6: lambda_val, conv1d_6: lambda_val, hid_6: lambda_val, network_6: lambda_val }, l2) updates_6 = adagrad(loss_6, get_all_params(network_6, trainable=True), learning_rate=args.step) train_fn_6 = theano.function([input_var, target_var], loss_6, updates=updates_6, allow_input_downcast=True) val_acc_6 = T.mean( categorical_accuracy(get_output(network_6, deterministic=True), target_var)) val_fn_6 = theano.function([input_var, target_var], val_acc_6, allow_input_downcast=True) """ loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb7:lambda_val, conv1d_7:lambda_val, hid_7:lambda_val, network_7:lambda_val} , l2) updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step) train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True) val_acc_7 = T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var)) val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True) loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb8:lambda_val, conv1d_8:lambda_val, hid_8:lambda_val, network_8:lambda_val} , l2) updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step) train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True) val_acc_8 = T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var)) val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True) """ """ return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8 """ return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \ network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \ train_fn_6, val_fn_6, network_6
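# Each task head above adds a per-layer weighted L2 penalty through
# regularize_layer_params_weighted. A minimal self-contained sketch of that
# regularization pattern (toy two-layer network, illustrative sizes):
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params
from lasagne.regularization import regularize_layer_params_weighted, l2

lambda_val = 0.5 * 1e-4
x_var, y_var = T.matrix('x'), T.ivector('y')
l_in = InputLayer((None, 100), input_var=x_var)
l_hid = DenseLayer(l_in, 50, nonlinearity=lasagne.nonlinearities.sigmoid)
l_out = DenseLayer(l_hid, 4, nonlinearity=lasagne.nonlinearities.softmax)

loss = lasagne.objectives.categorical_crossentropy(get_output(l_out), y_var).mean()
# the dict maps each layer to its own L2 coefficient, as in the per-task losses above
loss += regularize_layer_params_weighted({l_hid: lambda_val, l_out: lambda_val}, l2)
updates = lasagne.updates.adagrad(loss, get_all_params(l_out, trainable=True),
                                  learning_rate=0.01)
train_fn = theano.function([x_var, y_var], loss, updates=updates,
                           allow_input_downcast=True)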
def get_params(self, values=True): if values: return get_all_param_values(self.net['conv5_1']) return get_all_params(self.net['conv5_1'], trainable=True)
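# The two accessors used above differ in what they return: get_all_params gives
# the Theano shared variables themselves (what an optimizer needs), while
# get_all_param_values gives plain numpy copies (what you persist to disk).
# A short self-contained sketch with a stand-in network:
import numpy as np
from lasagne.layers import (InputLayer, DenseLayer, get_all_params,
                            get_all_param_values, set_all_param_values)

output_layer = DenseLayer(InputLayer((None, 10)), 5)        # stand-in network
shared_vars = get_all_params(output_layer, trainable=True)  # shared variables
snapshot = get_all_param_values(output_layer)               # numpy copies
np.savez('snapshot.npz', *snapshot)
set_all_param_values(output_layer, snapshot)                # restore later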
axis=1) z = nn.log_sum_exp(z, axis=1) return n_plus, n_minus, z if l_type == 'L2': n_plus = T.sum((a_lab - b_lab)**2, axis=1) n_minus = T.sum((a_lab - c_lab)**2, axis=1) dist = n_plus - n_minus + 10.0 loss_lab = T.mean(dist * T.gt(dist, 0.0)) else: n_plus_lab, n_minus_lab, z_lab = loss_labeled(a_lab, b_lab, c_lab) loss_lab = -T.mean(n_minus_lab) + T.mean(z_lab) lr = T.scalar() disc_params = LL.get_all_params(layers, trainable=True) disc_param_updates = nn.adam_updates(disc_params, loss_lab, lr=lr, mom1=0.5) disc_param_avg = [ th.shared(np.cast[th.config.floatX](0. * p.get_value())) for p in disc_params ] disc_avg_updates = [(a, a + 0.0001 * (p - a)) for p, a in zip(disc_params, disc_param_avg)] disc_avg_givens = [(p, a) for p, a in zip(disc_params, disc_param_avg)] train_batch_disc = th.function(inputs=[x_lab, lr], outputs=loss_lab, updates=disc_param_updates + disc_avg_updates) nr_batches_train = int(trainx.shape[0] / batch_size)
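# The discriminator above keeps an exponential moving average of every
# parameter (disc_param_avg) alongside the Adam updates; the averaged weights
# are the ones typically swapped in at evaluation time via `givens`. A small
# self-contained sketch of that pattern with a toy network and placeholder loss:
import numpy as np
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params

x = T.matrix('x')
net = DenseLayer(InputLayer((None, 20), input_var=x), 10)
params = get_all_params(net, trainable=True)
loss = T.mean(get_output(net) ** 2)  # placeholder loss
adam_updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)

# running average of every parameter, updated online after each step
param_avg = [theano.shared(np.cast[theano.config.floatX](0. * p.get_value()))
             for p in params]
avg_updates = [(a, a + 0.0001 * (p - a)) for p, a in zip(params, param_avg)]
avg_givens = [(p, a) for p, a in zip(params, param_avg)]

train_fn = theano.function([x], loss,
                           updates=list(adam_updates.items()) + avg_updates)
# at test time the averaged weights are substituted through `givens`
test_fn = theano.function([x], get_output(net, deterministic=True),
                          givens=avg_givens)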
def main(): setup_train_experiment(logger, FLAGS, "%(model)s_at") logger.info("Loading data...") data = mnist_load(FLAGS.train_size, FLAGS.seed) X_train, y_train = data.X_train, data.y_train X_val, y_val = data.X_val, data.y_val X_test, y_test = data.X_test, data.y_test img_shape = [None, 1, 28, 28] train_images = T.tensor4('train_images') train_labels = T.lvector('train_labels') val_images = T.tensor4('valid_labels') val_labels = T.lvector('valid_labels') layer_dims = [int(dim) for dim in FLAGS.layer_dims.split("-")] num_classes = layer_dims[-1] net = create_network(FLAGS.model, img_shape, layer_dims=layer_dims) model = with_end_points(net) train_outputs = model(train_images) val_outputs = model(val_images, deterministic=True) # losses train_ce = categorical_crossentropy(train_outputs['prob'], train_labels).mean() train_at = adversarial_training(lambda x: model(x)['prob'], train_images, train_labels, epsilon=FLAGS.epsilon).mean() train_loss = train_ce + FLAGS.lmbd * train_at val_ce = categorical_crossentropy(val_outputs['prob'], val_labels).mean() val_deepfool_images = deepfool( lambda x: model(x, deterministic=True)['logits'], val_images, val_labels, num_classes, max_iter=FLAGS.deepfool_iter, clip_dist=FLAGS.deepfool_clip, over_shoot=FLAGS.deepfool_overshoot) # metrics train_acc = categorical_accuracy(train_outputs['logits'], train_labels).mean() train_err = 1.0 - train_acc val_acc = categorical_accuracy(val_outputs['logits'], val_labels).mean() val_err = 1.0 - val_acc # deepfool robustness reduc_ind = range(1, train_images.ndim) l2_deepfool = (val_deepfool_images - val_images).norm(2, axis=reduc_ind) l2_deepfool_norm = l2_deepfool / val_images.norm(2, axis=reduc_ind) train_metrics = OrderedDict([('loss', train_loss), ('nll', train_ce), ('at', train_at), ('err', train_err)]) val_metrics = OrderedDict([('nll', val_ce), ('err', val_err)]) summary_metrics = OrderedDict([('l2', l2_deepfool.mean()), ('l2_norm', l2_deepfool_norm.mean())]) lr = theano.shared(floatX(FLAGS.initial_learning_rate), 'learning_rate') train_params = get_all_params(net, trainable=True) train_updates = adam(train_loss, train_params, lr) logger.info("Compiling theano functions...") train_fn = theano.function([train_images, train_labels], outputs=train_metrics.values(), updates=train_updates) val_fn = theano.function([val_images, val_labels], outputs=val_metrics.values()) summary_fn = theano.function([val_images, val_labels], outputs=summary_metrics.values() + [val_deepfool_images]) logger.info("Starting training...") try: samples_per_class = FLAGS.summary_samples_per_class summary_images, summary_labels = select_balanced_subset( X_val, y_val, num_classes, samples_per_class) save_path = os.path.join(FLAGS.samples_dir, 'orig.png') save_images(summary_images, save_path) epoch = 0 batch_index = 0 while epoch < FLAGS.num_epochs: epoch += 1 start_time = time.time() train_iterator = batch_iterator(X_train, y_train, FLAGS.batch_size, shuffle=True) epoch_outputs = np.zeros(len(train_fn.outputs)) for batch_index, (images, labels) in enumerate(train_iterator, batch_index + 1): batch_outputs = train_fn(images, labels) epoch_outputs += batch_outputs epoch_outputs /= X_train.shape[0] // FLAGS.batch_size logger.info( build_result_str( "Train epoch [{}, {:.2f}s]:".format( epoch, time.time() - start_time), train_metrics.keys(), epoch_outputs)) # update learning rate if epoch > FLAGS.start_learning_rate_decay: new_lr_value = lr.get_value( ) * FLAGS.learning_rate_decay_factor lr.set_value(floatX(new_lr_value)) logger.debug("learning rate 
was changed to {:.10f}".format( new_lr_value)) # validation start_time = time.time() val_iterator = batch_iterator(X_val, y_val, FLAGS.test_batch_size, shuffle=False) val_epoch_outputs = np.zeros(len(val_fn.outputs)) for images, labels in val_iterator: val_epoch_outputs += val_fn(images, labels) val_epoch_outputs /= X_val.shape[0] // FLAGS.test_batch_size logger.info( build_result_str( "Test epoch [{}, {:.2f}s]:".format( epoch, time.time() - start_time), val_metrics.keys(), val_epoch_outputs)) if epoch % FLAGS.summary_frequency == 0: summary = summary_fn(summary_images, summary_labels) logger.info( build_result_str( "Epoch [{}] adversarial statistics:".format(epoch), summary_metrics.keys(), summary[:-1])) save_path = os.path.join(FLAGS.samples_dir, 'epoch-%d.png' % epoch) df_images = summary[-1] save_images(df_images, save_path) if epoch % FLAGS.checkpoint_frequency == 0: save_network(net, epoch=epoch) except KeyboardInterrupt: logger.debug("Keyboard interrupt. Stopping training...") finally: save_network(net) # evaluate final model on test set test_iterator = batch_iterator(X_test, y_test, FLAGS.test_batch_size, shuffle=False) test_results = np.zeros(len(val_fn.outputs)) for images, labels in test_iterator: test_results += val_fn(images, labels) test_results /= X_test.shape[0] // FLAGS.test_batch_size logger.info( build_result_str("Final test results:", val_metrics.keys(), test_results))
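# batch_iterator is a project-specific helper that is not shown here; a
# plausible minimal stand-in consistent with how it is called above (yields
# shuffled full minibatches and drops the final partial batch) is:
import numpy as np

def batch_iterator(X, y, batch_size, shuffle=False):
    """Yield (X_batch, y_batch) minibatches of exactly `batch_size` samples."""
    idx = np.arange(len(X))
    if shuffle:
        np.random.shuffle(idx)
    for start in range(0, len(X) - batch_size + 1, batch_size):
        sel = idx[start:start + batch_size]
        yield X[sel], y[sel]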
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, l2, mode, rnn_num_units, batch_norm, **kwargs): print "==> not used params in DMN class:", kwargs.keys() self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.l2 = l2 self.mode = mode self.num_units = rnn_num_units self.batch_norm = batch_norm self.input_var = T.tensor3('input_var') self.answer_var = T.ivector('answer_var') # scale inputs to be in [-1, 1] input_var_norm = 2 * self.input_var - 1 print "==> building network" example = np.random.uniform(size=(self.batch_size, 858, 256), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size, )) ######### # InputLayer network = layers.InputLayer(shape=(None, 858, 256), input_var=input_var_norm) print layers.get_output(network).eval({self.input_var: example}).shape # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units) print layers.get_output(network).eval({self.input_var: example}).shape # BatchNormalization Layer if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) print layers.get_output(network).eval({ self.input_var: example }).shape # GRULayer network = layers.GRULayer(incoming=network, num_units=self.num_units, only_return_final=True) print layers.get_output(network).eval({self.input_var: example}).shape # Last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) print layers.get_output(network).eval({self.input_var: example}).shape self.params = layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) self.loss_ce = lasagne.objectives.categorical_crossentropy( self.prediction, self.answer_var).mean() if (self.l2 > 0): self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params( network, lasagne.regularization.l2) else: self.loss_l2 = 0 self.loss = self.loss_ce + self.loss_l2 #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) if self.mode == 'train': print "==> compiling train_fn" self.train_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print "==> compiling test_fn" self.test_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
def __init__(self, retina_model, seeder_model, n_seeds, n_steps, n_units=100, normalization_coefs=None, loss_coefs=None, alpha=1.0, threshold=1.0): self.seeder_model = seeder_model self.n_seeds = n_seeds self.n_steps = n_steps self.threshold = threshold self.retina = retina_model event_shareds = retina_model.get_event_variables() self.seeder = self.seeder_model(retina_model) if normalization_coefs is None: normalization_coefs = np.ones(shape=retina_model.model_nparams, dtype='float32') else: normalization_coefs = np.array(normalization_coefs, dtype='float32') ### params + sigma self.inputs = retina_model.alloc_model_params() self.input_layer, self.out_layer, self.reg = self.build_nn( retina_model.model_nparams, n_units=n_units) print 'Linking to Retina Model' iterations = [self.inputs] responses = [] for i in xrange(self.n_steps): print 'Iteration %d' % i prev = iterations[i] r, grads = retina_model.grad_for(*event_shareds + prev) normed_params = [p * c for p, c in zip(prev, normalization_coefs)] normed_grads = [g * c for g, c in zip(grads, normalization_coefs)] out = self.get_update_for(normed_params, r, normed_grads) param_updates = [out[:, i] for i in range(len(self.inputs))] track_param_updates, sigma_update = param_updates[: -1], param_updates[ -1] ### sigma (last parameter) is updated simply by replacing ### previous variable update = [ var + upd * alpha for var, upd in zip(prev[:-1], track_param_updates) ] + [T.exp(-sigma_update)] for var, upd, new in zip(prev[:-1], track_param_updates, update): print ' -', new, '=', var, '+ %.2e' % alpha, upd iterations.append(update) responses.append(r) prediction = iterations[-1] sigma_train = T.fscalar('sigma_train') ### Except sigma self.true_parameters_shareds = [ theano.shared(np.ndarray(shape=(0, ), dtype='float32'), name=name) for name in retina_model.model_params_names[:-1] ] ### predictions without sigma print 'Constucting loss:' print ' - Loss coefs:', loss_coefs print ' - True params shared:', self.true_parameters_shareds print ' - Predictions:', prediction[:-1] print ' - Sigma:', sigma_train pure_response, rmse = retina_model.parameter_response( loss_coefs, *self.true_parameters_shareds + prediction[:-1] + [sigma_train]) pure_loss = 1.0 - pure_response initial_response, initial_rmse = retina_model.parameter_response( loss_coefs, *self.true_parameters_shareds + self.inputs[:-1] + [sigma_train]) initial_loss = 1.0 - initial_response reg_c = T.fscalar('reg_c') alpha_rmse = T.fscalar('reg_c') loss = (1.0 - alpha_rmse) * pure_loss + alpha_rmse * rmse + reg_c * self.reg params = layers.get_all_params(self.out_layer) learning_rate = T.fscalar('learning rate') net_updates = updates.adadelta(loss, params, learning_rate=learning_rate) self._train = theano.function( self.inputs + [sigma_train, learning_rate, reg_c, alpha_rmse], [pure_loss, rmse, self.reg, loss, initial_loss, initial_rmse], updates=net_updates) self._loss = theano.function(self.inputs + [sigma_train], pure_loss) outputs = [v for it in iterations for v in it] self.ndim = len(self.inputs) self.predictions = theano.function(self.inputs, responses + outputs) self.responses = None self.traces = None self.seeds = None
def buildModel(self): print(' -- Building...') x_init = sparse.csr_matrix('x', dtype='float32') y_init = T.imatrix('y') gx_init = sparse.csr_matrix('gx', dtype='float32') gy_init = T.ivector('gy') gz_init = T.vector('gz') mask_init = T.fmatrix('subMask') # step train x_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=x_init) x_to_label = layers.SparseLayer(x_input, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) x_to_emd = layers.SparseLayer(x_input, self.embedding_size) W = x_to_emd.W x_to_emd = layers.DenseLayer(x_to_emd, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) x_concat = lgl.ConcatLayer([x_to_label, x_to_emd], axis=1) x_concat = layers.DenseLayer(x_concat, self.y.shape[1], nonlinearity=lg.nonlinearities.softmax) pred = lgl.get_output(x_concat) step_loss = lgo.categorical_crossentropy(pred, y_init).mean() hid_loss = lgl.get_output(x_to_label) step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean() emd_loss = lgl.get_output(x_to_emd) step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean() step_params = lgl.get_all_params(x_concat) step_updates = lg.updates.sgd(step_loss, step_params, learning_rate=self.step_learning_rate) self.step_train = theano.function([x_init, y_init], step_loss, updates=step_updates) self.test_fn = theano.function([x_init], pred) # supervised train gx_input = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=gx_init) gx_to_emd = layers.SparseLayer(gx_input, self.embedding_size, W=W) gx_to_emd = lgl.DenseLayer(gx_to_emd, self.num_ver, nonlinearity=lg.nonlinearities.softmax) gx_pred = lgl.get_output(gx_to_emd) g_loss = lgo.categorical_crossentropy(gx_pred, gy_init).sum() sup_params = lgl.get_all_params(gx_to_emd) sup_updates = lg.updates.sgd(g_loss, sup_params, learning_rate=self.sup_learning_rate) self.sup_train = theano.function([gx_init, gy_init, gz_init], g_loss, updates=sup_updates, on_unused_input='ignore') # handle lstm input cross_entropy = lgo.categorical_crossentropy(gx_pred, gy_init) cross_entropy = T.reshape(cross_entropy, (1, self.subpath_num), ndim=None) mask_input = lgl.InputLayer(shape=(None, self.window_size + 1), input_var=mask_init) sub_path_batch1 = sparse.csr_matrix('x', dtype='float32') sub_path_input1 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch1) sub_path_batch2 = sparse.csr_matrix('x', dtype='float32') sub_path_input2 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch2) sub_path_batch3 = sparse.csr_matrix('x', dtype='float32') sub_path_input3 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch3) sub_path_batch4 = sparse.csr_matrix('x', dtype='float32') sub_path_input4 = lgl.InputLayer(shape=(None, self.x.shape[1]), input_var=sub_path_batch4) sub_path_emd1 = layers.SparseLayer(sub_path_input1, self.embedding_size, W=W) sub_path_emd1 = T.reshape(lgl.get_output(sub_path_emd1), (self.subpath_num, 1, self.embedding_size)) sub_path_emd2 = layers.SparseLayer(sub_path_input2, self.embedding_size, W=W) sub_path_emd2 = T.reshape(lgl.get_output(sub_path_emd2), (self.subpath_num, 1, self.embedding_size)) sub_path_emd3 = layers.SparseLayer(sub_path_input3, self.embedding_size, W=W) sub_path_emd3 = T.reshape(lgl.get_output(sub_path_emd3), (self.subpath_num, 1, self.embedding_size)) sub_path_emd4 = layers.SparseLayer(sub_path_input4, self.embedding_size, W=W) sub_path_emd4 = T.reshape(lgl.get_output(sub_path_emd4), (self.subpath_num, 1, self.embedding_size)) sub_path_concat = T.concatenate([sub_path_emd1, sub_path_emd2, 
sub_path_emd3, sub_path_emd4], axis=1) sub_path_concat_layer = lgl.InputLayer(shape=(None, self.window_size + 1, self.embedding_size), input_var=sub_path_concat) # lstm layer lstm_layer = lgl.LSTMLayer(sub_path_concat_layer, self.lstm_hidden_units, grad_clipping=3, mask_input=mask_input) # handle path weight max1 = T.mean(lgl.get_output(lstm_layer), axis=1) max2 = T.mean(max1, axis=1) max2_init = T.fcol('max2') max2_init = T.reshape(max2, ((self.subpath_num, 1))) max2_input = lgl.InputLayer(shape=(self.subpath_num, 1), input_var=max2_init) max2_input = lgl.BatchNormLayer(max2_input) path_weight = lgl.get_output(max2_input) path_weight = lg.nonlinearities.sigmoid(path_weight) path_weight = 1 + 0.3 * path_weight # unsupervised train reweight_loss = T.dot(cross_entropy, path_weight)[0][0] lstm_params = lgl.get_all_params(lstm_layer, trainable=True) lstm_updates = lg.updates.sgd(reweight_loss, lstm_params, learning_rate=0.01) self.lstm_fn = theano.function([gx_init, gy_init, gz_init, sub_path_batch1, sub_path_batch2, sub_path_batch3, sub_path_batch4, mask_init], reweight_loss, updates=lstm_updates, on_unused_input='ignore') alpha_updates = lg.updates.sgd(reweight_loss, sup_params, learning_rate=0.001) self.alpha_fn = theano.function([gx_init, gy_init, gz_init, sub_path_batch1, sub_path_batch2, sub_path_batch3, sub_path_batch4, mask_init], reweight_loss, updates=alpha_updates, on_unused_input='ignore') print(' -- Done!')
l_output = DenseLayer(l_hidden, num_units=len(classes), nonlinearity=softmax, W=Constant()) # Now, we can generate the symbolic expression of the network's output given an input variable. net_input = T.matrix('net_input') net_output = get_output(l_output, net_input) # As a loss function, we'll use Theano's categorical_crossentropy function. # This allows for the network output to be class probabilities, # but the target output to be class labels. true_output = T.ivector('true_output') loss = T.mean(T.nnet.categorical_crossentropy(net_output, true_output)) # Retrieving all parameters of the network is done using get_all_params, # which recursively collects the parameters of all layers connected to the provided layer. all_params = get_all_params(l_output) # Now, we'll generate updates using Lasagne's SGD function updates = sgd(loss, all_params, learning_rate=0.01) # Finally, we can compile Theano functions for training and computing the output. training = function([net_input, true_output], loss, updates=updates) prediction = function([net_input], net_output) # Train for 100 epochs print 'epoch logloss' for k, n in enumerate(xrange(100)): # this is logloss res = training(trainT, classT) print '{0:3d} {1:.4f}'.format(k, res) # Compute the predicted label of the training data. # The argmax converts the class probability output to class label
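# The snippet above breaks off mid-comment; as a hedged completion that reuses
# net_input, net_output, trainT and classT exactly as they appear above, the
# predicted labels come from an argmax over the class probabilities:
import numpy as np
import theano.tensor as T
from theano import function

predicted_class = T.argmax(net_output, axis=1)        # probability -> label
predict_fn = function([net_input], predicted_class)
train_accuracy = np.mean(predict_fn(trainT) == classT)
print 'training accuracy: {0:.4f}'.format(train_accuracy)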
disc_layers = [ll.InputLayer(shape=(None, 3, 32, 32))] disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.2)) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3,3), pad=1, stride=2, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.5)) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=1, stride=2, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.5)) disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=0, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(nn.weight_norm(ll.NINLayer(disc_layers[-1], num_units=192, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(nn.weight_norm(ll.NINLayer(disc_layers[-1], num_units=192, W=Normal(0.05), nonlinearity=nn.lrelu))) disc_layers.append(ll.GlobalPoolLayer(disc_layers[-1])) disc_layers.append(nn.weight_norm(ll.DenseLayer(disc_layers[-1], num_units=16, W=Normal(0.05), nonlinearity=None), train_g=True, init_stdv=0.1)) disc_params = ll.get_all_params(disc_layers, trainable=True) x_temp = T.tensor4() temp = ll.get_output(gen_layers[-1], deterministic=False, init=True) temp = ll.get_output(disc_layers[-1], x_temp, deterministic=False, init=True) init_updates = [u for l in gen_layers+disc_layers for u in getattr(l,'init_updates',[])] init_param = th.function(inputs=[x_temp], outputs=None, updates=init_updates) # costs labels = T.ivector() x_lab = T.tensor4() x_unl = T.tensor4() output_before_softmax_lab = ll.get_output(disc_layers[-1], x_lab, deterministic=False)
target_var = T.imatrix('targets'); input_layer_index = map(lambda pair : pair[0], ae.layers).index('input'); first_layer = ae.get_all_layers()[input_layer_index + 1]; input_layer = layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var); first_layer.input_layer = input_layer; encode_layer_index = map(lambda pair : pair[0], ae.layers).index('encode_layer'); encode_layer = ae.get_all_layers()[encode_layer_index]; fc_layer = layers.DenseLayer(incoming = encode_layer, num_units = 30, nonlinearity = rectify); network = layers.DenseLayer(incoming = fc_layer, num_units = classn, nonlinearity = sigmoid); prediction = layers.get_output(network); loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean(); params = layers.get_all_params(network, trainable=True); updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.0005, momentum=0.975); test_output = lasagne.layers.get_output(network, deterministic=True); test_loss = lasagne.objectives.binary_crossentropy(test_output, target_var).mean(); test_acc = T.mean(T.eq(T.gt(test_output, 0.5), target_var), dtype=theano.config.floatX); test_pred = T.gt(test_output, 0.5); train_fn = theano.function([input_var, target_var], loss, updates=updates); val_fn = theano.function([input_var, target_var], [test_loss, test_acc, test_pred]); print("Starting training..."); print("TrLoss\t\tVaLoss\t\tVaAcc\t\tEpochs\t\tTime"); sys.stdout.flush(); num_epochs = 300;
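# The classifier above is built on top of a pre-trained autoencoder's encode
# layer; if the pre-trained part should stay fixed during training, Lasagne's
# tag mechanism can be used to drop the 'trainable' tag from those parameters.
# A hedged sketch (encode_layer / network refer to the variables above):
from lasagne.layers import get_all_layers

def freeze_layers(layer):
    """Remove the 'trainable' tag from every parameter at or below `layer`,
    so get_all_params(..., trainable=True) no longer returns them."""
    for l in get_all_layers(layer):
        for param, tags in l.params.items():
            tags.discard('trainable')

# hypothetical usage with the snippet above:
# freeze_layers(encode_layer)
# params = layers.get_all_params(network, trainable=True)  # only the new layers remain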
def train(self): self.G_weights_layer = nn.softmax_weights(self.args.ng, LL.InputLayer(shape=(), input_var=self.dummy_input)) self.D_weights_layer = nn.softmax_weights(self.args.ng, LL.InputLayer(shape=(), input_var=self.dummy_input)) self.G_weights = LL.get_output(self.G_weights_layer, None, deterministic=True) self.D_weights = LL.get_output(self.D_weights_layer, None, deterministic=True) self.Disc_weights_entropy = T.sum((-1./self.args.nd) * T.log(self.D_weights + 0.000001), [0,1]) self.Gen_weights_entropy = T.sum((-1./self.args.ng) * T.log(self.G_weights + 0.000001), [0,1]) for i in range(self.args.ng): gen_layers_i, gen_x_i = self.get_generator(self.meanx, self.z, self.y_1hot) self.G_layers.append(gen_layers_i) self.Gen_x_list.append(gen_x_i) self.Gen_x = T.concatenate(self.Gen_x_list, axis=0) for i in range(self.args.nd): disc_layers_i, disc_layer_adv_i, disc_layer_z_recon_i = self.get_discriminator() self.D_layers.append(disc_layers_i) self.D_layer_adv.append(disc_layer_adv_i) self.D_layer_z_recon.append(disc_layer_z_recon_i) #T.set_subtensor(self.Gen_x[i*self.args.batch_size:(i+1)*self.args.batch_size], gen_x_i) #self.samplers.append(self.sampler(self.z[i], self.y)) ''' forward pass ''' loss_gen0_cond_list = [] loss_disc0_class_list = [] loss_disc0_adv_list = [] loss_gen0_ent_list = [] loss_gen0_adv_list = [] #loss_disc_list for i in range(self.args.ng): self.y_recon_list.append(LL.get_output(self.enc_layer_fc4, self.Gen_x_list[i], deterministic=True)) # reconstructed pool3 activations for i in range(self.args.ng): #loss_gen0_cond = T.mean((recon_fc3_list[i] - self.real_fc3)**2) # feature loss, euclidean distance in feature space loss_gen0_cond = T.mean(T.nnet.categorical_crossentropy(self.y_recon_list[i], self.y)) loss_disc0_class = 0 loss_disc0_adv = 0 loss_gen0_ent = 0 loss_gen0_adv = 0 for j in range(self.args.nd): output_before_softmax_real0 = LL.get_output(self.D_layer_adv[j], self.x, deterministic=False) output_before_softmax_gen0, recon_z0 = LL.get_output([self.D_layer_adv[j], self.D_layer_z_recon[j]], self.Gen_x_list[i], deterministic=False) # discriminator's predicted probability that gen_x is real ''' loss for discriminator and Q ''' l_lab0 = output_before_softmax_real0[T.arange(self.args.batch_size),self.y] l_unl0 = nn.log_sum_exp(output_before_softmax_real0) l_gen0 = nn.log_sum_exp(output_before_softmax_gen0) loss_disc0_class += T.dot(self.D_weights[0,j], -T.mean(l_lab0) + T.mean(T.mean(nn.log_sum_exp(output_before_softmax_real0)))) # loss for not correctly classifying the category of real images loss_real0 = -T.mean(l_unl0) + T.mean(T.nnet.softplus(l_unl0)) # loss for classifying real as fake loss_fake0 = T.mean(T.nnet.softplus(l_gen0)) # loss for classifying fake as real loss_disc0_adv += T.dot(self.D_weights[0,j], 0.5*loss_real0 + 0.5*loss_fake0) loss_gen0_ent += T.dot(self.D_weights[0,j], T.mean((recon_z0 - self.z)**2)) #loss_gen0_ent = T.mean((recon_z0 - self.z)**2) ''' loss for generator ''' loss_gen0_adv += T.dot(self.D_weights[0,j], -T.mean(T.nnet.softplus(l_gen0))) loss_gen0_cond_list.append(T.dot(self.G_weights[0,i], loss_gen0_cond)) loss_disc0_class_list.append(T.dot(self.G_weights[0,i], loss_disc0_class)) loss_disc0_adv_list.append(T.dot(self.G_weights[0,i], loss_disc0_adv)) loss_gen0_ent_list.append(T.dot(self.G_weights[0,i], loss_gen0_ent)) loss_gen0_adv_list.append(T.dot(self.G_weights[0,i], loss_gen0_adv)) self.loss_gen0_cond = sum(loss_gen0_cond_list) self.loss_disc0_class = sum(loss_disc0_class_list) self.loss_disc0_adv = sum(loss_disc0_adv_list) 
self.loss_gen0_ent = sum(loss_gen0_ent_list) self.loss_gen0_adv = sum(loss_gen0_adv_list) self.loss_disc = self.args.labloss_weight * self.loss_disc0_class + self.args.advloss_weight * self.loss_disc0_adv + self.args.entloss_weight * self.loss_gen0_ent + self.args.mix_entloss_weight * self.Disc_weights_entropy self.loss_gen = self.args.advloss_weight * self.loss_gen0_adv + self.args.condloss_weight * self.loss_gen0_cond + self.args.entloss_weight * self.loss_gen0_ent + self.args.mix_entloss_weight * self.Gen_weights_entropy if self.args.load_epoch is not None: print("loading model") self.load_model(self.args.load_epoch) print("success") ''' collect parameter updates for discriminators ''' Disc_params = LL.get_all_params(self.D_weights_layer, trainable=True) Disc_bn_updates = [] Disc_bn_params = [] self.threshold = self.mincost + self.args.labloss_weight * self.loss_disc0_class + self.args.entloss_weight * self.loss_gen0_ent + self.args.mix_entloss_weight * self.Disc_weights_entropy #threshold = mincost + self.args.labloss_weight * self.loss_disc0_class + self.args.entloss_weight * self.loss_gen0_ent for i in range(self.args.nd): Disc_params.extend(LL.get_all_params(self.D_layers[i], trainable=True)) Disc_bn_updates.extend([u for l in LL.get_all_layers(self.D_layers[i][-1]) for u in getattr(l,'bn_updates',[])]) for l in LL.get_all_layers(self.D_layers[i][-1]): if hasattr(l, 'avg_batch_mean'): Disc_bn_params.append(l.avg_batch_mean) Disc_bn_params.append(l.avg_batch_var) Disc_param_updates = nn.adam_conditional_updates(Disc_params, self.loss_disc, mincost=self.threshold, lr=self.disc_lr, mom1=0.5) # if loss_disc_x < mincost, don't update the discriminator Disc_param_avg = [th.shared(np.cast[th.config.floatX](0.*p.get_value())) for p in Disc_params] # initialized with 0 Disc_avg_updates = [(a,a+0.0001*(p-a)) for p,a in zip(Disc_params, Disc_param_avg)] # online update of historical parameters """ #Disc_param_updates = nn.adam_updates(Disc_params, self.loss_disc, lr=self.lr, mom1=0.5) # collect parameters #Disc_params = LL.get_all_params(self.D_layers[-1], trainable=True) Disc_params = LL.get_all_params(self.D_layers, trainable=True) #Disc_param_updates = nn.adam_updates(Disc_params, loss_disc_x, lr=lr, mom1=0.5) # loss for discriminator = supervised_loss + unsupervised loss Disc_param_updates = nn.adam_conditional_updates(Disc_params, self.loss_disc, mincost=threshold, lr=self.disc_lr, mom1=0.5) # if loss_disc_x < mincost, don't update the discriminator Disc_param_avg = [th.shared(np.cast[th.config.floatX](0.*p.get_value())) for p in Disc_params] # initialized with 0 Disc_avg_updates = [(a,a+0.0001*(p-a)) for p,a in zip(Disc_params,Disc_param_avg)] # online update of historical parameters #Disc_avg_givens = [(p,a) for p,a in zip(Disc_params,Disc_param_avg)] Disc_bn_updates = [u for l in LL.get_all_layers(self.D_layers[-1]) for u in getattr(l,'bn_updates',[])] Disc_bn_params = [] for l in LL.get_all_layers(self.D_layers[-1]): if hasattr(l, 'avg_batch_mean'): Disc_bn_params.append(l.avg_batch_mean) Disc_bn_params.append(l.avg_batch_var) """ ''' collect parameter updates for generators ''' Gen_params = LL.get_all_params(self.G_weights_layer, trainable=True) Gen_params_updates = [] Gen_bn_updates = [] Gen_bn_params = [] for i in range(self.args.ng): Gen_params.extend(LL.get_all_params(self.G_layers[i][-1], trainable=True)) Gen_bn_updates.extend([u for l in LL.get_all_layers(self.G_layers[i][-1]) for u in getattr(l,'bn_updates',[])]) for l in LL.get_all_layers(self.G_layers[i][-1]): if hasattr(l, 
'avg_batch_mean'): Gen_bn_params.append(l.avg_batch_mean) Gen_bn_params.append(l.avg_batch_var) Gen_param_updates = nn.adam_updates(Gen_params, self.loss_gen, lr=self.gen_lr, mom1=0.5) """ #print(Gen_params) #train_batch_gen = th.function(inputs=[self.x, self.meanx, self.z, self.y_1hot, self.lr], outputs=[self.loss_gen], on_unused_input='warn') #theano.printing.debugprint(train_batch_gen) Gen_param_updates = nn.adam_updates(Gen_params, self.loss_gen, lr=self.lr, mom1=0.5) Gen_params = LL.get_all_params(self.G_layers[-1], trainable=True) Gen_param_updates = nn.adam_updates(Gen_params, self.loss_gen, lr=self.gen_lr, mom1=0.5) Gen_bn_updates = [u for l in LL.get_all_layers(self.G_layers[-1]) for u in getattr(l,'bn_updates',[])] Gen_bn_params = [] for l in LL.get_all_layers(self.G_layers[-1]): if hasattr(l, 'avg_batch_mean'): Gen_bn_params.append(l.avg_batch_mean) Gen_bn_params.append(l.avg_batch_var) """ ''' define training and testing functions ''' #train_batch_disc = th.function(inputs=[x, meanx, y, lr], outputs=[loss_disc0_class, loss_disc0_adv, gen_x, x], # updates=disc0_param_updates+disc0_bn_updates) #th.printing.debugprint(self.loss_disc) train_batch_disc = th.function(inputs=[self.dummy_input, self.meanx, self.x, self.y, self.y_1hot, self.mincost, self.disc_lr], outputs=[self.loss_disc0_class, self.loss_disc0_adv], updates=Disc_param_updates+Disc_bn_updates+Disc_avg_updates) #th.printing.pydotprint(train_batch_disc, outfile="logreg_pydotprint_prediction.png", var_with_name_simple=True) #train_batch_gen = th.function(inputs=[x, meanx, y_1hot, lr], outputs=[loss_gen0_adv, loss_gen0_cond, loss_gen0_ent], # updates=gen0_param_updates+gen0_bn_updates) #train_batch_gen = th.function(inputs=gen_inputs, outputs=gen_outputs, updates=gen0_param_updates+gen0_bn_updates) #train_batch_gen = th.function(inputs=[self.dummy_input, self.x, self.meanx, self.z, self.y_1hot, self.lr], outputs=[self.loss_gen0_adv, self.loss_gen0_cond, self.loss_gen0_ent], updates=Gen_param_updates+Gen_bn_updates) train_batch_gen = th.function(inputs=[self.dummy_input, self.meanx, self.y, self.y_1hot, self.gen_lr], outputs=[self.loss_gen0_adv, self.loss_gen0_cond, self.loss_gen0_ent], updates=Gen_param_updates+Gen_bn_updates) # samplefun = th.function(inputs=[meanx, y_1hot], outputs=gen_x_joint) # sample function: generating images by stacking all generators reconfun = th.function(inputs=[self.meanx, self.y_1hot], outputs=self.Gen_x) # reconstruction function: use the bottom generator # to generate images conditioned on real fc3 features mix_weights = th.function(inputs=[self.dummy_input], outputs=[self.D_weights, self.Disc_weights_entropy, self.G_weights, self.Gen_weights_entropy]) ''' load data ''' print("Loading data...") meanimg, data = load_cifar_data(self.args.data_dir) trainx = data['X_train'] trainy = data['Y_train'] nr_batches_train = int(trainx.shape[0]/self.args.batch_size) # testx = data['X_test'] # testy = data['Y_test'] # nr_batches_test = int(testx.shape[0]/self.args.batch_size) ''' perform training ''' #logs = {'loss_gen0_adv': [], 'loss_gen0_cond': [], 'loss_gen0_ent': [], 'loss_disc0_class': [], 'var_gen0': [], 'var_real0': []} # training logs logs = {'loss_gen0_adv': [], 'loss_gen0_cond': [], 'loss_gen0_ent': [], 'loss_disc0_class': []} # training logs for epoch in range(self.args.load_epoch+1, self.args.num_epoch): begin = time.time() ''' shuffling ''' inds = rng.permutation(trainx.shape[0]) trainx = trainx[inds] trainy = trainy[inds] for t in range(nr_batches_train): #for t in range(1): ''' construct 
minibatch ''' #batchz = np.random.uniform(size=(self.args.batch_size, self.args.z0dim)).astype(np.float32) batchx = trainx[t*self.args.batch_size:(t+1)*self.args.batch_size] batchy = trainy[t*self.args.batch_size:(t+1)*self.args.batch_size] batchy_1hot = np.zeros((self.args.batch_size, 10), dtype=np.float32) batchy_1hot[np.arange(self.args.batch_size), batchy] = 1 # convert to one-hot label # randomy = np.random.randint(10, size = (self.args.batch_size,)) # randomy_1hot = np.zeros((self.args.batch_size, 10),dtype=np.float32) # randomy_1hot[np.arange(self.args.batch_size), randomy] = 1 ''' train discriminators ''' l_disc0_class, l_disc0_adv = train_batch_disc(0.0, meanimg, batchx, batchy, batchy_1hot, self.args.mincost, self.args.disc_lr) ''' train generators ''' #prob_gen0 = np.exp() if l_disc0_adv > 0.65: n_iter = 1 elif l_disc0_adv > 0.5: n_iter = 3 elif l_disc0_adv > 0.3: n_iter = 5 else: n_iter = 7 for i in range(n_iter): #l_gen0_adv, l_gen0_cond, l_gen0_ent = train_batch_gen(0.0, batchx, meanimg, batchz, batchy_1hot, self.args.gen_lr) l_gen0_adv, l_gen0_cond, l_gen0_ent = train_batch_gen(0.0, meanimg, batchy, batchy_1hot, self.args.gen_lr) d_mix_weights, d_entloss, g_mix_weights, g_entloss = mix_weights(0.0) ''' store log information ''' # logs['loss_gen1_adv'].append(l_gen1_adv) # logs['loss_gen1_cond'].append(l_gen1_cond) # logs['loss_gen1_ent'].append(l_gen1_ent) # logs['loss_disc1_class'].append(l_disc1_class) # logs['var_gen1'].append(np.var(np.array(g1))) # logs['var_real1'].append(np.var(np.array(r1))) logs['loss_gen0_adv'].append(l_gen0_adv) logs['loss_gen0_cond'].append(l_gen0_cond) logs['loss_gen0_ent'].append(l_gen0_ent) logs['loss_disc0_class'].append(l_disc0_class) #logs['var_gen0'].append(np.var(np.array(g0))) #logs['var_real0'].append(np.var(np.array(r0))) print("---Epoch %d, time = %ds" % (epoch, time.time()-begin)) print("D_weights=[%.6f, %.6f, %.6f, %.6f, %.6f] loss = %0.6f" % (d_mix_weights[0,0], d_mix_weights[0,1], d_mix_weights[0,2], d_mix_weights[0,3], d_mix_weights[0,4], d_entloss)) print("G_weights=[%.6f, %.6f, %.6f, %.6f, %.6f] loss = %0.6f" % (g_mix_weights[0,0], g_mix_weights[0,1], g_mix_weights[0,2], g_mix_weights[0,3], g_mix_weights[0,4], g_entloss)) #print("G_weights=[%.6f]" % (g_mix_weights[0,0])) print("loss_disc0_adv = %.4f, loss_gen0_adv = %.4f, loss_gen0_cond = %.4f, loss_gen0_ent = %.4f, loss_disc0_class = %.4f" % (l_disc0_adv, l_gen0_adv, l_gen0_cond, l_gen0_ent, l_disc0_class)) # ''' sample images by stacking all generators''' # imgs = samplefun(meanimg, refy_1hot) # imgs = np.transpose(np.reshape(imgs[:100,], (100, 3, 32, 32)), (0, 2, 3, 1)) # imgs = [imgs[i] for i in range(100)] # rows = [] # for i in range(10): # rows.append(np.concatenate(imgs[i::10], 1)) # imgs = np.concatenate(rows, 0) # scipy.misc.imsave(self.args.out_dir + "/mnist_sample_epoch{}.png".format(epoch), imgs) """ ''' original images in the training set''' orix = np.transpose(np.reshape(batchx[:100,], (100, 3, 32, 32)), (0, 2, 3, 1)) orix = [orix[i] for i in range(100)] rows = [] for i in range(10): rows.append(np.concatenate(orix[i::10], 1)) orix = np.concatenate(rows, 0) scipy.misc.imsave(self.args.out_dir + "/mnist_ori_epoch{}.png".format(epoch), orix) """ if epoch%self.args.save_interval==0: # np.savez(self.args.out_dir + "/disc1_params_epoch{}.npz".format(epoch), *LL.get_all_param_values(disc1_layers[-1])) # np.savez(self.args.out_dir + '/gen1_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(gen1_layers[-1])) #np.savez(self.args.out_dir + 
"/disc0_params_epoch{}.npz".format(epoch), *LL.get_all_param_values(disc0_layers)) #np.savez(self.args.out_dir + '/gen0_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(gen0_layers)) np.savez(self.args.out_dir + '/Dweights_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(self.D_weights_layer)) np.savez(self.args.out_dir + '/Gweights_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(self.G_weights_layer)) for i in range(self.args.ng): np.savez(self.args.out_dir + ("/disc%d_params_epoch%d.npz" % (i,epoch)), *LL.get_all_param_values(self.D_layers[i])) np.savez(self.args.out_dir + ("/gen%d_params_epoch%d.npz" % (i,epoch)), *LL.get_all_param_values(self.G_layers[i])) np.save(self.args.out_dir + '/logs.npy',logs) ''' reconstruct images ''' reconx = reconfun(meanimg, batchy_1hot) + meanimg width = np.round(np.sqrt(self.args.batch_size)).astype(int) for i in range(self.args.ng): reconx_i = np.transpose(np.reshape(reconx[i*self.args.batch_size:(i+1)*self.args.batch_size], (self.args.batch_size, 3, 32, 32)), (0, 2, 3, 1)) reconx_i = [reconx_i[j] for j in range(self.args.batch_size)] rows = [] for j in range(width): rows.append(np.concatenate(reconx_i[j::width], 1)) reconx_i = np.concatenate(rows, 0) scipy.misc.imsave(self.args.out_dir + ("/cifar_recon_%d_epoch%d.png"%(i,epoch)), reconx_i)
def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size, dropout, l2, mode, batch_norm, **kwargs): print("==> not used params in DMN class:", kwargs.keys()) self.train_list_raw = train_list_raw self.test_list_raw = test_list_raw self.png_folder = png_folder self.batch_size = batch_size self.dropout = dropout self.l2 = l2 self.mode = mode self.batch_norm = batch_norm self.input_var = T.tensor4('input_var') self.answer_var = T.ivector('answer_var') print("==> building network") example = np.random.uniform(size=(self.batch_size, 1, 128, 768), low=0.0, high=1.0).astype(np.float32) ######### answer = np.random.randint(low=0, high=176, size=(self.batch_size, )) ######### network = layers.InputLayer(shape=(None, 1, 128, 768), input_var=self.input_var) print(layers.get_output(network).eval({self.input_var: example}).shape) # CONV-RELU-POOL 1 network = layers.Conv2DLayer(incoming=network, num_filters=16, filter_size=(7, 7), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({self.input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({self.input_var: example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 2 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(5, 5), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({self.input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({self.input_var: example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 3 network = layers.Conv2DLayer(incoming=network, num_filters=32, filter_size=(3, 3), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({self.input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({self.input_var: example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 4 network = layers.Conv2DLayer(incoming=network, num_filters=64, filter_size=(3, 3), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({self.input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({self.input_var: example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # CONV-RELU-POOL 5 network = layers.Conv2DLayer(incoming=network, num_filters=64, filter_size=(3, 3), stride=1, nonlinearity=rectify) print(layers.get_output(network).eval({self.input_var: example}).shape) network = layers.MaxPool2DLayer(incoming=network, pool_size=(3, 3), stride=2, pad=2) print(layers.get_output(network).eval({self.input_var: example}).shape) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) # DENSE 1 #network = layers.DenseLayer(incoming=network, num_units=256, nonlinearity=rectify) network = layers.DenseLayer(incoming=network, num_units=6144, nonlinearity=rectify) if (self.batch_norm): network = layers.BatchNormLayer(incoming=network) if (self.dropout > 0): network = layers.dropout(network, self.dropout) print(layers.get_output(network).eval({self.input_var: example}).shape) # Last layer: classification network = layers.DenseLayer(incoming=network, num_units=176, nonlinearity=softmax) 
print(layers.get_output(network).eval({self.input_var: example}).shape) self.params = layers.get_all_params(network, trainable=True) self.prediction = layers.get_output(network) self.test_prediction = layers.get_output(network, deterministic=True) print("==> param shapes", [x.eval().shape for x in self.params]) def get_loss(prediction): loss_ce = lasagne.objectives.categorical_crossentropy( prediction, self.answer_var).mean() if (self.l2 > 0): loss_l2 = self.l2 * lasagne.regularization.regularize_network_params( network, lasagne.regularization.l2) else: loss_l2 = 0 return loss_ce + loss_l2 self.loss = get_loss(self.prediction) self.test_loss = get_loss(self.test_prediction) #updates = lasagne.updates.adadelta(self.loss, self.params) updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.003) if self.mode == 'train': print("==> compiling train_fn") self.train_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss], updates=updates) print("==> compiling test_fn") # deterministic version #self.test_fn = theano.function(inputs=[self.input_var, self.answer_var], # outputs=[self.test_prediction, self.test_loss]) # non deterministic version, as train_fn self.test_fn = theano.function( inputs=[self.input_var, self.answer_var], outputs=[self.prediction, self.loss])
#compiling theano functions: evaluate_generator = theano.function([z_var], get_output(generator), allow_input_downcast=True) sample_generator = theano.function( [batchsize_var], samples_from_grenerator, allow_input_downcast=True, ) sample_prior = theano.function([prior_variance_var, batchsize_var], samples_from_prior, allow_input_downcast=True) params_D = get_all_params(discriminator['norm'], trainable=True) updates_D = adam(loss_D, params_D, learning_rate=learningrate_var) train_D = theano.function( [learningrate_var, batchsize_var, prior_variance_var], loss_D, updates=updates_D, allow_input_downcast=True) params_G = get_all_params(generator, trainable=True) updates_G = adam(loss_G, params_G, learning_rate=learningrate_var) train_G = theano.function([x_var, y_var, learningrate_var, batchsize_var], loss_G,
# target theano variable indicating the index a vertex should be mapped to wrt the latent space target = T.ivector('idxs') # work with logit predictions, which are better behaved numerically cla = utils_lasagne.categorical_crossentropy_logdomain(output, target, nclasses).mean() acc = LO.categorical_accuracy(pred, target).mean() # a bit of regularization is commonly used regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2) cost = cla + l2_weight * regL2 ''' Define the update rule, how to train ''' params = LL.get_all_params(ffn, trainable=True) grads = T.grad(cost, params) # computes the L2 norm of the gradient to better inspect training grads_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grads]), 2) # Adam turned out to be a very good choice for correspondence updates = L.updates.adam(grads, params, learning_rate=0.001) ''' Compile ''' funcs = dict() funcs['train'] = theano.function( [inp.input_var, patch_op.input_var, target], [cost, cla, l2_weight * regL2, grads_norm, acc], updates=updates, on_unused_input='warn') funcs['acc_loss'] = theano.function(
def main(): parser = argparse.ArgumentParser( description='Tuning with bi-directional LSTM-CNN-CRF') parser.add_argument('--fine_tune', action='store_true', help='Fine tune the word embeddings') parser.add_argument( '--embedding', choices=['word2vec', 'glove', 'senna', 'random', 'polyglot'], help='Embedding for words', required=True) parser.add_argument('--embedding_dict', default=None, help='path for embedding dict') parser.add_argument('--batch_size', type=int, default=10, help='Number of sentences in each batch') parser.add_argument('--num_units', type=int, default=100, help='Number of hidden units in LSTM') parser.add_argument('--num_filters', type=int, default=20, help='Number of filters in CNN') parser.add_argument('--learning_rate', type=float, default=0.1, help='Learning rate') parser.add_argument('--decay_rate', type=float, default=0.1, help='Decay rate of learning rate') parser.add_argument('--grad_clipping', type=float, default=0, help='Gradient clipping') parser.add_argument('--gamma', type=float, default=1e-6, help='weight for regularization') parser.add_argument('--peepholes', action='store_true', help='Peepholes for LSTM') parser.add_argument('--oov', choices=['random', 'embedding'], help='Embedding for oov word', required=True) parser.add_argument( '--update', choices=['sgd', 'momentum', 'nesterov', 'adadelta', 'adam'], help='update algorithm', default='sgd') parser.add_argument('--regular', choices=['none', 'l2'], help='regularization for training', required=True) parser.add_argument('--dropout', action='store_true', help='Apply dropout layers') parser.add_argument('--patience', type=int, default=5, help='Patience for early stopping') parser.add_argument('--output_prediction', action='store_true', help='Output predictions to temp files') parser.add_argument('--train') parser.add_argument('--dev') parser.add_argument('--test') parser.add_argument('--exp_dir') parser.add_argument('--adv', type=float, default=0) parser.add_argument('--seed', type=int, default=1234) parser.add_argument('--reload', default=None, help='path for reloading') args = parser.parse_args() np.random.seed(args.seed) lasagne.random.set_rng(np.random) def construct_input_layer(): if fine_tune: layer_input = lasagne.layers.InputLayer(shape=(None, max_length), input_var=input_var, name='input') layer_embedding = Normalized_EmbeddingLayer( layer_input, input_size=alphabet_size, output_size=embedd_dim, vocab_freqs=word_freqs, W=embedd_table, name='embedding') raw_layer = layer_embedding else: layer_input = lasagne.layers.InputLayer(shape=(None, max_length, embedd_dim), input_var=input_var, name='input') raw_layer = layer_input return raw_layer # [batch, max_sent_length, embedd_dim] def construct_char_input_layer(): layer_char_input = lasagne.layers.InputLayer(shape=(None, max_sent_length, max_char_length), input_var=char_input_var, name='char-input') layer_char_input = lasagne.layers.reshape( layer_char_input, (-1, [2])) # [batch * max_sent_length, max_char_length] layer_char_embedding = Normalized_EmbeddingLayer( layer_char_input, input_size=char_alphabet_size, output_size=char_embedd_dim, vocab_freqs=char_freqs, W=char_embedd_table, name='char_embedding' ) # [n_examples, max_char_length, char_embedd_dim] #layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # [n_examples, char_embedd_dim, max_char_length] return layer_char_embedding logger = utils.get_logger("BiLSTM-BiLSTM-CRF") fine_tune = args.fine_tune oov = args.oov regular = args.regular embedding = args.embedding 
embedding_path = args.embedding_dict train_path = args.train dev_path = args.dev test_path = args.test update_algo = args.update grad_clipping = args.grad_clipping peepholes = args.peepholes gamma = args.gamma output_predict = args.output_prediction dropout = args.dropout exp_dir = args.exp_dir if not os.path.isdir(exp_dir): os.mkdir(exp_dir) exp_name = exp_dir.split('/')[-1] exp_mode = exp_name.split('_')[0] # 'pos' or 'ner', etc. save_dir = os.path.join(exp_dir, 'save') eval_dir = os.path.join(exp_dir, 'eval') if not os.path.isdir(save_dir): os.mkdir(save_dir) if not os.path.isdir(eval_dir): os.mkdir(eval_dir) eval_script = "./conlleval" if exp_mode == 'pos': (word_col_in_data, label_col_in_data) = (0, 1) elif exp_mode == 'ner': (word_col_in_data, label_col_in_data) = (0, 3) elif exp_mode == 'chunk': (word_col_in_data, label_col_in_data) = (0, 2) else: (word_col_in_data, label_col_in_data) = (1, 3) # assume CoNLL-U style # load data X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \ (embedd_table, word_freqs), label_alphabet, \ C_train, C_dev, C_test, (char_embedd_table, char_freqs) = data_processor.load_dataset_sequence_labeling(train_path, dev_path, test_path, word_col_in_data, label_col_in_data, label_name=exp_mode, oov=oov, fine_tune=True, embedding=embedding, embedding_path=embedding_path, use_character=True) num_labels = label_alphabet.size() - 1 logger.info("constructing network...") # create variables target_var = T.imatrix(name='targets') mask_var = T.matrix(name='masks', dtype=theano.config.floatX) num_tokens = mask_var.sum(dtype=theano.config.floatX) if fine_tune: input_var = T.imatrix(name='inputs') num_data, max_length = X_train.shape alphabet_size, embedd_dim = embedd_table.shape else: input_var = T.tensor3(name='inputs', dtype=theano.config.floatX) num_data, max_length, embedd_dim = X_train.shape char_input_var = T.itensor3(name='char-inputs') num_data_char, max_sent_length, max_char_length = C_train.shape char_alphabet_size, char_embedd_dim = char_embedd_table.shape assert (max_length == max_sent_length) assert (num_data == num_data_char) # prepare initial input layer and embeddings char_layer = construct_char_input_layer() word_layer = construct_input_layer() char_emb = Lyrs.get_output(char_layer) word_emb = Lyrs.get_output(word_layer) # construct input and mask layers char_in_layer = Lyrs.InputLayer(shape=(None, max_char_length, char_embedd_dim)) word_in_layer = Lyrs.InputLayer(shape=(None, max_length, embedd_dim)) layer_mask = lasagne.layers.InputLayer(shape=(None, max_length), input_var=mask_var, name='mask') # construct bilstm_bilstm_crf num_units = args.num_units num_filters = args.num_filters logger.info("Network structure: hidden=%d, filter=%d" % (num_units, num_filters)) bilstm_bilstm_crf = build_BiLSTM_BiLSTM_CRF(char_in_layer, word_in_layer, num_units, num_labels, mask=layer_mask, grad_clipping=grad_clipping, peepholes=peepholes, num_filters=num_filters, dropout=dropout) # compute loss def loss_from_embedding(char_emb, word_emb, deterministic=False, return_all=True): # get output of bi-lstm-cnn-crf: shape [batch, length, num_labels, num_labels] energies = Lyrs.get_output(bilstm_bilstm_crf, inputs={ char_in_layer: char_emb, word_in_layer: word_emb }, deterministic=deterministic) loss = crf_loss(energies, target_var, mask_var).mean() if return_all: predict, corr = crf_accuracy(energies, target_var) corr = (corr * mask_var).sum(dtype=theano.config.floatX) return loss, predict, corr else: return loss loss_eval, prediction_eval, 
corr_eval = loss_from_embedding( char_emb, word_emb, deterministic=True) loss_train_ori, _, corr_train = loss_from_embedding(char_emb, word_emb) if args.adv: logger.info('Preparing adversarial training...') loss_train_adv = adversarial_loss(char_emb, word_emb, loss_from_embedding, loss_train_ori, perturb_scale=args.adv) loss_train = (loss_train_ori + loss_train_adv) / 2.0 else: loss_train_adv = T.as_tensor_variable( np.asarray(0.0, dtype=theano.config.floatX)) loss_train = loss_train_ori + loss_train_adv # l2 regularization? if regular == 'l2': l2_penalty = lasagne.regularization.regularize_network_params( bilstm_bilstm_crf, lasagne.regularization.l2) loss_train = loss_train + gamma * l2_penalty # Create update expressions for training. # hyper parameters to tune: learning rate, momentum, regularization. batch_size = args.batch_size learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate decay_rate = args.decay_rate momentum = 0.9 params = Lyrs.get_all_params( bilstm_bilstm_crf, trainable=True) + Lyrs.get_all_params( char_layer, trainable=True) + Lyrs.get_all_params(word_layer, trainable=True) updates = utils.create_updates(loss_train, params, update_algo, learning_rate, momentum=momentum) # Compile a function performing a training step on a mini-batch train_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_train_ori, loss_train_adv, corr_train, num_tokens], updates=updates) # Compile a second function evaluating the loss and accuracy of network eval_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_eval, corr_eval, num_tokens, prediction_eval]) # reload saved model if args.reload is not None: logger.info('Reloading saved parameters from %s ...\n' % args.reload) with np.load(args.reload) as f: param_values = [f['arr_%d' % j] for j in range(len(f.files))] Lyrs.set_all_param_values(word_layer, param_values[0:1]) Lyrs.set_all_param_values(char_layer, param_values[1:2]) Lyrs.set_all_param_values(bilstm_bilstm_crf, param_values[2:]) # Finally, launch the training loop. logger.info( "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s) ..." 
\ % ( update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size, grad_clipping, peepholes)) num_batches = num_data / batch_size num_epochs = 1000 best_acc = np.array([0.0, 0.0, 0.0]) best_epoch_acc = np.array([0, 0, 0]) best_acc_test_err = np.array([0.0, 0.0, 0.0]) best_acc_test_corr = np.array([0.0, 0.0, 0.0]) stop_count = 0 lr = learning_rate patience = args.patience for epoch in range(1, num_epochs + 1): print print 'Epoch %d (learning rate=%.7f, decay rate=%.4f): ' % (epoch, lr, decay_rate) train_err_ori = 0.0 train_err_adv = 0.0 train_corr = 0.0 train_total = 0 train_inst = 0 start_time = time.time() num_back = 0 train_batches = 0 epoch_save_dir = os.path.join(save_dir, 'epoch%d' % epoch) os.mkdir(epoch_save_dir) for batch in utils.iterate_minibatches(X_train, Y_train, masks=mask_train, char_inputs=C_train, batch_size=batch_size, shuffle=True): inputs, targets, masks, char_inputs = batch err_ori, err_adv, corr, num = train_fn(inputs, targets, masks, char_inputs) train_err_ori += err_ori * inputs.shape[0] train_err_adv += err_adv * inputs.shape[0] train_corr += corr train_total += num train_inst += inputs.shape[0] train_batches += 1 time_ave = (time.time() - start_time) / train_batches time_left = (num_batches - train_batches) * time_ave # update log if train_batches % (num_batches // 10) == 0: log_info = 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time left: %.2fs\n' % ( min(train_batches * batch_size, num_data), num_data, train_err_ori / train_inst, train_err_adv / train_inst, train_corr * 100 / train_total, time_left) sys.stdout.write(log_info) sys.stdout.flush() # save the parameter values #param_values = Lyrs.get_all_param_values(bilstm_bilstm_crf) #np.savez(epoch_save_dir + '/iter%d.npz' % train_batches, *param_values) # save the parameter values param_values = Lyrs.get_all_param_values( word_layer) + Lyrs.get_all_param_values( char_layer) + Lyrs.get_all_param_values(bilstm_bilstm_crf) np.savez(epoch_save_dir + '/final.npz', *param_values) # update training log after each epoch assert train_inst == num_data print 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time: %.2fs' % ( min(train_batches * batch_size, num_data), num_data, train_err_ori / train_inst, train_err_adv / train_inst, train_corr * 100 / train_total, time.time() - start_time) # evaluate performance on dev data dev_err = 0.0 dev_corr = 0.0 dev_total = 0 dev_inst = 0 for batch in utils.iterate_minibatches(X_dev, Y_dev, masks=mask_dev, char_inputs=C_dev, batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) dev_err += err * inputs.shape[0] dev_corr += corr dev_total += num dev_inst += inputs.shape[0] if output_predict: output_file = eval_dir + '/dev%d' % epoch utils.output_predictions(predictions, targets, masks, output_file, label_alphabet, is_flattened=False) print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( dev_err / dev_inst, dev_corr, dev_total, dev_corr * 100 / dev_total) #update_loss = False update_acc = False if best_acc.min() > dev_corr / dev_total: stop_count += 1 else: stop_count = 0 if best_acc.min() < dev_corr / dev_total: update_acc = True idx_to_update = best_acc.argmin() best_acc[idx_to_update] = dev_corr / dev_total best_epoch_acc[idx_to_update] = epoch # evaluate on test data test_err = 0.0 test_corr = 0.0 test_total = 0 test_inst = 0 for batch in utils.iterate_minibatches(X_test, Y_test, masks=mask_test, char_inputs=C_test, 
batch_size=batch_size): inputs, targets, masks, char_inputs = batch err, corr, num, predictions = eval_fn(inputs, targets, masks, char_inputs) test_err += err * inputs.shape[0] test_corr += corr test_total += num test_inst += inputs.shape[0] if output_predict: output_file = eval_dir + '/test%d' % epoch utils.output_predictions(predictions, targets, masks, output_file, label_alphabet, is_flattened=False) # print out the test result if stop_count > 0: print '(cf.', print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( test_err / test_inst, test_corr, test_total, test_corr * 100 / test_total), if output_predict and exp_mode in ['ner', 'chunk']: stdout = subprocess.check_output([eval_script], stdin=open(output_file)) f1_score = stdout.split("\n")[1].split()[7] # this is a string print ", f1:", f1_score else: print sys.stdout.flush() if update_acc: best_acc_test_err[idx_to_update] = test_err best_acc_test_corr[idx_to_update] = test_corr # stop early if dev acc has not improved for `patience` epochs in a row if stop_count == patience: break # re-compile the training function with the new learning rate if update_algo not in ['adam', 'adadelta']: if decay_rate >= 0: lr = learning_rate / (1.0 + epoch * decay_rate) else: if stop_count > 0 and stop_count % 3 == 0: learning_rate /= 2.0 lr = learning_rate updates = utils.create_updates(loss_train, params, update_algo, lr, momentum=momentum) train_fn = theano.function( [input_var, target_var, mask_var, char_input_var], [loss_train_ori, loss_train_adv, corr_train, num_tokens], updates=updates) # print the best performance on test data. for i in range(len(best_epoch_acc)): logger.info("final best acc test performance (at epoch %d)" % best_epoch_acc[i]) print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % ( best_acc_test_err[i] / test_inst, best_acc_test_corr[i], test_total, best_acc_test_corr[i] * 100 / test_total)
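# --- Hedged sketch (assumption, not the project's actual utils.iterate_minibatches) ---
# One plausible implementation of the minibatch iterator used in the training and
# evaluation loops above: it yields aligned (inputs, targets, masks, char_inputs)
# slices, optionally shuffled once per epoch.
import numpy as np

def iterate_minibatches_sketch(inputs, targets, masks, char_inputs, batch_size, shuffle=False):
    n = inputs.shape[0]
    order = np.random.permutation(n) if shuffle else np.arange(n)
    for start in range(0, n, batch_size):
        idx = order[start:start + batch_size]
        yield inputs[idx], targets[idx], masks[idx], char_inputs[idx]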
def build_treatment_model(self, n_vars, **kwargs): input_vars = TT.matrix() instrument_vars = TT.matrix() targets = TT.vector() inputs = layers.InputLayer((None, n_vars), input_vars) inputs = layers.DropoutLayer(inputs, p=0.2) dense_layer = layers.DenseLayer(inputs, 2 * kwargs['dense_size'], nonlinearity=nonlinearities.rectify) dense_layer = layers.batch_norm(dense_layer) dense_layer = layers.DropoutLayer(dense_layer, p=0.2) for _ in xrange(kwargs['n_dense_layers'] - 1): dense_layer = layers.DenseLayer( dense_layer, kwargs['dense_size'], nonlinearity=nonlinearities.rectify) dense_layer = layers.batch_norm(dense_layer) self.treatment_output = layers.DenseLayer( dense_layer, 1, nonlinearity=nonlinearities.linear) init_params = layers.get_all_param_values(self.treatment_output) prediction = layers.get_output(self.treatment_output, deterministic=False) test_prediction = layers.get_output(self.treatment_output, deterministic=True) l2_cost = regularization.regularize_network_params( self.treatment_output, regularization.l2) loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost params = layers.get_all_params(self.treatment_output, trainable=True) param_updates = updates.adadelta(loss, params) self._train_fn = theano.function([ input_vars, targets, instrument_vars, ], loss, updates=param_updates) self._loss_fn = theano.function( [ input_vars, targets, instrument_vars, ], loss, ) self._output_fn = theano.function( [ input_vars, ], test_prediction, ) return init_params
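# --- Usage sketch (added illustration; the arrays and iteration count are assumptions) ---
# Driving the functions compiled in build_treatment_model: _train_fn applies one
# full-batch adadelta step on the GMM loss, _loss_fn evaluates without updating
# parameters, and _output_fn returns deterministic predictions.
def fit_treatment_model_sketch(model, inputs, targets, instruments, n_iters=100):
    for _ in range(n_iters):
        train_loss = model._train_fn(inputs, targets, instruments)
    holdout_loss = model._loss_fn(inputs, targets, instruments)
    predictions = model._output_fn(inputs)
    return predictions, train_loss, holdout_loss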
def params(self): return get_all_params(self.l_out, trainable=True)
def build_graph(self): print 'building models...' y_gold = T.lvector('y_gold') # index of the correct action from oracle z_gold = T.lvector('z_gold') # index of the correct label from oracle sidx = T.lvector('sidx') # sentence ids of the batch tidx = T.lmatrix('tidx') # token ids in each sentence of the batch valid = T.fmatrix('valid') # valid action mask self.step = theano.shared(np.array(0.).astype(theano.config.floatX), name='step') lr = self.args.learn_rate * self.args.decay**T.floor(self.step / 2000.) self.actor, self.labeler, self.taggers = self.get_actor(sidx, tidx, valid, avg=False) self.actor_avg, self.labeler_avg, self.taggers_avg = self.get_actor( sidx, tidx, valid, avg=True) # averaged model for prediction actor_prob, labeler_prob = L.get_output( [self.actor_avg, self.labeler_avg], deterministic=True) actor_rest = actor_prob * valid # mask the probabilities of invalid actions to 0 actor_rest_normalized = actor_rest / T.sum( actor_rest, axis=1, keepdims=True) preds_avg = [] if self.args.aux_tagger: for name in self.args.target_feats: prob_avg = L.get_output(self.taggers_avg[name], deterministic=True) # (100, 25) pred_avg = T.argmax(prob_avg, axis=1) # (100, ) preds_avg.append(pred_avg) self.actor_predict_avg = theano.function( [sidx, tidx, valid], [actor_rest_normalized, labeler_prob] + preds_avg, on_unused_input='ignore', allow_input_downcast=True) # training # only compile if in training mode (has training data) if self.args.train: # parser objectives y_prob, z_prob = L.get_output([self.actor, self.labeler], deterministic=False) y_xent = categorical_crossentropy(y_prob, y_gold) z_xent = categorical_crossentropy(z_prob, z_gold) y_pred = T.argmax(y_prob, 1) z_pred = T.argmax(z_prob, 1) z_mask = T.eq(y_pred, y_gold) & T.lt(y_gold, self.args.idsh) acc_y = T.mean(T.cast(T.eq(y_pred, y_gold), theano.config.floatX)) acc_z = T.cast(T.sum(T.eq(z_pred, z_gold) * z_mask) + 1., theano.config.floatX)\ / T.cast(T.sum(z_mask) + 1., theano.config.floatX) cost = T.mean(y_xent) + T.mean(z_xent * z_mask) params = L.get_all_params([self.actor, self.labeler] + self.taggers.values(), trainable='True') avg_params = L.get_all_params([self.actor_avg, self.labeler_avg] + self.taggers_avg.values(), trainable='True') # accuracy of all auxiliary tasks acc_w = acc_y - acc_y # joint objective for aux tagger if self.args.aux_tagger: # tags of s0 are the targets for name in self.args.target_feats: w_gold = self.manager.feats[name].data[ sidx.dimshuffle(0, 'x'), tidx][:, 0] # (100, ) w_prob = L.get_output(self.taggers[name], deterministic=False) w_xent = categorical_crossentropy(w_prob, w_gold) w_mask = T.neq(w_gold, 0) cost += self.args.aux_ratio * T.mean(w_xent * w_mask) w_pred = T.argmax(w_prob, axis=1) acc = T.cast(T.sum(T.eq(w_pred, w_gold) * w_mask) + 1., theano.config.floatX)\ / T.cast(T.sum(w_mask) + 1., theano.config.floatX) acc_w += acc / len(self.args.target_feats) reg = regularize_network_params( L.get_all_layers([self.actor, self.labeler] + self.taggers.values()), l2) cost += self.args.reg_rate * reg updates = lasagne.updates.momentum(cost, params, lr, self.args.momentum) updates = apply_moving_average(params, avg_params, updates, self.step, 0.9999) self.train_parser = theano.function( [y_gold, z_gold, sidx, tidx, valid], [acc_y, acc_z, acc_w, cost], updates=updates, on_unused_input='ignore', allow_input_downcast=True)
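# --- Usage sketch (added illustration; the `parser` instance, shapes and random ids are assumptions) ---
# Feeding one batch of oracle actions/labels, sentence/token indices and the
# valid-action mask into the functions compiled in build_graph above.
import numpy as np

def parser_step_sketch(parser, n_sents=100, n_tokens=5, n_actions=4, n_labels=40):
    sidx = np.random.randint(0, 1000, size=n_sents)             # sentence ids
    tidx = np.random.randint(0, 30, size=(n_sents, n_tokens))   # token ids per sentence
    valid = np.ones((n_sents, n_actions), dtype='float32')      # valid-action mask
    y_gold = np.random.randint(0, n_actions, size=n_sents)      # oracle actions
    z_gold = np.random.randint(0, n_labels, size=n_sents)       # oracle labels
    acc_y, acc_z, acc_w, cost = parser.train_parser(y_gold, z_gold, sidx, tidx, valid)
    outputs = parser.actor_predict_avg(sidx, tidx, valid)       # [action probs, label probs, aux preds...]
    return cost, outputs[0]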
##### On passing real images to the discriminator D_loss_real = sigmoid_cross_entropy_with_logits_v1(D_real, 1) ##### On passing fake images to the discriminator D_loss_fake = sigmoid_cross_entropy_with_logits_v1(D_fake, 0) #### Taking the mean of the two D_loss = D_loss_real.mean() + D_loss_fake.mean() # Generator loss (discriminator is frozen, i.e. not updated) #### On passing fake images to the discriminator G_loss_fake = sigmoid_cross_entropy_with_logits_v1(D_fake, 1) G_loss = G_loss_fake.mean() # Finding the parameters for training D_theta = LL.get_all_params(dis_layers[-1], trainable=True) G_theta = LL.get_all_params(gen_layers[-1], trainable=True) # Updating the parameters with the Adam optimizer (default hyperparameters) D_solver = lasagne.updates.adam(D_loss, D_theta) G_solver = lasagne.updates.adam(G_loss, G_theta) ##### Writing training functions for both networks D_train_fn = th.function(inputs=[Dis_input, Gen_input, input_labels], outputs=[D_loss], updates=D_solver) G_train_fn = th.function(inputs=[Gen_input, input_labels], outputs=[G_loss], updates=G_solver) i = 0
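# --- Minimal training-step sketch (added illustration) ---
# Alternating the two update functions compiled above. `real_images`, `labels` and the
# noise dimension are assumptions; only D_train_fn / G_train_fn come from the code above.
import numpy as np

def sample_noise(batch_size, noise_dim=100):
    return np.random.uniform(-1., 1., size=(batch_size, noise_dim)).astype('float32')

def gan_train_step_sketch(real_images, labels):
    batch_size = real_images.shape[0]
    d_loss, = D_train_fn(real_images, sample_noise(batch_size), labels)  # update D on real + fake
    g_loss, = G_train_fn(sample_noise(batch_size), labels)               # update G with D frozen
    return d_loss, g_loss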
input_data = T.ftensor3('input_data') input_mask = T.fmatrix('input_mask') target_data = T.imatrix('target_data') target_mask = T.fmatrix('target_mask') network_output = deep_prj_lstm_model_v1(input_var=input_data, mask_var=input_mask, num_inputs=input_dim, num_outputs=output_dim, num_layers=args.num_layers, num_units=args.num_units, num_prjs=args.num_prjs, grad_clipping=args.grad_clipping, dropout=args.dropout) network = network_output network_params = get_all_params(network, trainable=True) network_reg_params = get_all_params(network, trainable=True, regularizable=True) param_count = count_params(network, trainable=True) print('Number of parameters of the network: {:.2f}M'.format( float(param_count) / 1000000)) ###################### # reload model param # ###################### if args.reload_model: print('Loading model: {}'.format(args.reload_model)) with open(args.reload_model, 'rb') as f: [ pretrain_network_params_val, pretrain_update_params_val,
def train_class(ds, paths, funcs, cla, updates, param_arch, param_cost, param_updates, param_train): # creates a log file containing the training behaviour, # saves it to file formatter = logging.Formatter('%(asctime)s %(message)s', "%Y-%m-%d %H:%M:%S") logger = logging.getLogger('log_training') if 'start_from_epoch' in param_train: name_tmp = 'training_from_epoch=%04d.log' % (param_train['start_from_epoch']) else: name_tmp = 'training.log' path_tmp = os.path.join(paths['exp'], name_tmp) if not os.path.isfile(path_tmp): file_handler = logging.FileHandler(path_tmp, mode='w') else: raise Exception('[e] the log file %s already exists!' % name_tmp) file_handler.setFormatter(formatter) logger.addHandler(file_handler) # and shows it to screen console_handler = logging.StreamHandler() console_handler.setFormatter(formatter) logger.addHandler(console_handler) logger.setLevel(logging.INFO) logger.info("Training stats:") cost_train_avg = [] cla_train_avg = [] reg_train_avg = [] grad_norm_train_avg = [] acc_train_avg = [] cost_test_avg = [] grad_norm_test_avg = [] acc_test_avg = [] for i_ in xrange(param_train['n_epochs']): if 'start_from_epoch' in param_train: i = i_ + param_train['start_from_epoch'] else: i = i_ cost_train = [] cla_train = [] reg_train = [] cost_test = [] grad_norm_train = [] grad_norm_test = [] acc_train = [] acc_test = [] tic = time.time() for x in ds.train_iter(): tmp = funcs['train'](*x) cost_train.append(tmp[0]) cla_train.append(tmp[1]) reg_train.append(tmp[2]) grad_norm_train.append(tmp[3]) acc_train.append(tmp[4]) if ((i + 1) % param_train['freq_viz_train']) == 0: cost_train_avg.append(np.mean(cost_train)) cla_train_avg.append(np.mean(cla_train)) reg_train_avg.append(np.mean(reg_train)) grad_norm_train_avg.append(np.mean(grad_norm_train)) acc_train_avg.append(np.mean(acc_train)) string = "[TRN] epoch = %03i, cost = %3.2e (cla = %3.2e, reg = %3.2e), |grad| = %.2e, acc = %02.2f%% (%03.2fs)" % \ (i + 1, cost_train_avg[-1], cla_train_avg[-1], reg_train_avg[-1], grad_norm_train_avg[-1], 100.*acc_train_avg[-1], time.time() - tic) logger.info(string) if ((i + 1) % param_train['freq_viz_test']) == 0: tic = time.time() for x in ds.test_fwd(): tmp = funcs['fwd'](*x) cost_test.append(tmp[0]) grad_norm_test.append(tmp[1]) acc_test.append(tmp[2]) cost_test_avg.append(np.mean(cost_test)) grad_norm_test_avg.append(np.mean(grad_norm_test)) acc_test_avg.append(np.mean(acc_test)) string = "[TST] epoch = %03i, cost = %3.2e, |grad| = %.2e, acc = %02.2f%% (%03.2fs)" % \ (i + 1, cost_test_avg[-1], grad_norm_test_avg[-1], 100.*acc_test_avg[-1], time.time() - tic) logger.info(string) if param_train['flag_save_pkls']: if ((i + 1) % param_train['freq_save_pkls']) == 0: if not os.path.isdir(paths['pkls']): os.makedirs(paths['pkls']) name_dump = "%s/epoch=%04d.pkl" % (paths['pkls'], i + 1) keys_net = LL.get_all_params(cla) values_net = LL.get_all_param_values(cla, trainable=False) keys_updates = [k for k in updates.keys()] values_updates = [k.get_value() for k in updates.keys()] tmp = [paths, param_arch, param_cost, param_updates, param_train, cost_train_avg, acc_train_avg, cost_test_avg, acc_test_avg, keys_net, values_net, keys_updates, values_updates] with open(name_dump, 'wb') as f: cPickle.dump(tmp, f) if param_train['flag_save_preds']: if ((i + 1) % param_train['freq_save_preds']) == 0: for j, k in enumerate(ds.test_fwd()): path_dump = os.path.join(paths['preds'], "epoch=%04d" % (i + 1)) if not os.path.isdir(path_dump): os.makedirs(path_dump) name_dump = os.path.join(path_dump, 
ds.names_test[j]) tmp = funcs['pred'](*k) scipy.io.savemat(name_dump, {'pred': tmp[0]}) return cost_train_avg, acc_train_avg, cost_test_avg, acc_test_avg
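# --- Usage sketch (added illustration): calling train_class and summarising its result ---
# `ds`, `paths`, `funcs`, `cla`, `updates` and the param_* dicts are assumed to be the
# objects built earlier in the pipeline (e.g. by the step that compiled funcs['train']).
def run_training_sketch(ds, paths, funcs, cla, updates, param_arch, param_cost, param_updates, param_train):
    cost_tr, acc_tr, cost_te, acc_te = train_class(ds, paths, funcs, cla, updates, param_arch, param_cost, param_updates, param_train)
    print('[run] best test accuracy: %02.2f%%' % (100. * max(acc_te)))
    return cost_tr, acc_tr, cost_te, acc_te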