Example #1
	def get_params(self):
		'''returns a list of the trainable parameters, that is, the query
		and context embeddings.  (similar to layer.get_all_params.)'''
		return (
			get_all_params(self.l_embed_query, trainable=True) +
			get_all_params(self.l_embed_context, trainable=True)
		)
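# A minimal, self-contained sketch of how the parameter list returned by
# get_params() might be consumed; the embedding sizes, the dummy loss and the
# adam update below are illustrative assumptions, not part of the original model.
import theano
import theano.tensor as T
import lasagne
from lasagne.layers import InputLayer, EmbeddingLayer, get_output, get_all_params

ids = T.imatrix('ids')
l_in = InputLayer((None, None), input_var=ids)
l_embed_query = EmbeddingLayer(l_in, input_size=1000, output_size=32)
l_embed_context = EmbeddingLayer(l_in, input_size=1000, output_size=32)

# same parameter collection as get_params() above
params = (get_all_params(l_embed_query, trainable=True) +
          get_all_params(l_embed_context, trainable=True))

loss = T.mean(get_output(l_embed_query) * get_output(l_embed_context))  # dummy loss
updates = lasagne.updates.adam(loss, params, learning_rate=0.01)
train_fn = theano.function([ids], loss, updates=updates)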
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer((None, seq_len, input_dim),
                      input_var=theano.shared(np.random.normal(size=[batch_size, seq_len, input_dim])),
                      name='input seq')

    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')

    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell, s_prev_hid, s_in, name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({
        s_lstm_cell: s_prev_cell,
        s_lstm_hid: s_prev_hid,
        s_gru: s_prev_gru}),
        input_sequences={s_in: l_in},
        unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print (old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print (old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
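# The test above relies on both implementations exposing identically named,
# shape-compatible parameters.  A small standalone helper capturing the same
# name-sorted transfer (an assumed convenience, not part of the test itself):
def transfer_params_by_name(dst_layer, src_layer):
    from lasagne.layers import get_all_params
    dst = sorted(get_all_params(dst_layer, trainable=True), key=lambda p: p.name)
    src = sorted(get_all_params(src_layer, trainable=True), key=lambda p: p.name)
    assert len(dst) == len(src), 'parameter count mismatch'
    for d, s in zip(dst, src):
        assert tuple(d.shape.eval()) == tuple(s.shape.eval())
        d.set_value(s.get_value())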
Example #3
    def _compile(self):
        rc = self.rc

        # actor gradient step
        O = self.net.O
        V = ll.get_output(self.net.critic)
        params = self.net.actor_params
        regl_params = ll.get_all_params(self.net.actor, regularizable=True)
        regl = 0.5*rc['l2_actor']*tt.sum([tt.sum(p**2) for p in regl_params])
        updates = rc['gradient_updates'](V.mean()+regl, params, learning_rate=rc['lr_actor'])
        self.update_actor = th.function([O], [V.mean()], updates=updates)

        # critic bellman error (test version, doesn't update parameters)
        U = tt.matrix()
        Q = ll.get_output(self.net.critic, inputs={self.net.actor: U})
        Y = tt.matrix()
        J = 0.5*tt.mean((Y-Q)**2)
        self.J = th.function([O, U, Y], J)

        # critic bellman error (train version, does update parameters)
        regl_params = [p for p in ll.get_all_params(self.net.critic, regularizable=True)
                if p not in ll.get_all_params(self.net.actor)]
        regl = 0.5*rc['l2_critic']*tt.sum([tt.sum(p**2) for p in regl_params])
        params = self.net.critic_params
        updates = rc['gradient_updates'](J+regl, params, learning_rate=rc['lr_critic'])
        self.update_critic = th.function([O, U, Y], J, updates=updates)

        # target network update
        updates = []
        tau = rc['tau']
        for p,tgt_p in zip(self.net.all_params, self.target_net.all_params):
            updates.append( (tgt_p, tau*p + (1-tau)*tgt_p) )
        self.update_target = th.function([], [], updates=updates)

        # build cost function
        # TODO: handle this better through rc
        x = tt.vector()
        u = tt.vector()
        site_xpos = tt.matrix()

        # L2 costs
        c = 0.5*rc['l2_q']*tt.sum(x[:self.model['nq']]**2)
        c += 0.5*rc['l2_v']*tt.sum(x[-self.model['nv']:]**2)
        c += 0.5*rc['l2_u']*tt.sum(u**2)

        # Huber costs
        if rc['huber_site'] is not None:
            a = rc['huber_alpha']
            d = site_xpos[0] - site_xpos[1]
            c += rc['huber_site']*(tt.sqrt(tt.sum(d**2) + a**2) - a)

        # compile cost function
        # TODO: remove need for 'on_unused_input'
        self.cost = th.function([x, u, site_xpos], c, on_unused_input='ignore')
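# Isolated numeric sketch of the soft target-network update used above,
# theta_target <- tau * theta + (1 - tau) * theta_target, on plain shared
# variables (the values below are made up for illustration only).
import numpy as np
import theano

tau = 0.001
p = theano.shared(np.ones(3))
tgt_p = theano.shared(np.zeros(3))
update_target = theano.function([], [], updates=[(tgt_p, tau * p + (1 - tau) * tgt_p)])
update_target()
print(tgt_p.get_value())   # -> [0.001 0.001 0.001]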
Example #4
        def set_decoder_weights(decoder_1step):
            """
            set 1step weights equal to training decoder/probas_predictor weights
            """
            params_1step = get_all_params(decoder_1step)
            params_full = get_all_params(self.net['l_dist'])
            params_full_dict = {p.name: p for p in params_full}

            for param_1step in params_1step:
                # use Theano .get_value() and .set_value() methods, applied to the shared variables
                param_1step.set_value(params_full_dict[param_1step.name].get_value())
Example #5
    def test_get_all_params(self):
        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
        l1 = InputLayer((10, 20))
        l2 = DenseLayer(l1, 30)
        l3 = DenseLayer(l2, 40)

        assert get_all_params(l3) == l2.get_params() + l3.get_params()
        assert (get_all_params(l3, regularizable=False) ==
                (l2.get_params(regularizable=False) +
                 l3.get_params(regularizable=False)))

        assert (get_all_params(l3, regularizable=True) ==
                (l2.get_params(regularizable=True) +
                 l3.get_params(regularizable=True)))
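# Quick illustration of the tag filtering exercised by the test above: in
# Lasagne, DenseLayer tags W as regularizable but not b, so filtering by the
# regularizable tag splits the two parameter sets.
from lasagne.layers import InputLayer, DenseLayer, get_all_params

l = DenseLayer(InputLayer((10, 20)), 30)
print(get_all_params(l))                       # [W, b]
print(get_all_params(l, regularizable=True))   # [W]
print(get_all_params(l, regularizable=False))  # [b]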
Example #6
def build_optimizer(network, placeholders, optimization, learning_rate):

	# build loss function 
	
	if optimization['objective'] == 'lower_bound':
		if 'binary' in optimization:
			binary = optimization['binary']
		else:
			binary = False

		loss, prediction = variational_lower_bound(network, placeholders['inputs'], 
													deterministic=False, binary=binary)

		# regularize parameters
		loss += regularization(network['X'], optimization)	

		params = layers.get_all_params(network['X'], trainable=True)

	else:
		prediction = layers.get_output(network['output'], deterministic=False)
		loss = build_loss(placeholders['targets'], prediction, optimization)

		# regularize parameters
		loss += regularization(network['output'], optimization)

		params = layers.get_all_params(network['output'], trainable=True)    


	# calculate and clip gradients
	if "weight_norm" in optimization:
		weight_norm = optimization['weight_norm']
	else:
		weight_norm = None
	grad = calculate_gradient(loss, params, weight_norm=weight_norm)
	  
	# setup parameter updates
	update_op = build_updates(grad, params, optimization, learning_rate)

	# test/validation set 
	if optimization['objective'] == 'lower_bound':
		test_loss, test_prediction = variational_lower_bound(network, placeholders['inputs'], deterministic=True, binary=binary)
	else:
		test_prediction = layers.get_output(network['output'], deterministic=True)
		test_loss = build_loss(placeholders['targets'], test_prediction, optimization)
			
	# create theano function
	train_fun = theano.function(list(placeholders.values()), [loss, prediction], updates=update_op)
	test_fun = theano.function(list(placeholders.values()), [test_loss, test_prediction])

	return train_fun, test_fun
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )

    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

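    # NOTE: input1_var, input1_mask_var, input2_var and input2_mask_var are not
    # created in this function; they are presumably defined at module scope
    # alongside the network construction.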
    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )

    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
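# The if/elif chain above can also be written as a dictionary dispatch; a
# self-contained sketch using the same optimizer names (purely a stylistic
# alternative, not taken from the original code):
from lasagne.updates import sgd, adagrad, adadelta, nesterov_momentum, rmsprop, adam

OPTIMIZERS = {
    "sgd": sgd,
    "adagrad": adagrad,
    "adadelta": adadelta,
    "nesterov": nesterov_momentum,
    "rms": rmsprop,
    "adam": adam,
}

def get_updates(optimizer_name, loss, params, step):
    if optimizer_name not in OPTIMIZERS:
        raise ValueError("Unknown optimizer %r; expected one of %s"
                         % (optimizer_name, sorted(OPTIMIZERS)))
    return OPTIMIZERS[optimizer_name](loss, params, learning_rate=step)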
Example #8
def test_maxpool_layer():
    l_in1 = InputLayer((None, 2))
    l_in2 = InputLayer((None, 20))
    l_hid = DenseLayer(l_in2, num_units=30, nonlinearity=rectify)
    l_pool = MaxpoolLayer([l_in1, l_hid])
    l_out = DenseLayer(l_pool, num_units=1, nonlinearity=sigmoid)

    bounds = theano.tensor.lmatrix('bounds')
    data = theano.tensor.matrix('data')
    targets = theano.tensor.matrix('targets')

    predictions = get_output(l_out, {l_in1: bounds, l_in2: data})
    loss = categorical_crossentropy(predictions, targets)
    loss = aggregate(loss, mode='mean')

    params = get_all_params(l_out)
    updates_sgd = sgd(loss, params, learning_rate=0.0001)

    train_function = theano.function([bounds, data, targets], updates=updates_sgd, allow_input_downcast=True)

    test_bounds = np.array([[0, 3], [3, 5], [5, 7]])
    test_X = np.random.randn(10, 20)
    test_Y = np.array([[0], [1], [0]])

    train_function(test_bounds, test_X, test_Y)
Example #9
    def init_model(self):
        print('Initializing model...')
        ra_input_var = T.tensor3('raw_audio_input')
        mc_input_var = T.tensor3('melody_contour_input')
        target_var = T.imatrix('targets')
        network = self.build_network(ra_input_var, mc_input_var)
        prediction = layers.get_output(network)
        prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        params = layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

        test_prediction = layers.get_output(network, deterministic=True)
        test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                                target_var)
        test_loss = test_loss.mean()
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                          dtype=theano.config.floatX)

        print('Building functions...')
        self.train_fn = theano.function([ra_input_var, mc_input_var, target_var], 
                                        [loss, prediction], 
                                        updates=updates, 
                                        on_unused_input='ignore')
        self.val_fn = theano.function([ra_input_var, mc_input_var, target_var], 
                                        [test_loss, test_acc, test_prediction], 
                                        on_unused_input='ignore')
        self.run_fn = theano.function([ra_input_var, mc_input_var],
                                        [prediction],
                                        on_unused_input='ignore')
Example #10
    def __init__(self, dims, nonlinearities=None, dropouts=None,
                 update_fn=None, batch_norm=False,
                 loss_type='cosine_margin', margin=0.8):
        """Initialize a Siamese neural network

        Parameters:
        -----------
        update_fn: theano function with 2 arguments (loss, params)
            Update scheme, default to adadelta
        batch_norm: bool
            Apply batch normalisation after each hidden layer, default to False
        """
        assert len(dims) >= 3, 'Not enough dimensions'
        if dropouts is not None:
            dropouts = copy.copy(dropouts)
            assert len(dropouts) == len(dims) - 1
            dropouts.append(0)
        else:
            dropouts = [0] * len(dims)
        if nonlinearities is None:
            nonlinearities = [nl.sigmoid] * (len(dims) - 1)
        else:
            assert len(nonlinearities) == len(dims) - 1
        if update_fn is None:
            update_fn = lasagne.updates.adadelta
        self.input_var1 = T.matrix('inputs1')
        self.input_var2 = T.matrix('inputs2')
        self.target_var = T.ivector('targets')
        # input layer
        network1 = layers.InputLayer((None, dims[0]), input_var=self.input_var1)
        network2 = layers.InputLayer((None, dims[0]), input_var=self.input_var2)
        if dropouts[0]:
            network1 = layers.DropoutLayer(network1, p=dropouts[0])
            network2 = layers.DropoutLayer(network2, p=dropouts[0])
        # hidden layers
        for dim, dropout, nonlin in zip(dims[1:], dropouts[1:], nonlinearities):
            network1 = layers.DenseLayer(network1, num_units=dim,
                                         W=lasagne.init.GlorotUniform(),
                                         nonlinearity=nonlin)
            network2 = layers.DenseLayer(network2, num_units=dim,
                                         W=network1.W, b=network1.b,
                                         nonlinearity=nonlin)
            if batch_norm:
                network1 = layers.batch_norm(network1)
                network2 = layers.batch_norm(network2)
            if dropout:
                network1 = layers.DropoutLayer(network1, p=dropout)
                network2 = layers.DropoutLayer(network2, p=dropout)
        self.network = [network1, network2]
        self.params = layers.get_all_params(network1, trainable=True)

        # util functions, completely stolen from Lasagne example
        self.prediction1 = layers.get_output(network1)
        self.prediction2 = layers.get_output(network2)
        # deterministic versions for evaluation (dropout disabled at test time)
        self.test_prediction1 = layers.get_output(network1, deterministic=True)
        self.test_prediction2 = layers.get_output(network2, deterministic=True)

        self.change_loss(loss_type, margin)
        self.change_update(update_fn)
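# Tiny standalone sketch of the weight sharing used above: passing W=network1.W
# and b=network1.b makes the twin DenseLayers reuse the very same shared
# variables (shapes and names below are arbitrary).
from lasagne import layers
import theano.tensor as T

x1, x2 = T.matrix('x1'), T.matrix('x2')
n1 = layers.DenseLayer(layers.InputLayer((None, 5), input_var=x1), num_units=4)
n2 = layers.DenseLayer(layers.InputLayer((None, 5), input_var=x2), num_units=4,
                       W=n1.W, b=n1.b)
assert n1.W is n2.W and n1.b is n2.b
# consequently both branches report the same single copy of the parameters
assert layers.get_all_params(n1) == layers.get_all_params(n2)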
Example #11
def parameter_analysis(layer):
    all_params = ll.get_all_param_values(layer, trainable=True)
    param_names = [p.name for p in ll.get_all_params(layer, trainable=True)]
    print_gradinfo(param_names, {'nneg':[np.count_nonzero(p < 0) / np.product(p.shape) for p in all_params],
                                 'norm':[np.linalg.norm(p) for p in all_params],
                                 'shape':[p.shape for p in all_params]})
    """
Example #12
    def build_model(self, train_x, test_x, valid_x, update, update_args):
        self.train_x = train_x
        self.test_x = test_x
        self.validation_x = valid_x
        self.update = update
        self.update_args = update_args
        self.index = T.iscalar('index')
        self.batch_slice = slice(self.index * self.batch_size, (self.index + 1) * self.batch_size)

        x = self.srng.binomial(size=self.x.shape, n=1, p=self.x)
        log_pz, log_qz_given_x, log_px_given_z = self.model.get_log_distributions(self.x)
        loss_eval = (log_pz + log_px_given_z - log_qz_given_x).sum()
        loss_eval /= self.batch_size

        all_params = get_all_params(self.model)
        updates = self.update(-loss_eval, all_params, *self.update_args)

        train_model = theano.function([self.index], loss_eval, updates=updates,
                                      givens={self.x: self.train_x[self.batch_slice], },)

        test_model = theano.function([self.index], loss_eval,
                                     givens={self.x: self.test_x[self.batch_slice], },)

        validate_model = theano.function([self.index], loss_eval,
                                         givens={self.x: self.validation_x[self.batch_slice], },)

        return train_model, test_model, validate_model
Example #13
	def makeRegressionNetwork(self,n_in,n_hidden,n_out,learning_rate=0.001):
		"""
			build a feedforward neural network with regression output
		"""
		#network input
		input_ = T.matrix('input_')  # matrix of shape batch size times number of input variables
		target_ = T.matrix('target_')# matrix of shape batch size times number of output variables

		#network
		l_input=layers.InputLayer((None,n_in))
		l_hid=layers.DenseLayer(l_input,num_units=n_hidden)
		self.l_out=layers.DenseLayer(l_hid,num_units=n_out,nonlinearity=None)

		#network output
		l_outvalue = layers.get_output(self.l_out, input_)
		self.predict=theano.function([input_],l_outvalue,allow_input_downcast=True)

		#loss/cost function
		loss = T.mean(lasagne.objectives.squared_error(l_outvalue, target_))

		#calculate the updates
		params = layers.get_all_params(self.l_out)
		updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=learning_rate, momentum=0.9)

		#update the weights from a given input and target
		self.train_function = theano.function([input_, target_],loss, updates=updates,allow_input_downcast=True)
Example #14
    def _get_train_fun(self):
        output_probs = get_output(self.net['l_dist'])   # "long" 2d matrix with prob distribution

        input_ids = T.imatrix()
        # cut off the first ids from every id sequence: they correspond to START_TOKEN, that we are not predicting
        target_ids = input_ids[:, 1:]
        target_ids_flattened = target_ids.flatten()               # "long" vector with target ids

        cost = categorical_crossentropy(
            predictions=output_probs,
            targets=target_ids_flattened
        ).mean()

        all_params = get_all_params(self.net['l_dist'], trainable=True)

        print("Computing train updates...")
        updates = lasagne.updates.adadelta(
            loss_or_grads=cost,
            params=all_params,
            learning_rate=LEARNING_RATE
        )

        print("Compiling train function...")
        train_fun = theano.function(
            inputs=[self.net['l_in_x'].input_var, self.net['l_in_y'].input_var, input_ids],
            outputs=cost,
            updates=updates
        )

        return train_fun
Example #15
    def test_get_all_params_with_unwrap_shared(self):
        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
        import theano.tensor as T
        from lasagne.utils import floatX

        l1 = InputLayer((10, 20))
        l2 = DenseLayer(l1, 30)

        W1 = theano.shared(floatX(numpy.zeros((30, 2))))
        W2 = theano.shared(floatX(numpy.zeros((2, 40))))
        W_expr = T.dot(W1, W2)
        l3 = DenseLayer(l2, 40, W=W_expr, b=None)

        l2_params = get_all_params(l2)
        assert get_all_params(l3) == l2_params + [W1, W2]
        assert get_all_params(l3, unwrap_shared=False) == l2_params + [W_expr]
Example #16
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer     = InputLayer(shape=(None, 12, 64, 64), input_var=input_var) #InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    layer     = DimshuffleLayer(layer, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = Conv3DDNNLayer(incoming=layer, num_filters=1, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=sigmoid)
    layer_prediction  = layer

    # Loss
    prediction           = get_output(layer_prediction)
    loss                 = binary_crossentropy(prediction[:,0,:,:,:], target_var).mean()

    #Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True)
    test_loss            = binary_crossentropy(test_prediction[:,0,:,:,:], target_var).mean()

    return test_prediction, prediction, loss, params
Example #17
    def __init__(self, sound_shape, num_units, main_layer_class, loss_func, updates_func):
        # input tensor (number of batches, number of recordings, time, frequency)
        input_X = T.tensor4("X")

        # network
        input_layer = InputLayer(shape=(None, 3) + sound_shape, input_var=input_X.swapaxes(2, 3))
        all_output = main_layer_class(input_layer, sound_shape, num_units)  # for loss
        vector_output = ReshapeLayer(all_output, (-1, 1, num_units))  # for use

        # network predictions
        all_predicted = get_output(all_output)  # for loss
        vector_predicted = get_output(vector_output)  # for use

        # loss function
        loss = loss_func(all_predicted)

        # compute updated weights via a gradient step
        trainable_weights = get_all_params(all_output, trainable=True)
        updates_sgd = updates_func(loss, trainable_weights)

        # function that trains the network for one step and returns the loss value
        self.fit = theano.function([input_X], loss, updates=updates_sgd)

        # function that returns the voice vector
        self.predict = theano.function([input_X], vector_predicted)

        self.all_output = all_output
        self.vector_output = vector_output
        self.all_predicted = all_predicted
        self.vector_predicted = vector_predicted
def create_encoder_decoder_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    X_hat = get_output(layers['l_decoder_out'], X, deterministic=False)

    # reconstruction loss
    encoder_decoder_loss = T.mean(
        T.mean(T.sqr(X - X_hat), axis=1)
    )

    if apply_updates:
        # all layers that participate in the forward pass should be updated
        encoder_decoder_params = get_all_params(
            layers['l_decoder_out'], trainable=True)

        encoder_decoder_updates = nesterov_momentum(
            encoder_decoder_loss, encoder_decoder_params, 0.01, 0.9)
    else:
        encoder_decoder_updates = None

    encoder_decoder_func = theano.function(
        inputs=[theano.In(X_batch)],
        outputs=encoder_decoder_loss,
        updates=encoder_decoder_updates,
        givens={
            X: X_batch,
        },
    )

    return encoder_decoder_func
def create_iter_funcs_train(l_out, lr, mntm, wd):
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    y_hat = layers.get_output(l_out, X, deterministic=False)

    # softmax loss
    train_loss = T.mean(
        T.nnet.categorical_crossentropy(y_hat, y))

    # L2 regularization
    train_loss += wd * regularize_network_params(l_out, l2)

    train_acc = T.mean(
        T.eq(y_hat.argmax(axis=1), y))

    all_params = layers.get_all_params(l_out, trainable=True)
    updates = lasagne.updates.nesterov_momentum(
        train_loss, all_params, lr, mntm)

    train_iter = theano.function(
        inputs=[theano.In(X_batch), theano.In(y_batch)],
        outputs=[train_loss, train_acc],
        updates=updates,
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return train_iter
    def _create_nnet(input_dims, output_dims, learning_rate, num_hidden_units=15, batch_size=32, max_train_epochs=1,
                     hidden_nonlinearity=nonlinearities.rectify, output_nonlinearity=None, update_method=updates.sgd):
        """
        A subclass may override this if a different sort
        of network is desired.
        """
        nnlayers = []
        nnlayers.append(layers.InputLayer(shape=(None, input_dims)))
        nnlayers.append(layers.DenseLayer(nnlayers[-1], num_hidden_units, nonlinearity=hidden_nonlinearity))
        nnlayers.append(layers.DenseLayer(nnlayers[-1], output_dims, nonlinearity=output_nonlinearity))

        prediction = layers.get_output(nnlayers[-1])

        input_var = nnlayers[0].input_var
        target = T.matrix(name="target", dtype=floatX)

        loss = objectives.squared_error(prediction, target).mean()

        params = layers.get_all_params(nnlayers[-1], trainable=True)

        updates = update_method(loss, params, learning_rate)

        fit = theano.function([input_var, target], loss, updates=updates)

        predict = theano.function([input_var], prediction)

        nnet = Mock(
            fit=fit,
            predict=predict,
        )
        return nnet
Example #21
def get_model(input_images, input_position, input_mult, target_var):

    # number of SAX and distance between SAX slices
    #indexes = []
    #for i in range(input_position.shape[0]):
    #    indexes.append(numpy.where(input_position[i][:,0] == 0.)[0][0])
    
    # input layer with unspecified batch size
    layer     = InputLayer(shape=(None, 22, 30, 64, 64), input_var=input_images) #InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    
    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = Conv3DDNNLayer(incoming=layer, num_filters=22, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=sigmoid)

    layer_max     = ExpressionLayer(layer, lambda X: X.max(1), output_shape='auto')
    layer_min     = ExpressionLayer(layer, lambda X: X.min(1), output_shape='auto')
    
    layer_prediction = layer
    # image prediction
    prediction           = get_output(layer_prediction)
        
    loss                 = binary_crossentropy(prediction, target_var).mean()

    #Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True)
    test_loss            = binary_crossentropy(test_prediction, target_var).mean()

    return test_prediction, prediction, loss, params
def load_weights(layer, filename):
    with open(filename, 'rb') as f:
        src_params_list = pickle.load(f)

    dst_params_list = get_all_params(layer)
    # assign the parameter values stored on disk to the model
    for src_params, dst_params in zip(src_params_list, dst_params_list):
        dst_params.set_value(src_params)
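# A matching save helper (assumed here, not shown in the original) producing a
# file that load_weights() above can read back: it pickles the raw parameter
# values in the same order get_all_params() returns them.
import pickle
from lasagne.layers import get_all_param_values

def save_weights(layer, filename):
    with open(filename, 'wb') as f:
        pickle.dump(get_all_param_values(layer), f)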
Example #23
def init_weights(l_out, init_file):
    print('loading weights from %s' % (init_file))
    with open(init_file, 'rb') as ifile:
        src_param_values = pickle.load(ifile)
    dst_params = layers.get_all_params(l_out)
    for src_values, dst_param in zip(src_param_values, dst_params):
        print('loading pretrained weights for %s' % (dst_param.name))
        dst_param.set_value(src_values)
Example #24
    def build_treatment_model(self, n_vars, **kwargs):

        input_vars = TT.matrix()
        instrument_vars = TT.matrix()
        targets = TT.vector()

        inputs = layers.InputLayer((None, n_vars), input_vars)
        inputs = layers.DropoutLayer(inputs, p=0.2)

        dense_layer = layers.DenseLayer(inputs, 2 * kwargs['dense_size'], nonlinearity=nonlinearities.rectify)
        dense_layer = layers.batch_norm(dense_layer)
        dense_layer = layers.DropoutLayer(dense_layer, p=0.2)

        for _ in range(kwargs['n_dense_layers'] - 1):
            dense_layer = layers.DenseLayer(dense_layer, kwargs['dense_size'], nonlinearity=nonlinearities.rectify)
            dense_layer = layers.batch_norm(dense_layer)

        self.treatment_output = layers.DenseLayer(dense_layer, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.treatment_output)

        prediction = layers.get_output(self.treatment_output, deterministic=False)
        test_prediction = layers.get_output(self.treatment_output, deterministic=True)

        l2_cost = regularization.regularize_network_params(self.treatment_output, regularization.l2)
        loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

        params = layers.get_all_params(self.treatment_output, trainable=True)
        param_updates = updates.adadelta(loss, params)

        self._train_fn = theano.function(
            [
                input_vars,
                targets,
                instrument_vars,
            ],
            loss,
            updates=param_updates
        )

        self._loss_fn = theano.function(
            [
                input_vars,
                targets,
                instrument_vars,
            ],
            loss,
        )

        self._output_fn = theano.function(
            [
                input_vars,
            ],
            test_prediction,
        )

        return init_params
def create_discriminator_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    pz = T.fmatrix('pz')

    X_batch = T.fmatrix('X_batch')
    pz_batch = T.fmatrix('pz_batch')

    # the discriminator receives samples from q(z|x) and p(z)
    # and should predict to which distribution each sample belongs
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={
            layers['l_prior_in']: pz,
            layers['l_encoder_in']: X,
        },
        deterministic=False,
    )

    # label samples from q(z|x) as 1 and samples from p(z) as 0
    discriminator_targets = T.vertical_stack(
        T.ones((X_batch.shape[0], 1)),
        T.zeros((pz_batch.shape[0], 1))
    )

    discriminator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs,
            discriminator_targets,
        )
    )

    if apply_updates:
        # only layers that are part of the discriminator should be updated
        discriminator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, discriminator=True)

        discriminator_updates = nesterov_momentum(
            discriminator_loss, discriminator_params, 0.1, 0.0)
    else:
        discriminator_updates = None

    discriminator_func = theano.function(
        inputs=[
            theano.In(X_batch),
            theano.In(pz_batch),
        ],
        outputs=discriminator_loss,
        updates=discriminator_updates,
        givens={
            X: X_batch,
            pz: pz_batch,
        },
    )

    return discriminator_func
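# The trainable=True, discriminator=True filter above only finds parameters
# that carry a custom 'discriminator' tag.  A minimal sketch of attaching such
# a tag in Lasagne (layer shapes and names here are illustrative):
from lasagne.layers import InputLayer, DenseLayer, get_all_params

l_disc = DenseLayer(InputLayer((None, 8)), num_units=1)
for param in l_disc.get_params():
    # layer.params maps each shared variable to its set of tag strings
    l_disc.params[param].add('discriminator')

assert get_all_params(l_disc, discriminator=True) == l_disc.get_params()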
Example #26
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer_input     = InputLayer(shape=(None, 30, 80, 80), input_var=input_var) #InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    layer_0         = DimshuffleLayer(layer_input, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_1         = batch_norm(Conv3DDNNLayer(incoming=layer_0, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_2         = batch_norm(Conv3DDNNLayer(incoming=layer_1, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_3         = MaxPool3DDNNLayer(layer_2, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_4         = DropoutLayer(layer_3, p=0.25)

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_5         = batch_norm(Conv3DDNNLayer(incoming=layer_4, num_filters=32, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_6         = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=32, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_7         = MaxPool3DDNNLayer(layer_6, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_8         = DropoutLayer(layer_7, p=0.25)
    
    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_5         = batch_norm(Conv3DDNNLayer(incoming=layer_8, num_filters=64, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_6         = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=64, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_7         = batch_norm(Conv3DDNNLayer(incoming=layer_6, num_filters=64, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_8         = MaxPool3DDNNLayer(layer_7, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_9         = DropoutLayer(layer_8, p=0.25)

    # LSTM
    layer         = DimshuffleLayer(layer_9, (0,2,1,3,4))
#    layer_prediction  = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True, cell=Gate(linear))
    layer = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True)
    layer_prediction = DenseLayer(layer, 2, nonlinearity=linear)

    # Output Layer
    # layer_hidden         = DenseLayer(layer_flatten, 500, nonlinearity=linear)
    # layer_prediction     = DenseLayer(layer_hidden, 2, nonlinearity=linear)

    # Loss
    prediction           = get_output(layer_prediction) / multiply_var**2
    loss                 = T.abs_(prediction - target_var)
    loss                 = loss.mean()

    #Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True) / multiply_var**2
    test_loss            = T.abs_(test_prediction - target_var)
    test_loss            = test_loss.mean()

    # crps estimate
    crps                 = T.abs_(test_prediction - target_var).mean()/600
    
    return test_prediction, crps, loss, params
Example #27
def create_network(available_actions_num):
    # Creates the input variables
    s1 = tensor.tensor4("States")
    a = tensor.vector("Actions", dtype="int32")
    q2 = tensor.vector("Next State best Q-Value")
    r = tensor.vector("Rewards")
    nonterminal = tensor.vector("Nonterminal", dtype="int8")

    # Creates the input layer of the network.
    dqn = InputLayer(shape=[None, 1, downsampled_y, downsampled_x], input_var=s1)

    # Adds 3 convolutional layers, each followed by a max pooling layer.
    dqn = Conv2DLayer(dqn, num_filters=32, filter_size=[8, 8],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[4, 4],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))

    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[3, 3],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    # Adds a single fully connected layer.
    dqn = DenseLayer(dqn, num_units=512, nonlinearity=rectify, W=GlorotUniform("relu"),
                     b=Constant(.1))

    # Adds a single fully connected layer which is the output layer.
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_num, nonlinearity=None)

    # Theano stuff
    q = get_output(dqn)
    # Only q for the chosen actions is updated more or less according to following formula:
    # target Q(s,a,t) = r + gamma * max Q(s2,_,t+1)
    target_q = tensor.set_subtensor(q[tensor.arange(q.shape[0]), a], r + discount_factor * nonterminal * q2)
    loss = squared_error(q, target_q).mean()

    # Updates the parameters according to the computed gradient using rmsprop.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compiles theano functions
    print "Compiling the network ..."
    function_learn = theano.function([s1, q2, a, r, nonterminal], loss, updates=updates, name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1], tensor.argmax(q), name="test_fn")
    print "Network compiled."

    # Returns Theano objects for the net and functions.
    # We wouldn't need the net anymore but it is nice to save your model.
    return dqn, function_learn, function_get_q_values, function_get_best_action
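# Standalone illustration of the tensor.set_subtensor() trick above: only the
# Q-value of the chosen action is replaced by the TD target, all other entries
# keep their current value and therefore contribute zero error.
import numpy as np
import theano
import theano.tensor as tensor

q = tensor.matrix('q')
a = tensor.ivector('a')
target = tensor.vector('target')
f = theano.function([q, a, target],
                    tensor.set_subtensor(q[tensor.arange(q.shape[0]), a], target))
print(f(np.zeros((2, 3)), np.array([0, 2], dtype='int32'), np.array([1.0, 2.0])))
# [[1. 0. 0.]
#  [0. 0. 2.]]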
Example #28
def build_model0(input_var,target_var,regularW=0,params_load=None):
    network=layers.InputLayer(shape=(None,3,256,256),input_var=input_var)
    # size 256*256
    network=layers.Pool2DLayer(network,pool_size=(2,2),stride=2,pad=0,mode='average_inc_pad')
    #size 128*128
    network=layers.Pool2DLayer(network,pool_size=(2,2),stride=2,pad=0,mode='average_inc_pad')
    #size 64*64
    network=layers.Conv2DLayer(network,num_filters=32,filter_size=(5,5),
                               nonlinearity=nonLinear.leaky_rectify,
                               W=init.GlorotUniform(gain='relu'),pad='same'
                               )
    
    network=layers.MaxPool2DLayer(network,pool_size=(2,2))
    network=layers.DropoutLayer(network,p=0.15)
    #size 32*32
    network=layers.Conv2DLayer(network,num_filters=64,filter_size=(5,5),
                               nonlinearity=nonLinear.leaky_rectify,
                               W=init.GlorotUniform(gain='relu'),pad='same'
                               )
    
    network=layers.MaxPool2DLayer(network,pool_size=(2,2))
    network=layers.DropoutLayer(network,p=0.2)
    #size 16*16
    network=layers.Conv2DLayer(network,num_filters=128,filter_size=(5,5),
                               nonlinearity=nonLinear.leaky_rectify,
                               W=init.GlorotUniform(gain='relu'),pad='same'
                               )
    
    network=layers.MaxPool2DLayer(network,pool_size=(2,2))
    network=layers.DropoutLayer(network,p=0.3)
    #size 8*8
    network=layers.Conv2DLayer(network,num_filters=256,filter_size=(5,5),
                               nonlinearity=nonLinear.leaky_rectify,
                               W=init.GlorotUniform(gain='relu'),pad='same'
                               )
    
    network=layers.MaxPool2DLayer(network,pool_size=(2,2))
    network=layers.DropoutLayer(network,p=0.4)
    #size 4*4
    network = layers.GlobalPoolLayer(network)    
    network=layers.DenseLayer(network,num_units=1000,
                              nonlinearity=nonLinear.leaky_rectify,
                              W=init.GlorotUniform(gain='relu'))
    network=layers.DenseLayer(network,num_units=2,
                              nonlinearity=nonLinear.softmax)
    prediction=layers.get_output(network)
    loss = objectives.categorical_crossentropy(prediction, target_var)
    loss=loss.mean()
    
    params=layers.get_all_params(network,trainable=True)
    if params_load is not None:
        for p, pval in zip(params, params_load):
            p.set_value(pval)
    
    return network,loss,params
Example #29
def triplet_loss_iter(embedder, update_params={}):
    X_triplets = {
            'anchor':T.tensor4(),
            'positive':T.tensor4(),
            'negative':T.tensor4(),
            } # each will be a batch of images

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k:ll.get_output(embedder, X)[-1] for k, X in X_triplets.items()}
    predicted_embeds_valid = {k:ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_triplets.items()}

    # each output should be batch_size x embed_size

    # should give us a vector of batch_size of distances btw anchor and positive
    alpha = 0.2 # FaceNet alpha
    triplet_pos = lambda pred: (pred['anchor'] - pred['positive']).norm(2,axis=1)
    triplet_neg = lambda pred: (pred['anchor'] - pred['negative']).norm(2,axis=1)
    triplet_distances = lambda pred: (triplet_pos(pred) - triplet_neg(pred) + alpha).clip(0, np.inf)
    triplet_failed = lambda pred: T.mean(triplet_distances(pred) > alpha)
    triplet_loss = lambda pred: T.sum(triplet_distances(pred))

    decay = 0.001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: triplet_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'TL' # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True) # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']], [triplet_loss(predicted_embeds_valid),
                                                                                                          losses_reg(predicted_embeds_valid),
                                                                                                          triplet_failed(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
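# Numeric restatement of the triplet hinge used above,
# d = max(0, ||a - p||_2 - ||a - n||_2 + alpha), as a small numpy helper
# (alpha=0.2 mirrors the FaceNet margin chosen in the code):
import numpy as np

def triplet_distances_np(anchor, positive, negative, alpha=0.2):
    pos = np.linalg.norm(anchor - positive, axis=1)
    neg = np.linalg.norm(anchor - negative, axis=1)
    return np.clip(pos - neg + alpha, 0.0, None)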
Example #30
def regularization(network, optimization):
	all_params = layers.get_all_params(network, regularizable=True)    

	# weight-decay regularization
	loss = 0
	if "l1" in optimization:
		l1_penalty = apply_penalty(all_params, l1) * optimization["l1"]
		loss += l1_penalty
	if "l2" in optimization:
		l2_penalty = apply_penalty(all_params, l2)* optimization["l2"]        
		loss += l2_penalty 
	return loss
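# A self-contained version of the same weight-decay construction on a tiny
# network; the 'l1'/'l2' keys mirror what the helper above looks up in its
# `optimization` dict (the coefficient values are arbitrary).
from lasagne import layers
from lasagne.regularization import apply_penalty, l1, l2

network = layers.DenseLayer(layers.InputLayer((None, 10)), num_units=5)
optimization = {'l1': 1e-5, 'l2': 1e-4}

all_params = layers.get_all_params(network, regularizable=True)
penalty = (apply_penalty(all_params, l1) * optimization['l1'] +
           apply_penalty(all_params, l2) * optimization['l2'])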
Example #31
    def init_mdn(self, svi=False, n_components=1, rank=None,
                 mdn_actfun=lnl.tanh, homoscedastic=False, min_precisions=None,
                 **unused_kwargs):
        """
        :param svi: bool
            Whether to use SVI version or not
        :param n_components: int
        :param rank: int
        :param homoscedastic: bool
        :param unused_kwargs: dict
        :param mdn_actfun: lasagne nonlinearity
            activation function for hidden units
        :param min_precisions: minimum values for diagonal elements of precision
            matrix for all components (usually taken to be prior precisions)
        :return: None
        """
        self.svi, self.n_components, self.rank, self.mdn_actfun,\
            self.homoscedastic, self.min_precisions = \
            svi, n_components, rank, mdn_actfun, homoscedastic, min_precisions
        for key in unused_kwargs.keys():
            print("MDN ignoring unused input {0}".format(key))

        # hidden layers
        for l in range(len(self.n_hiddens)):
            self.layer['hidden_' + str(l + 1)] = dl.FullyConnectedLayer(
                last(self.layer), n_units=self.n_hiddens[l],
                actfun=self.mdn_actfun,
                svi=self.svi, name='h' + str(l + 1))

        last_hidden = last(self.layer)
        # mixture layers
        self.layer['mixture_weights'] = dl.MixtureWeightsLayer(last_hidden,
            n_units=self.n_components, actfun=lnl.softmax, svi=self.svi,
            name='weights')
        self.layer['mixture_means'] = dl.MixtureMeansLayer(last_hidden,
            n_components=self.n_components, n_dim=self.n_outputs, svi=self.svi,
            name='means')
        if self.homoscedastic:
            PrecisionsLayer = dl.MixtureHomoscedasticPrecisionsLayer
        else:
            PrecisionsLayer = dl.MixturePrecisionsLayer
        # why is homoscedastic an input to the layer init?
        self.layer['mixture_precisions'] = PrecisionsLayer(last_hidden,
            n_components=self.n_components, n_dim=self.n_outputs, svi=self.svi,
            name='precisions', rank=self.rank, homoscedastic=self.homoscedastic,
            min_precisions=min_precisions)

        last_mog = [self.layer['mixture_weights'],
                    self.layer['mixture_means'],
                    self.layer['mixture_precisions']]

        # mixture parameters
        # a : weights, matrix with shape (batch, n_components)
        # ms : means, list of len n_components with (batch, n_dim, n_dim)
        # Us : precision factors, n_components list with (batch, n_dim, n_dim)
        # ldetUs : log determinants of precisions, n_comp list with (batch, )
        self.a, self.ms, precision_out = ll.get_output(last_mog,
                                                       deterministic=False)
        self.Us = precision_out['Us']
        self.ldetUs = precision_out['ldetUs']
        self.comps = {
            **{'a': self.a},
            **{'m' + str(i): self.ms[i] for i in range(self.n_components)},
            **{'U' + str(i): self.Us[i] for i in range(self.n_components)}}

        # log probability of y given the mixture distribution
        # lprobs_comps : log probs per component, list of len n_components with (batch, )
        # probs : log probs of mixture, (batch, )

        self.lprobs_comps = [-0.5 * tt.sum(tt.sum((self.params - m).dimshuffle(
            [0, 'x', 1]) * U, axis=2)**2, axis=1) + ldetU
            for m, U, ldetU in zip(self.ms, self.Us, self.ldetUs)]
        self.lprobs = (MyLogSumExp(tt.stack(self.lprobs_comps, axis=1) + tt.log(self.a), axis=1)
                       - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze()

        # the quantities from above again, but with deterministic=True
        # --- in the svi case, this will disable injection of randomness;
        # the mean of weights is used instead
        self.da, self.dms, dprecision_out = ll.get_output(last_mog,
                                                          deterministic=True)
        self.dUs = dprecision_out['Us']
        self.dldetUs = dprecision_out['ldetUs']
        self.dcomps = {
            **{'a': self.da},
            **{'m' + str(i): self.dms[i] for i in range(self.n_components)},
            **{'U' + str(i): self.dUs[i] for i in range(self.n_components)}}

        self.dlprobs_comps = [-0.5 * tt.sum(tt.sum((self.params - m).dimshuffle(
            [0, 'x', 1]) * U, axis=2)**2, axis=1) + ldetU
            for m, U, ldetU in zip(self.dms, self.dUs, self.dldetUs)]
        self.dlprobs = (MyLogSumExp(tt.stack(self.dlprobs_comps, axis=1) + tt.log(self.da), axis=1) \
                        - (0.5 * self.n_outputs * np.log(2 * np.pi))).squeeze()

        # parameters of network
        self.aps = ll.get_all_params(last_mog)  # all parameters
        self.mps = ll.get_all_params(last_mog, mp=True)  # means
        self.sps = ll.get_all_params(last_mog, sp=True)  # log stds

        # weight and bias parameter sets as separate lists
        self.mps_wp = ll.get_all_params(last_mog, mp=True, wp=True)
        self.sps_wp = ll.get_all_params(last_mog, sp=True, wp=True)
        self.mps_bp = ll.get_all_params(last_mog, mp=True, bp=True)
        self.sps_bp = ll.get_all_params(last_mog, sp=True, bp=True)
    def build_model(self, train_set, test_set, validation_set=None):
        super(CNN, self).build_model(train_set, test_set, validation_set)

        epsilon = 1e-8
        y_train = T.clip(get_output(self.model, self.sym_x), epsilon, 1)
        loss_cc = aggregate(categorical_crossentropy(y_train, self.sym_t),
                            mode='mean')
        loss_train_acc = categorical_accuracy(y_train, self.sym_t).mean()

        y = T.clip(get_output(self.model, self.sym_x, deterministic=True),
                   epsilon, 1)
        loss_eval = aggregate(categorical_crossentropy(y, self.sym_t),
                              mode='mean')
        loss_acc = categorical_accuracy(y, self.sym_t).mean()

        all_params = get_all_params(self.model, trainable=True)
        sym_beta1 = T.scalar('beta1')
        sym_beta2 = T.scalar('beta2')
        grads = T.grad(loss_cc, all_params)
        grads = [T.clip(g, -5, 5) for g in grads]
        updates = rmsprop(grads, all_params, self.sym_lr, sym_beta1, sym_beta2)

        inputs = [
            self.sym_index, self.sym_batchsize, self.sym_lr, sym_beta1,
            sym_beta2
        ]
        f_train = theano.function(
            inputs,
            [loss_cc, loss_train_acc],
            updates=updates,
            givens={
                self.sym_x: self.sh_train_x[self.batch_slice],
                self.sym_t: self.sh_train_t[self.batch_slice],
            },
        )

        f_test = theano.function(
            [self.sym_index, self.sym_batchsize],
            [loss_eval, loss_acc],
            givens={
                self.sym_x: self.sh_test_x[self.batch_slice],
                self.sym_t: self.sh_test_t[self.batch_slice],
            },
        )

        f_validate = None
        if validation_set is not None:
            f_validate = theano.function(
                [self.sym_index, self.sym_batchsize],
                [loss_eval, loss_acc],
                givens={
                    self.sym_x: self.sh_valid_x[self.batch_slice],
                    self.sym_t: self.sh_valid_t[self.batch_slice],
                },
            )

        self.train_args['inputs']['batchsize'] = 128
        self.train_args['inputs']['learningrate'] = 1e-3
        self.train_args['inputs']['beta1'] = 0.9
        self.train_args['inputs']['beta2'] = 0.999
        self.train_args['outputs']['loss_cc'] = '%0.6f'
        self.train_args['outputs']['loss_train_acc'] = '%0.6f'

        self.test_args['inputs']['batchsize'] = 128
        self.test_args['outputs']['loss_eval'] = '%0.6f'
        self.test_args['outputs']['loss_acc'] = '%0.6f'

        self.validate_args['inputs']['batchsize'] = 128
        # self.validate_args['outputs']['loss_eval'] = '%0.6f'
        # self.validate_args['outputs']['loss_acc'] = '%0.6f'
        return f_train, f_test, f_validate, self.train_args, self.test_args, self.validate_args
Example #33
    def __init__(
            self,
            disc_window,
            disc_joints_dim,
            iteration,
            a_max=0.7,
            a_min=0.0,
            batch_size = 64,
            iter_per_train = 10,
            decent_portion=0.8,
            hidden_sizes=(32, 32),
            hidden_nonlinearity=NL.tanh,
            output_nonlinearity=NL.tanh,
            disc_network=None,
    ):  
        self.batch_size=64
        self.iter_per_train=10
        self.disc_window = disc_window
        self.disc_joints_dim = disc_joints_dim
        self.disc_dim = self.disc_window*self.disc_joints_dim
        self.end_iter = int(iteration*decent_portion)
        self.iter_count = 0
        out_dim = 1
        target_var = TT.ivector('targets')

        # create network
        if disc_network is None:
            disc_network = MLP(
                input_shape=(self.disc_dim,),
                output_dim=out_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=output_nonlinearity,
            )

        self._disc_network = disc_network

        disc_reward = disc_network.output_layer
        obs_var = disc_network.input_layer.input_var

        disc_var, = L.get_output([disc_reward])

        self._disc_var = disc_var

        LasagnePowered.__init__(self, [disc_reward])
        self._f_disc = ext.compile_function(
            inputs=[obs_var],
            outputs=[disc_var],
            log_name="f_discriminate_forward",
        )
        
        params = L.get_all_params(disc_network, trainable=True)
        loss = lasagne.objectives.categorical_crossentropy(disc_var, target_var).mean()
        updates = lasagne.updates.adam(loss, params, learning_rate=0.01)
        self._f_disc_train = ext.compile_function(
            inputs=[obs_var, target_var],
            outputs=[loss],
            updates=updates,
            log_name="f_discriminate_train"
        )

        self.data = self.load_data()
        self.a = np.linspace(a_min, a_max, self.end_iter)
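# A hedged sketch (not from the original class) of driving the two compiled
# discriminator functions above; `expert_obs` and `policy_obs` are assumed to
# be float32 arrays of shape (n, disc_window * disc_joints_dim).
def train_discriminator(disc, expert_obs, policy_obs):
    obs = np.concatenate([expert_obs, policy_obs], axis=0).astype('float32')
    targets = np.concatenate([np.ones(len(expert_obs)),
                              np.zeros(len(policy_obs))]).astype('int32')
    for _ in range(disc.iter_per_train):
        idx = np.random.randint(0, len(obs), size=disc.batch_size)
        loss, = disc._f_disc_train(obs[idx], targets[idx])
    rewards, = disc._f_disc(obs)  # forward pass with the trained discriminator
    return loss, rewards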
    def __init__(self, K, vocab_size, num_chars, W_init, regularizer, rlambda,
                 nhidden, embed_dim, dropout, train_emb, subsample, char_dim,
                 use_feat):
        self.nhidden = nhidden
        self.embed_dim = embed_dim
        self.dropout = dropout
        self.train_emb = train_emb
        self.subsample = subsample
        self.char_dim = char_dim
        self.learning_rate = LEARNING_RATE
        self.num_chars = num_chars
        self.use_feat = use_feat

        norm = lasagne.regularization.l2 if regularizer == 'l2' else lasagne.regularization.l1
        self.use_chars = self.char_dim != 0
        if W_init is None:
            W_init = lasagne.init.GlorotNormal().sample(
                (vocab_size, self.embed_dim))

        doc_var, query_var, cand_var = T.itensor3('doc'), T.itensor3('quer'), \
                T.wtensor3('cand')
        docmask_var, qmask_var, candmask_var = T.bmatrix('doc_mask'), T.bmatrix('q_mask'), \
                T.bmatrix('c_mask')
        target_var = T.ivector('ans')
        feat_var = T.imatrix('feat')
        doc_toks, qry_toks = T.imatrix('dchars'), T.imatrix('qchars')
        tok_var, tok_mask = T.imatrix('tok'), T.bmatrix('tok_mask')
        cloze_var = T.ivector('cloze')
        self.inps = [
            doc_var, doc_toks, query_var, qry_toks, cand_var, target_var,
            docmask_var, qmask_var, tok_var, tok_mask, candmask_var, feat_var,
            cloze_var
        ]

        if rlambda > 0.:
            W_pert = W_init + lasagne.init.GlorotNormal().sample(W_init.shape)
        else:
            W_pert = W_init
        self.predicted_probs, predicted_probs_val, self.doc_net, self.q_net, W_emb = (
            self.build_network(K, vocab_size, W_pert))

        self.loss_fn = T.nnet.categorical_crossentropy(self.predicted_probs, target_var).mean() + \
                rlambda*norm(W_emb-W_init)
        self.eval_fn = lasagne.objectives.categorical_accuracy(
            self.predicted_probs, target_var).mean()

        loss_fn_val = T.nnet.categorical_crossentropy(predicted_probs_val, target_var).mean() + \
                rlambda*norm(W_emb-W_init)
        eval_fn_val = lasagne.objectives.categorical_accuracy(
            predicted_probs_val, target_var).mean()

        self.params = L.get_all_params([self.doc_net] + self.q_net,
                                       trainable=True)

        updates = lasagne.updates.adam(self.loss_fn,
                                       self.params,
                                       learning_rate=self.learning_rate)

        self.train_fn = theano.function(
            self.inps, [self.loss_fn, self.eval_fn, self.predicted_probs],
            updates=updates,
            on_unused_input='warn')
        self.validate_fn = theano.function(
            self.inps, [loss_fn_val, eval_fn_val, predicted_probs_val],
            on_unused_input='warn')
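# A small checkpointing sketch (assumed usage, not in the original source):
# since self.params was collected with L.get_all_params above, its values can
# be pickled and restored in the same order.
import pickle

def save_model(model, path):
    with open(path, 'wb') as f:
        pickle.dump([p.get_value() for p in model.params], f,
                    protocol=pickle.HIGHEST_PROTOCOL)

def load_model(model, path):
    with open(path, 'rb') as f:
        values = pickle.load(f)
    for p, v in zip(model.params, values):
        p.set_value(v)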
Example #35
def build_network_from_ae(classn):
    input_var = T.tensor4('input_var')

    layer = layers.InputLayer(shape=(None, 3, PS, PS), input_var=input_var)
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           100,
                           filter_size=(5, 5),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           120,
                           filter_size=(5, 5),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer,
                               pool_size=(2, 2),
                               stride=2,
                               mode='average_inc_pad')
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           240,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           320,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    layer = layers.Pool2DLayer(layer,
                               pool_size=(2, 2),
                               stride=2,
                               mode='average_inc_pad')
    layer = batch_norm(
        layers.Conv2DLayer(layer,
                           640,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    prely = batch_norm(
        layers.Conv2DLayer(layer,
                           1024,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))

    featm = batch_norm(
        layers.Conv2DLayer(prely,
                           640,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    feat_map = batch_norm(
        layers.Conv2DLayer(featm,
                           100,
                           filter_size=(1, 1),
                           nonlinearity=rectify,
                           name="feat_map"))
    maskm = batch_norm(
        layers.Conv2DLayer(prely,
                           100,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    mask_rep = batch_norm(layers.Conv2DLayer(maskm,
                                             1,
                                             filter_size=(1, 1),
                                             nonlinearity=None),
                          beta=None,
                          gamma=None)
    mask_map = SoftThresPerc(mask_rep,
                             perc=0.0,
                             alpha=0.1,
                             beta=init.Constant(0.5),
                             tight=100.0,
                             bias=-10,
                             name="mask_map")
    enlyr = ChInnerProdMerge(feat_map, mask_map, name="encoder")

    layer = batch_norm(
        layers.Deconv2DLayer(enlyr,
                             1024,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             640,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             640,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             320,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             320,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             240,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             120,
                             filter_size=(5, 5),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = batch_norm(
        layers.Deconv2DLayer(layer,
                             100,
                             filter_size=(5, 5),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    layer = layers.Deconv2DLayer(layer,
                                 3,
                                 filter_size=(1, 1),
                                 stride=1,
                                 crop='same',
                                 nonlinearity=identity)

    glblf = batch_norm(
        layers.Conv2DLayer(prely,
                           128,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    glblf = layers.Pool2DLayer(glblf,
                               pool_size=(5, 5),
                               stride=5,
                               mode='average_inc_pad')
    glblf = batch_norm(
        layers.Conv2DLayer(glblf,
                           64,
                           filter_size=(3, 3),
                           stride=1,
                           pad='same',
                           nonlinearity=leaky_rectify))
    gllyr = batch_norm(layers.Conv2DLayer(glblf,
                                          5,
                                          filter_size=(1, 1),
                                          nonlinearity=rectify),
                       name="global_feature")

    glblf = batch_norm(
        layers.Deconv2DLayer(gllyr,
                             256,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(9, 9),
                             stride=5,
                             crop=(2, 2),
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             128,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             64,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             64,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             64,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             32,
                             filter_size=(4, 4),
                             stride=2,
                             crop=(1, 1),
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             32,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = batch_norm(
        layers.Deconv2DLayer(glblf,
                             32,
                             filter_size=(3, 3),
                             stride=1,
                             crop='same',
                             nonlinearity=leaky_rectify))
    glblf = layers.Deconv2DLayer(glblf,
                                 3,
                                 filter_size=(1, 1),
                                 stride=1,
                                 crop='same',
                                 nonlinearity=identity)

    layer = layers.ElemwiseSumLayer([layer, glblf])

    network = ReshapeLayer(layer, ([0], -1))
    layers.set_all_param_values(network,
                                pickle.load(open(filename_model_ae, 'rb')))
    mask_map.beta.set_value(np.float32(-10.0 * mask_map.beta.get_value()))
    old_params = layers.get_all_params(network, trainable=True)

    # Adding more layers
    aug_var = T.matrix('aug_var')
    target_var = T.imatrix('targets')
    add_a = batch_norm(
        layers.Conv2DLayer(enlyr,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_b = batch_norm(
        layers.Conv2DLayer(add_a,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_c = batch_norm(
        layers.Conv2DLayer(add_b,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_d = batch_norm(
        layers.Conv2DLayer(add_c,
                           320,
                           filter_size=(1, 1),
                           nonlinearity=leaky_rectify))
    add_0 = layers.Pool2DLayer(add_d,
                               pool_size=(15, 15),
                               stride=15,
                               mode='average_inc_pad')
    add_1 = batch_norm(
        layers.DenseLayer(add_0, 100, nonlinearity=leaky_rectify))

    add_2 = batch_norm(
        layers.DenseLayer(gllyr, 320, nonlinearity=leaky_rectify))
    add_3 = batch_norm(
        layers.DenseLayer(add_2, 320, nonlinearity=leaky_rectify))
    add_4 = batch_norm(
        layers.DenseLayer(add_3, 100, nonlinearity=leaky_rectify))

    aug_layer = layers.InputLayer(shape=(None, aug_fea_n), input_var=aug_var)

    cat_layer = lasagne.layers.ConcatLayer([add_1, add_4, aug_layer], axis=1)

    hidden_layer = layers.DenseLayer(cat_layer, 80, nonlinearity=leaky_rectify)
    network = layers.DenseLayer(hidden_layer, classn, nonlinearity=sigmoid)

    all_params = layers.get_all_params(network, trainable=True)
    new_params = [x for x in all_params if x not in old_params]

    return network, new_params, input_var, aug_var, target_var
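# A hedged fine-tuning sketch (assumed usage): because `new_params` covers only
# the freshly added layers, passing it to the update rule keeps the pretrained
# autoencoder weights frozen. `classn` and the imports are assumed to be those
# of the file above.
network, new_params, input_var, aug_var, target_var = build_network_from_ae(classn)

prediction = layers.get_output(network)
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean()
updates = lasagne.updates.nesterov_momentum(loss, new_params,
                                            learning_rate=1e-3, momentum=0.9)
train_fn = theano.function([input_var, aug_var, target_var], loss,
                           updates=updates)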
Example #36
    def get_all_params(self):
        return L.get_all_params(self.network, trainable=True)
Example #37
    return XX, XY, label, dx, dy


frame, targets = T.tensor4(), T.tensor4()
net = ll.InputLayer((None, 2, 100, 100), input_var=frame)
net = ll.Conv2DLayer(net, 32, (5, 5), b=None, pad='same')
net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
net = ll.Conv2DLayer(net, 8, (3, 3), b=None, pad='same',
                     nonlinearity=l.nonlinearities.LeakyRectify(0.1))
net = ll.Pool2DLayer(net, (2, 2), mode='average_inc_pad')
net = ll.DenseLayer(net, 625, b=None, nonlinearity=None)
net = ll.ReshapeLayer(net, ([0], 1, 25, 25))
predict = ll.get_output(net)
targets_pool = pool_2d(targets, ds=(4, 4), mode='average_inc_pad')


loss = T.mean((predict - targets_pool) ** 2)
params = ll.get_all_params(net, trainable=True)
updates = l.updates.adam(loss, params, 0.01)

train_f = theano.function([frame,targets],[loss,predict],updates=updates)
data = premnist()
errlist = []
for i in range(6000):
    x, y, move, label = mnist_data(data, (32, 1, 100, 100), noise=None,
                                   heatmap=True, down=1)
    xx, xy = fftprocess(x, y)
    err, result = train_f(np.concatenate((xx, xy), axis=1), label)
    errlist.append(err)
    if (i + 1) % 10 == 0:
        print i + 1, err
np.savez('toymodel.npz', *ll.get_all_param_values(net))
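# Restoring the snapshot written by np.savez above follows the usual Lasagne
# convention (a sketch, assuming the same `net` graph has been rebuilt first):
with np.load('toymodel.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
ll.set_all_param_values(net, param_values)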
Example #38
    target_data = T.imatrix('target_data')
    target_mask = T.fmatrix('target_mask')
    network_output, cond_layer_list = deep_projection_cond_ln_model(
        input_var=input_data,
        mask_var=input_mask,
        num_inputs=input_dim,
        num_outputs=output_dim,
        num_layers=args.num_layers,
        num_conds=args.num_conds,
        num_factors=args.num_factors,
        num_units=args.num_units,
        grad_clipping=args.grad_clipping,
        dropout=args.dropout)

    network = network_output
    network_params = get_all_params(network, trainable=True)
    param_count = count_params(network, trainable=True)
    print('Number of parameters of the network: {:.2f}M'.format(
        float(param_count) / 1000000))

    ######################
    # reload model param #
    ######################
    if args.reload_model:
        print('Loading model: {}'.format(args.reload_model))
        with open(args.reload_model, 'rb') as f:
            [
                pretrain_network_params_val, pretrain_update_params_val,
                pretrain_total_epoch_cnt
            ] = pickle.load(f)
        set_model_param_value(network_params, pretrain_network_params_val)
    def __init__(self,
                 n_in,
                 n_filters,
                 filter_sizes,
                 n_out,
                 pool_sizes=None,
                 n_hidden=(512,),
                 ccf=False,
                 trans_func=rectify,
                 out_func=softmax,
                 dense_dropout=0.0,
                 stats=2,
                 input_noise=0.0,
                 batch_norm=False,
                 conv_dropout=0.0):
        super(CNN, self).__init__(n_in, n_hidden, n_out, trans_func)
        self.outf = out_func
        self.log = ""

        # Define model using lasagne framework
        dropout = dense_dropout != 0.0

        # Overwrite input layer
        sequence_length, n_features = n_in
        self.l_in = InputLayer(shape=(None, sequence_length, n_features))
        l_prev = self.l_in

        # Separate into raw values and statistics
        sequence_length -= stats
        stats_layer = SliceLayer(l_prev,
                                 indices=slice(sequence_length, None),
                                 axis=1)
        stats_layer = ReshapeLayer(stats_layer, (-1, stats * n_features))
        print('Stats layer shape', stats_layer.output_shape)
        l_prev = SliceLayer(l_prev, indices=slice(0, sequence_length), axis=1)
        print('Conv input layer shape', l_prev.output_shape)

        # Apply input noise
        l_prev = GaussianNoiseLayer(l_prev, sigma=input_noise)

        if ccf:
            self.log += "\nAdding cross-channel feature layer"
            l_prev = ReshapeLayer(l_prev, (-1, 1, sequence_length, n_features))
            l_prev = Conv2DLayer(l_prev,
                                 num_filters=4 * n_features,
                                 filter_size=(1, n_features),
                                 nonlinearity=None)
            n_features *= 4
            if batch_norm:
                l_prev = batch_norm_layer(l_prev)
            l_prev = ReshapeLayer(l_prev, (-1, n_features, sequence_length))
            l_prev = DimshuffleLayer(l_prev, (0, 2, 1))

        # 2D Convolutional layers
        l_prev = ReshapeLayer(l_prev, (-1, 1, sequence_length, n_features))
        l_prev = DimshuffleLayer(l_prev, (0, 3, 2, 1))

        # Add the convolutional filters
        for n_filter, filter_size, pool_size in zip(n_filters, filter_sizes,
                                                    pool_sizes):
            self.log += "\nAdding 2D conv layer: %d x %d" % (n_filter,
                                                             filter_size)
            l_prev = Conv2DLayer(l_prev,
                                 num_filters=n_filter,
                                 filter_size=(filter_size, 1),
                                 nonlinearity=self.transf,
                                 pad=filter_size // 2)
            if batch_norm:
                l_prev = batch_norm_layer(l_prev)
            if pool_size > 1:
                self.log += "\nAdding max pooling layer: %d" % pool_size
                l_prev = Pool2DLayer(l_prev, pool_size=(pool_size, 1))
            self.log += "\nAdding dropout layer: %.2f" % conv_dropout
            l_prev = TiedDropoutLayer(l_prev, p=conv_dropout)
            print("Conv out shape", get_output_shape(l_prev))

        # Global pooling layer
        l_prev = GlobalPoolLayer(l_prev,
                                 pool_function=T.mean,
                                 name='Global Mean Pool')
        print("GlobalPoolLayer out shape", get_output_shape(l_prev))

        # Concatenate stats
        l_prev = ConcatLayer((l_prev, stats_layer), axis=1)

        for n_hid in n_hidden:
            self.log += "\nAdding dense layer with %d units" % n_hid
            print("Dense input shape", get_output_shape(l_prev))
            l_prev = DenseLayer(l_prev, n_hid, init.GlorotNormal(),
                                init.Normal(1e-3), self.transf)
            if batch_norm:
                l_prev = batch_norm_layer(l_prev)
            if dropout:
                self.log += "\nAdding dense dropout with probability: %.2f" % dense_dropout
                l_prev = DropoutLayer(l_prev, p=dense_dropout)

        if batch_norm:
            self.log += "\nUsing batch normalization"

        self.model = DenseLayer(l_prev, num_units=n_out, nonlinearity=out_func)
        self.model_params = get_all_params(self.model)

        self.sym_x = T.tensor3('x')
        self.sym_t = T.matrix('t')
def multi_task_classifier(args,
                          input_var,
                          target_var,
                          wordEmbeddings,
                          seqlen,
                          num_feats,
                          lambda_val=0.5 * 1e-4):

    print("Building multi task model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1
    filter_size = wordDim
    pool_size = num_filters

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape

    #span
    emb1 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape1 = ReshapeLayer(emb1, (batchsize, seqlen, num_feats * wordDim))
    conv1d_1 = DimshuffleLayer(
        Conv1DLayer(reshape1,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_1 = MaxPool1DLayer(conv1d_1, pool_size=pool_size)
    hid_1 = DenseLayer(maxpool_1,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_1 = DenseLayer(hid_1, num_units=2, nonlinearity=softmax)
    """
    #DocTimeRel
    emb2 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape2 = ReshapeLayer(emb2, (batchsize, seqlen, num_feats*wordDim))
    conv1d_2 = DimshuffleLayer(Conv1DLayer(reshape2, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_2 = MaxPool1DLayer(conv1d_2, pool_size=pool_size)  
    hid_2 = DenseLayer(maxpool_2, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_2 = DenseLayer(hid_2, num_units=5, nonlinearity=softmax)
    """

    #Type
    emb3 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape3 = ReshapeLayer(emb3, (batchsize, seqlen, num_feats * wordDim))
    conv1d_3 = DimshuffleLayer(
        Conv1DLayer(reshape3,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_3 = MaxPool1DLayer(conv1d_3, pool_size=pool_size)
    hid_3 = DenseLayer(maxpool_3,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_3 = DenseLayer(hid_3, num_units=4, nonlinearity=softmax)

    #Degree
    emb4 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape4 = ReshapeLayer(emb4, (batchsize, seqlen, num_feats * wordDim))
    conv1d_4 = DimshuffleLayer(
        Conv1DLayer(reshape4,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_4 = MaxPool1DLayer(conv1d_4, pool_size=pool_size)
    hid_4 = DenseLayer(maxpool_4,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_4 = DenseLayer(hid_4, num_units=4, nonlinearity=softmax)

    #Polarity
    emb5 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape5 = ReshapeLayer(emb5, (batchsize, seqlen, num_feats * wordDim))
    conv1d_5 = DimshuffleLayer(
        Conv1DLayer(reshape5,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_5 = MaxPool1DLayer(conv1d_5, pool_size=pool_size)
    hid_5 = DenseLayer(maxpool_5,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_5 = DenseLayer(hid_5, num_units=3, nonlinearity=softmax)

    #ContextualModality
    emb6 = EmbeddingLayer(input,
                          input_size=vocab_size,
                          output_size=wordDim,
                          W=wordEmbeddings.T)
    reshape6 = ReshapeLayer(emb6, (batchsize, seqlen, num_feats * wordDim))
    conv1d_6 = DimshuffleLayer(
        Conv1DLayer(reshape6,
                    num_filters=num_filters,
                    filter_size=wordDim,
                    stride=1,
                    nonlinearity=tanh,
                    W=GlorotUniform()), (0, 2, 1))
    maxpool_6 = MaxPool1DLayer(conv1d_6, pool_size=pool_size)
    hid_6 = DenseLayer(maxpool_6,
                       num_units=args.hiddenDim,
                       nonlinearity=sigmoid)
    network_6 = DenseLayer(hid_6, num_units=5, nonlinearity=softmax)
    """
    #ContextualAspect
    emb7 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape7 = ReshapeLayer(emb7, (batchsize, seqlen, num_feats*wordDim))
    conv1d_7 = DimshuffleLayer(Conv1DLayer(reshape7, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_7 = MaxPool1DLayer(conv1d_7, pool_size=pool_size)  
    hid_7 = DenseLayer(maxpool_7, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_7 = DenseLayer(hid_7, num_units=4, nonlinearity=softmax)
    """
    """
    #Permanence
    emb8 = EmbeddingLayer(input, input_size=vocab_size, output_size=wordDim, W=wordEmbeddings.T)
    reshape8 = ReshapeLayer(emb8, (batchsize, seqlen, num_feats*wordDim))
    conv1d_8 = DimshuffleLayer(Conv1DLayer(reshape8, num_filters=num_filters, filter_size=wordDim, stride=1, 
        nonlinearity=tanh,W=GlorotUniform()), (0,2,1))
    maxpool_8 = MaxPool1DLayer(conv1d_8, pool_size=pool_size)  
    hid_8 = DenseLayer(maxpool_8, num_units=args.hiddenDim, nonlinearity=sigmoid)
    network_8 = DenseLayer(hid_8, num_units=4, nonlinearity=softmax)
    """

    # Is this important?
    """
    network_1_out, network_2_out, network_3_out, network_4_out, \
    network_5_out, network_6_out, network_7_out, network_8_out = \
    get_output([network_1, network_2, network_3, network_4, network_5, network_6, network_7, network_8])
    """
    network_1_out = get_output(network_1)
    network_3_out = get_output(network_3)
    network_4_out = get_output(network_4)
    network_5_out = get_output(network_5)
    network_6_out = get_output(network_6)

    loss_1 = T.mean(binary_crossentropy(
        network_1_out, target_var)) + regularize_layer_params_weighted(
            {
                emb1: lambda_val,
                conv1d_1: lambda_val,
                hid_1: lambda_val,
                network_1: lambda_val
            }, l2)
    updates_1 = adagrad(loss_1,
                        get_all_params(network_1, trainable=True),
                        learning_rate=args.step)
    train_fn_1 = theano.function([input_var, target_var],
                                 loss_1,
                                 updates=updates_1,
                                 allow_input_downcast=True)
    val_acc_1 = T.mean(
        binary_accuracy(get_output(network_1, deterministic=True), target_var))
    val_fn_1 = theano.function([input_var, target_var],
                               val_acc_1,
                               allow_input_downcast=True)
    """
    loss_2 = T.mean(categorical_crossentropy(network_2_out,target_var)) + regularize_layer_params_weighted({emb2:lambda_val, conv1d_2:lambda_val, 
                hid_2:lambda_val, network_2:lambda_val} , l2)
    updates_2 = adagrad(loss_2, get_all_params(network_2, trainable=True), learning_rate=args.step)
    train_fn_2 = theano.function([input_var, target_var], loss_2, updates=updates_2, allow_input_downcast=True)
    val_acc_2 =  T.mean(categorical_accuracy(get_output(network_2, deterministic=True), target_var))
    val_fn_2 = theano.function([input_var, target_var], val_acc_2, allow_input_downcast=True)
    """

    loss_3 = T.mean(categorical_crossentropy(
        network_3_out, target_var)) + regularize_layer_params_weighted(
            {
                emb3: lambda_val,
                conv1d_3: lambda_val,
                hid_3: lambda_val,
                network_3: lambda_val
            }, l2)
    updates_3 = adagrad(loss_3,
                        get_all_params(network_3, trainable=True),
                        learning_rate=args.step)
    train_fn_3 = theano.function([input_var, target_var],
                                 loss_3,
                                 updates=updates_3,
                                 allow_input_downcast=True)
    val_acc_3 = T.mean(
        categorical_accuracy(get_output(network_3, deterministic=True),
                             target_var))
    val_fn_3 = theano.function([input_var, target_var],
                               val_acc_3,
                               allow_input_downcast=True)

    loss_4 = T.mean(categorical_crossentropy(
        network_4_out, target_var)) + regularize_layer_params_weighted(
            {
                emb4: lambda_val,
                conv1d_4: lambda_val,
                hid_4: lambda_val,
                network_4: lambda_val
            }, l2)
    updates_4 = adagrad(loss_4,
                        get_all_params(network_4, trainable=True),
                        learning_rate=args.step)
    train_fn_4 = theano.function([input_var, target_var],
                                 loss_4,
                                 updates=updates_4,
                                 allow_input_downcast=True)
    val_acc_4 = T.mean(
        categorical_accuracy(get_output(network_4, deterministic=True),
                             target_var))
    val_fn_4 = theano.function([input_var, target_var],
                               val_acc_4,
                               allow_input_downcast=True)

    loss_5 = T.mean(categorical_crossentropy(
        network_5_out, target_var)) + regularize_layer_params_weighted(
            {
                emb5: lambda_val,
                conv1d_5: lambda_val,
                hid_5: lambda_val,
                network_5: lambda_val
            }, l2)
    updates_5 = adagrad(loss_5,
                        get_all_params(network_5, trainable=True),
                        learning_rate=args.step)
    train_fn_5 = theano.function([input_var, target_var],
                                 loss_5,
                                 updates=updates_5,
                                 allow_input_downcast=True)
    val_acc_5 = T.mean(
        categorical_accuracy(get_output(network_5, deterministic=True),
                             target_var))
    val_fn_5 = theano.function([input_var, target_var],
                               val_acc_5,
                               allow_input_downcast=True)

    loss_6 = T.mean(categorical_crossentropy(
        network_6_out, target_var)) + regularize_layer_params_weighted(
            {
                emb6: lambda_val,
                conv1d_6: lambda_val,
                hid_6: lambda_val,
                network_6: lambda_val
            }, l2)
    updates_6 = adagrad(loss_6,
                        get_all_params(network_6, trainable=True),
                        learning_rate=args.step)
    train_fn_6 = theano.function([input_var, target_var],
                                 loss_6,
                                 updates=updates_6,
                                 allow_input_downcast=True)
    val_acc_6 = T.mean(
        categorical_accuracy(get_output(network_6, deterministic=True),
                             target_var))
    val_fn_6 = theano.function([input_var, target_var],
                               val_acc_6,
                               allow_input_downcast=True)
    """
    loss_7 = T.mean(categorical_crossentropy(network_7_out,target_var)) + regularize_layer_params_weighted({emb7:lambda_val, conv1d_7:lambda_val, 
                hid_7:lambda_val, network_7:lambda_val} , l2)
    updates_7 = adagrad(loss_7, get_all_params(network_7, trainable=True), learning_rate=args.step)
    train_fn_7 = theano.function([input_var, target_var], loss_7, updates=updates_7, allow_input_downcast=True)
    val_acc_7 =  T.mean(categorical_accuracy(get_output(network_7, deterministic=True), target_var))
    val_fn_7 = theano.function([input_var, target_var], val_acc_7, allow_input_downcast=True)

    loss_8 = T.mean(categorical_crossentropy(network_8_out,target_var)) + regularize_layer_params_weighted({emb8:lambda_val, conv1d_8:lambda_val, 
                hid_8:lambda_val, network_8:lambda_val} , l2)
    updates_8 = adagrad(loss_8, get_all_params(network_8, trainable=True), learning_rate=args.step)
    train_fn_8 = theano.function([input_var, target_var], loss_8, updates=updates_8, allow_input_downcast=True)
    val_acc_8 =  T.mean(categorical_accuracy(get_output(network_8, deterministic=True), target_var))
    val_fn_8 = theano.function([input_var, target_var], val_acc_8, allow_input_downcast=True)
    """
    """
    return train_fn_1, val_fn_1, network_1, train_fn_2, val_fn_2, network_2, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6, train_fn_7, val_fn_7, network_7, train_fn_8, val_fn_8, network_8
    """
    return train_fn_1, val_fn_1, network_1, train_fn_3, val_fn_3, \
            network_3, train_fn_4, val_fn_4, network_4, train_fn_5, val_fn_5, network_5, \
            train_fn_6, val_fn_6, network_6
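# A hedged usage sketch (not from the original file): each task gets its own
# (train_fn, val_fn) pair, so one epoch can round-robin over the tasks.
# X_train and the per-task label arrays (y_span, y_type, ...) are assumed to
# exist with shapes matching input_var / target_var.
(train_fn_1, val_fn_1, network_1,
 train_fn_3, val_fn_3, network_3,
 train_fn_4, val_fn_4, network_4,
 train_fn_5, val_fn_5, network_5,
 train_fn_6, val_fn_6, network_6) = multi_task_classifier(
     args, input_var, target_var, wordEmbeddings, seqlen, num_feats)

tasks = [(train_fn_1, y_span), (train_fn_3, y_type), (train_fn_4, y_degree),
         (train_fn_5, y_polarity), (train_fn_6, y_modality)]
for train_fn, y_task in tasks:
    loss = train_fn(X_train, y_task)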
Example #41
    def get_params(self, values=True):
        if values:
            return get_all_param_values(self.net['conv5_1'])
        return get_all_params(self.net['conv5_1'], trainable=True)
Example #42
                      axis=1)
    z = nn.log_sum_exp(z, axis=1)
    return n_plus, n_minus, z


if l_type == 'L2':
    n_plus = T.sum((a_lab - b_lab)**2, axis=1)
    n_minus = T.sum((a_lab - c_lab)**2, axis=1)
    dist = n_plus - n_minus + 10.0
    loss_lab = T.mean(dist * T.gt(dist, 0.0))
else:
    n_plus_lab, n_minus_lab, z_lab = loss_labeled(a_lab, b_lab, c_lab)
    loss_lab = -T.mean(n_minus_lab) + T.mean(z_lab)

lr = T.scalar()
disc_params = LL.get_all_params(layers, trainable=True)
disc_param_updates = nn.adam_updates(disc_params, loss_lab, lr=lr, mom1=0.5)
disc_param_avg = [
    th.shared(np.cast[th.config.floatX](0. * p.get_value()))
    for p in disc_params
]
disc_avg_updates = [(a, a + 0.0001 * (p - a))
                    for p, a in zip(disc_params, disc_param_avg)]
disc_avg_givens = [(p, a) for p, a in zip(disc_params, disc_param_avg)]

train_batch_disc = th.function(inputs=[x_lab, lr],
                               outputs=loss_lab,
                               updates=disc_param_updates + disc_avg_updates)

nr_batches_train = int(trainx.shape[0] / batch_size)
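# A hedged sketch (assumed usage): the Polyak-averaged parameter copies built
# above are normally consumed through `givens`, so evaluation runs with the
# averaged weights instead of the raw ones.
eval_avg = th.function(inputs=[x_lab], outputs=loss_lab,
                       givens=disc_avg_givens)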
Example #43
def main():
    setup_train_experiment(logger, FLAGS, "%(model)s_at")

    logger.info("Loading data...")
    data = mnist_load(FLAGS.train_size, FLAGS.seed)
    X_train, y_train = data.X_train, data.y_train
    X_val, y_val = data.X_val, data.y_val
    X_test, y_test = data.X_test, data.y_test

    img_shape = [None, 1, 28, 28]
    train_images = T.tensor4('train_images')
    train_labels = T.lvector('train_labels')
    val_images = T.tensor4('valid_images')
    val_labels = T.lvector('valid_labels')

    layer_dims = [int(dim) for dim in FLAGS.layer_dims.split("-")]
    num_classes = layer_dims[-1]
    net = create_network(FLAGS.model, img_shape, layer_dims=layer_dims)
    model = with_end_points(net)

    train_outputs = model(train_images)
    val_outputs = model(val_images, deterministic=True)

    # losses
    train_ce = categorical_crossentropy(train_outputs['prob'],
                                        train_labels).mean()
    train_at = adversarial_training(lambda x: model(x)['prob'],
                                    train_images,
                                    train_labels,
                                    epsilon=FLAGS.epsilon).mean()
    train_loss = train_ce + FLAGS.lmbd * train_at
    val_ce = categorical_crossentropy(val_outputs['prob'], val_labels).mean()
    val_deepfool_images = deepfool(
        lambda x: model(x, deterministic=True)['logits'],
        val_images,
        val_labels,
        num_classes,
        max_iter=FLAGS.deepfool_iter,
        clip_dist=FLAGS.deepfool_clip,
        over_shoot=FLAGS.deepfool_overshoot)

    # metrics
    train_acc = categorical_accuracy(train_outputs['logits'],
                                     train_labels).mean()
    train_err = 1.0 - train_acc
    val_acc = categorical_accuracy(val_outputs['logits'], val_labels).mean()
    val_err = 1.0 - val_acc
    # deepfool robustness
    reduc_ind = range(1, train_images.ndim)
    l2_deepfool = (val_deepfool_images - val_images).norm(2, axis=reduc_ind)
    l2_deepfool_norm = l2_deepfool / val_images.norm(2, axis=reduc_ind)

    train_metrics = OrderedDict([('loss', train_loss), ('nll', train_ce),
                                 ('at', train_at), ('err', train_err)])
    val_metrics = OrderedDict([('nll', val_ce), ('err', val_err)])
    summary_metrics = OrderedDict([('l2', l2_deepfool.mean()),
                                   ('l2_norm', l2_deepfool_norm.mean())])

    lr = theano.shared(floatX(FLAGS.initial_learning_rate), 'learning_rate')
    train_params = get_all_params(net, trainable=True)
    train_updates = adam(train_loss, train_params, lr)

    logger.info("Compiling theano functions...")
    train_fn = theano.function([train_images, train_labels],
                               outputs=train_metrics.values(),
                               updates=train_updates)
    val_fn = theano.function([val_images, val_labels],
                             outputs=val_metrics.values())
    summary_fn = theano.function([val_images, val_labels],
                                 outputs=summary_metrics.values() +
                                 [val_deepfool_images])

    logger.info("Starting training...")
    try:
        samples_per_class = FLAGS.summary_samples_per_class
        summary_images, summary_labels = select_balanced_subset(
            X_val, y_val, num_classes, samples_per_class)
        save_path = os.path.join(FLAGS.samples_dir, 'orig.png')
        save_images(summary_images, save_path)

        epoch = 0
        batch_index = 0
        while epoch < FLAGS.num_epochs:
            epoch += 1

            start_time = time.time()
            train_iterator = batch_iterator(X_train,
                                            y_train,
                                            FLAGS.batch_size,
                                            shuffle=True)
            epoch_outputs = np.zeros(len(train_fn.outputs))
            for batch_index, (images,
                              labels) in enumerate(train_iterator,
                                                   batch_index + 1):
                batch_outputs = train_fn(images, labels)
                epoch_outputs += batch_outputs
            epoch_outputs /= X_train.shape[0] // FLAGS.batch_size
            logger.info(
                build_result_str(
                    "Train epoch [{}, {:.2f}s]:".format(
                        epoch,
                        time.time() - start_time), train_metrics.keys(),
                    epoch_outputs))

            # update learning rate
            if epoch > FLAGS.start_learning_rate_decay:
                new_lr_value = lr.get_value(
                ) * FLAGS.learning_rate_decay_factor
                lr.set_value(floatX(new_lr_value))
                logger.debug("learning rate was changed to {:.10f}".format(
                    new_lr_value))

            # validation
            start_time = time.time()
            val_iterator = batch_iterator(X_val,
                                          y_val,
                                          FLAGS.test_batch_size,
                                          shuffle=False)
            val_epoch_outputs = np.zeros(len(val_fn.outputs))
            for images, labels in val_iterator:
                val_epoch_outputs += val_fn(images, labels)
            val_epoch_outputs /= X_val.shape[0] // FLAGS.test_batch_size
            logger.info(
                build_result_str(
                    "Test epoch [{}, {:.2f}s]:".format(
                        epoch,
                        time.time() - start_time), val_metrics.keys(),
                    val_epoch_outputs))

            if epoch % FLAGS.summary_frequency == 0:
                summary = summary_fn(summary_images, summary_labels)
                logger.info(
                    build_result_str(
                        "Epoch [{}] adversarial statistics:".format(epoch),
                        summary_metrics.keys(), summary[:-1]))
                save_path = os.path.join(FLAGS.samples_dir,
                                         'epoch-%d.png' % epoch)
                df_images = summary[-1]
                save_images(df_images, save_path)

            if epoch % FLAGS.checkpoint_frequency == 0:
                save_network(net, epoch=epoch)
    except KeyboardInterrupt:
        logger.debug("Keyboard interrupt. Stopping training...")
    finally:
        save_network(net)

    # evaluate final model on test set
    test_iterator = batch_iterator(X_test,
                                   y_test,
                                   FLAGS.test_batch_size,
                                   shuffle=False)
    test_results = np.zeros(len(val_fn.outputs))
    for images, labels in test_iterator:
        test_results += val_fn(images, labels)
    test_results /= X_test.shape[0] // FLAGS.test_batch_size
    logger.info(
        build_result_str("Final test results:", val_metrics.keys(),
                         test_results))
Example #44
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 l2, mode, rnn_num_units, batch_norm, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.l2 = l2
        self.mode = mode
        self.num_units = rnn_num_units
        self.batch_norm = batch_norm

        self.input_var = T.tensor3('input_var')
        self.answer_var = T.ivector('answer_var')

        # scale inputs to be in [-1, 1]
        input_var_norm = 2 * self.input_var - 1

        print "==> building network"
        example = np.random.uniform(size=(self.batch_size, 858, 256),
                                    low=0.0,
                                    high=1.0).astype(np.float32)  #########
        answer = np.random.randint(low=0, high=176,
                                   size=(self.batch_size, ))  #########

        # InputLayer
        network = layers.InputLayer(shape=(None, 858, 256),
                                    input_var=input_var_norm)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # GRULayer
        network = layers.GRULayer(incoming=network, num_units=self.num_units)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # BatchNormalization Layer
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
            print layers.get_output(network).eval({
                self.input_var: example
            }).shape

        # GRULayer
        network = layers.GRULayer(incoming=network,
                                  num_units=self.num_units,
                                  only_return_final=True)
        print layers.get_output(network).eval({self.input_var: example}).shape

        # Last layer: classification
        network = layers.DenseLayer(incoming=network,
                                    num_units=176,
                                    nonlinearity=softmax)
        print layers.get_output(network).eval({self.input_var: example}).shape

        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)

        self.loss_ce = lasagne.objectives.categorical_crossentropy(
            self.prediction, self.answer_var).mean()
        if (self.l2 > 0):
            self.loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(
                network, lasagne.regularization.l2)
        else:
            self.loss_l2 = 0
        self.loss = self.loss_ce + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
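# A hedged usage sketch (not from the original class): one optimisation step on
# random data shaped like `example` / `answer` above. `model` is assumed to be
# an instance built with mode='train'; note the int32 cast needed by answer_var.
batch_x = np.random.uniform(size=(batch_size, 858, 256)).astype(np.float32)
batch_y = np.random.randint(low=0, high=176, size=(batch_size,)).astype(np.int32)
prediction, loss = model.train_fn(batch_x, batch_y)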
Example #45
    def __init__(self,
                 retina_model,
                 seeder_model,
                 n_seeds,
                 n_steps,
                 n_units=100,
                 normalization_coefs=None,
                 loss_coefs=None,
                 alpha=1.0,
                 threshold=1.0):
        self.seeder_model = seeder_model
        self.n_seeds = n_seeds
        self.n_steps = n_steps

        self.threshold = threshold

        self.retina = retina_model

        event_shareds = retina_model.get_event_variables()

        self.seeder = self.seeder_model(retina_model)

        if normalization_coefs is None:
            normalization_coefs = np.ones(shape=retina_model.model_nparams,
                                          dtype='float32')
        else:
            normalization_coefs = np.array(normalization_coefs,
                                           dtype='float32')

        ### params + sigma
        self.inputs = retina_model.alloc_model_params()

        self.input_layer, self.out_layer, self.reg = self.build_nn(
            retina_model.model_nparams, n_units=n_units)

        print 'Linking to Retina Model'

        iterations = [self.inputs]
        responses = []

        for i in xrange(self.n_steps):
            print 'Iteration %d' % i

            prev = iterations[i]
            r, grads = retina_model.grad_for(*event_shareds + prev)

            normed_params = [p * c for p, c in zip(prev, normalization_coefs)]

            normed_grads = [g * c for g, c in zip(grads, normalization_coefs)]

            out = self.get_update_for(normed_params, r, normed_grads)

            param_updates = [out[:, i] for i in range(len(self.inputs))]

            track_param_updates, sigma_update = param_updates[:-1], param_updates[-1]

            ### sigma (last parameter) is updated simply by replacing
            ### previous variable
            update = [
                var + upd * alpha
                for var, upd in zip(prev[:-1], track_param_updates)
            ] + [T.exp(-sigma_update)]

            for var, upd, new in zip(prev[:-1], track_param_updates, update):
                print '  -', new, '=', var, '+ %.2e' % alpha, upd

            iterations.append(update)
            responses.append(r)

        prediction = iterations[-1]

        sigma_train = T.fscalar('sigma_train')

        ### Except sigma
        self.true_parameters_shareds = [
            theano.shared(np.ndarray(shape=(0, ), dtype='float32'), name=name)
            for name in retina_model.model_params_names[:-1]
        ]

        ### predictions without sigma
        print 'Constructing loss:'
        print '  - Loss coefs:', loss_coefs
        print '  - True params shared:', self.true_parameters_shareds
        print '  - Predictions:', prediction[:-1]
        print '  - Sigma:', sigma_train

        pure_response, rmse = retina_model.parameter_response(
            loss_coefs,
            *self.true_parameters_shareds + prediction[:-1] + [sigma_train])

        pure_loss = 1.0 - pure_response

        initial_response, initial_rmse = retina_model.parameter_response(
            loss_coefs,
            *self.true_parameters_shareds + self.inputs[:-1] + [sigma_train])

        initial_loss = 1.0 - initial_response

        reg_c = T.fscalar('reg_c')
        alpha_rmse = T.fscalar('alpha_rmse')

        loss = (1.0 -
                alpha_rmse) * pure_loss + alpha_rmse * rmse + reg_c * self.reg

        params = layers.get_all_params(self.out_layer)
        learning_rate = T.fscalar('learning rate')

        net_updates = updates.adadelta(loss,
                                       params,
                                       learning_rate=learning_rate)

        self._train = theano.function(
            self.inputs + [sigma_train, learning_rate, reg_c, alpha_rmse],
            [pure_loss, rmse, self.reg, loss, initial_loss, initial_rmse],
            updates=net_updates)

        self._loss = theano.function(self.inputs + [sigma_train], pure_loss)

        outputs = [v for it in iterations for v in it]

        self.ndim = len(self.inputs)

        self.predictions = theano.function(self.inputs, responses + outputs)

        self.responses = None
        self.traces = None
        self.seeds = None
Example #46
    def buildModel(self):
        print(' -- Building...')
        x_init = sparse.csr_matrix('x', dtype='float32')
        y_init = T.imatrix('y')
        gx_init = sparse.csr_matrix('gx', dtype='float32')
        gy_init = T.ivector('gy')
        gz_init = T.vector('gz')
        mask_init = T.fmatrix('subMask')

        # step train
        x_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                 input_var=x_init)
        x_to_label = layers.SparseLayer(x_input, self.y.shape[1],
                                        nonlinearity=lg.nonlinearities.softmax)
        x_to_emd = layers.SparseLayer(x_input, self.embedding_size)
        W = x_to_emd.W
        x_to_emd = layers.DenseLayer(x_to_emd, self.y.shape[1],
                                     nonlinearity=lg.nonlinearities.softmax)
        x_concat = lgl.ConcatLayer([x_to_label, x_to_emd], axis=1)
        x_concat = layers.DenseLayer(x_concat, self.y.shape[1],
                                     nonlinearity=lg.nonlinearities.softmax)
        pred = lgl.get_output(x_concat)
        step_loss = lgo.categorical_crossentropy(pred, y_init).mean()
        hid_loss = lgl.get_output(x_to_label)
        step_loss += lgo.categorical_crossentropy(hid_loss, y_init).mean()
        emd_loss = lgl.get_output(x_to_emd)
        step_loss += lgo.categorical_crossentropy(emd_loss, y_init).mean()
        step_params = lgl.get_all_params(x_concat)
        step_updates = lg.updates.sgd(step_loss, step_params,
                                      learning_rate=self.step_learning_rate)
        self.step_train = theano.function([x_init, y_init], step_loss,
                                          updates=step_updates)
        self.test_fn = theano.function([x_init], pred)

        # supervised train
        gx_input = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                  input_var=gx_init)
        gx_to_emd = layers.SparseLayer(gx_input, self.embedding_size, W=W)
        gx_to_emd = lgl.DenseLayer(gx_to_emd, self.num_ver,
                                   nonlinearity=lg.nonlinearities.softmax)
        gx_pred = lgl.get_output(gx_to_emd)
        g_loss = lgo.categorical_crossentropy(gx_pred, gy_init).sum()
        sup_params = lgl.get_all_params(gx_to_emd)
        sup_updates = lg.updates.sgd(g_loss, sup_params,
                                     learning_rate=self.sup_learning_rate)
        self.sup_train = theano.function([gx_init, gy_init, gz_init], g_loss,
                                         updates=sup_updates,
                                         on_unused_input='ignore')

        # handle lstm input
        cross_entropy = lgo.categorical_crossentropy(gx_pred, gy_init)
        cross_entropy = T.reshape(cross_entropy, (1, self.subpath_num), ndim=None)
        mask_input = lgl.InputLayer(shape=(None, self.window_size + 1),
                                    input_var=mask_init)
        sub_path_batch1 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input1 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch1)
        sub_path_batch2 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input2 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch2)
        sub_path_batch3 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input3 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch3)
        sub_path_batch4 = sparse.csr_matrix('x', dtype='float32')
        sub_path_input4 = lgl.InputLayer(shape=(None, self.x.shape[1]),
                                         input_var=sub_path_batch4)
        sub_path_emd1 = layers.SparseLayer(sub_path_input1, self.embedding_size,
                                           W=W)
        sub_path_emd1 = T.reshape(lgl.get_output(sub_path_emd1),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd2 = layers.SparseLayer(sub_path_input2,
                                           self.embedding_size, W=W)
        sub_path_emd2 = T.reshape(lgl.get_output(sub_path_emd2),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd3 = layers.SparseLayer(sub_path_input3, self.embedding_size,
                                           W=W)
        sub_path_emd3 = T.reshape(lgl.get_output(sub_path_emd3),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_emd4 = layers.SparseLayer(sub_path_input4, self.embedding_size,
                                           W=W)
        sub_path_emd4 = T.reshape(lgl.get_output(sub_path_emd4),
                                  (self.subpath_num, 1, self.embedding_size))
        sub_path_concat = T.concatenate([sub_path_emd1, sub_path_emd2,
                                         sub_path_emd3, sub_path_emd4], axis=1)
        sub_path_concat_layer = lgl.InputLayer(shape=(None, self.window_size + 1,
                                                      self.embedding_size),
                                               input_var=sub_path_concat)

        # lstm layer
        lstm_layer = lgl.LSTMLayer(sub_path_concat_layer,
                                   self.lstm_hidden_units,
                                   grad_clipping=3,
                                   mask_input=mask_input)

        # handle path weight
        max1 = T.mean(lgl.get_output(lstm_layer), axis=1)
        max2 = T.mean(max1, axis=1)
        max2_init = T.fcol('max2')
        max2_init = T.reshape(max2, ((self.subpath_num, 1)))
        max2_input = lgl.InputLayer(shape=(self.subpath_num, 1),
                                    input_var=max2_init)
        max2_input = lgl.BatchNormLayer(max2_input)
        path_weight = lgl.get_output(max2_input)
        path_weight = lg.nonlinearities.sigmoid(path_weight)
        path_weight = 1 + 0.3 * path_weight
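        # the sigmoid squashes the batch-normalised LSTM summary into (0, 1), and the
        # affine map above rescales it to (1.0, 1.3): every sub-path keeps at least its
        # original loss weight and is up-weighted by at most 30%.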

        # unsupervised train
        reweight_loss = T.dot(cross_entropy, path_weight)[0][0]
        lstm_params = lgl.get_all_params(lstm_layer, trainable=True)
        lstm_updates = lg.updates.sgd(reweight_loss, lstm_params,
                                      learning_rate=0.01)
        self.lstm_fn = theano.function([gx_init, gy_init, gz_init,
                                        sub_path_batch1, sub_path_batch2,
                                        sub_path_batch3, sub_path_batch4,
                                        mask_init],
                                       reweight_loss,
                                       updates=lstm_updates,
                                       on_unused_input='ignore')
        alpha_updates = lg.updates.sgd(reweight_loss, sup_params,
                                       learning_rate=0.001)
        self.alpha_fn = theano.function([gx_init, gy_init, gz_init,
                                         sub_path_batch1, sub_path_batch2,
                                         sub_path_batch3, sub_path_batch4,
                                         mask_init],
                                        reweight_loss,
                                        updates=alpha_updates,
                                        on_unused_input='ignore')

        print(' -- Done!')
Beispiel #47
0
    l_output = DenseLayer(l_hidden,
                          num_units=len(classes),
                          nonlinearity=softmax,
                          W=Constant())
    # Now, we can generate the symbolic expression of the network's output given an input variable.
    net_input = T.matrix('net_input')
    net_output = get_output(l_output, net_input)

    # As a loss function, we'll use Theano's categorical_crossentropy function.
    # This allows for the network output to be class probabilities,
    # but the target output to be class labels.
    true_output = T.ivector('true_output')
    loss = T.mean(T.nnet.categorical_crossentropy(net_output, true_output))
    # Retrieving all parameters of the network is done using get_all_params,
    # which recursively collects the parameters of all layers connected to the provided layer.
    all_params = get_all_params(l_output)
    # Now, we'll generate updates using Lasagne's SGD function
    updates = sgd(loss, all_params, learning_rate=0.01)
    # Finally, we can compile Theano functions for training and computing the output.
    training = function([net_input, true_output], loss, updates=updates)
    prediction = function([net_input], net_output)

    # Train for 100 epochs
    print 'epoch  logloss'
    for k in xrange(100):
        # this is logloss
        res = training(trainT, classT)
        print '{0:3d}  {1:.4f}'.format(k, res)

    # Compute the predicted label of the training data.
    # The argmax converts the class probability output to class label
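    # A minimal sketch of that final step (assuming numpy is available as np, as in
    # the surrounding examples): argmax over each row of class probabilities.
    train_probabilities = prediction(trainT)
    predicted_labels = np.argmax(train_probabilities, axis=1)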
Beispiel #48
0
disc_layers = [ll.InputLayer(shape=(None, 3, 32, 32))]
disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.2))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 96, (3,3), pad=1, stride=2, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.5))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=1, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=1, stride=2, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.5))
disc_layers.append(nn.weight_norm(dnn.Conv2DDNNLayer(disc_layers[-1], 192, (3,3), pad=0, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(nn.weight_norm(ll.NINLayer(disc_layers[-1], num_units=192, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(nn.weight_norm(ll.NINLayer(disc_layers[-1], num_units=192, W=Normal(0.05), nonlinearity=nn.lrelu)))
disc_layers.append(ll.GlobalPoolLayer(disc_layers[-1]))
disc_layers.append(nn.weight_norm(ll.DenseLayer(disc_layers[-1], num_units=16, W=Normal(0.05), nonlinearity=None), train_g=True, init_stdv=0.1))
disc_params = ll.get_all_params(disc_layers, trainable=True)

x_temp = T.tensor4()

temp = ll.get_output(gen_layers[-1], deterministic=False, init=True)
temp = ll.get_output(disc_layers[-1], x_temp, deterministic=False, init=True)
init_updates = [u for l in gen_layers+disc_layers for u in getattr(l,'init_updates',[])]

init_param = th.function(inputs=[x_temp], outputs=None, updates=init_updates)
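# (the two get_output calls above are executed only for their side effect: with
# init=True each weight_norm layer appears to record data-dependent initialisation
# updates, which init_param then applies once on a real input batch)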

# costs
labels = T.ivector()
x_lab = T.tensor4()
x_unl = T.tensor4()

output_before_softmax_lab = ll.get_output(disc_layers[-1], x_lab, deterministic=False)
Beispiel #49
0
target_var = T.imatrix('targets');

input_layer_index = map(lambda pair : pair[0], ae.layers).index('input');
first_layer = ae.get_all_layers()[input_layer_index + 1];
input_layer = layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var);
first_layer.input_layer = input_layer;

encode_layer_index = map(lambda pair : pair[0], ae.layers).index('encode_layer');
encode_layer = ae.get_all_layers()[encode_layer_index];
fc_layer = layers.DenseLayer(incoming = encode_layer, num_units = 30, nonlinearity = rectify);
network = layers.DenseLayer(incoming = fc_layer, num_units = classn, nonlinearity = sigmoid);

prediction = layers.get_output(network);
loss = lasagne.objectives.binary_crossentropy(prediction, target_var).mean();

params = layers.get_all_params(network, trainable=True);
updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=0.0005, momentum=0.975);

test_output = lasagne.layers.get_output(network, deterministic=True);
test_loss = lasagne.objectives.binary_crossentropy(test_output, target_var).mean();

test_acc = T.mean(T.eq(T.gt(test_output, 0.5), target_var), dtype=theano.config.floatX);
test_pred = T.gt(test_output, 0.5);

train_fn = theano.function([input_var, target_var], loss, updates=updates);
val_fn = theano.function([input_var, target_var], [test_loss, test_acc, test_pred]);

print("Starting training...");
print("TrLoss\t\tVaLoss\t\tVaAcc\t\tEpochs\t\tTime");
sys.stdout.flush();
num_epochs = 300;
Beispiel #50
0
    def train(self):

        self.G_weights_layer = nn.softmax_weights(self.args.ng, LL.InputLayer(shape=(), input_var=self.dummy_input))
        self.D_weights_layer = nn.softmax_weights(self.args.ng, LL.InputLayer(shape=(), input_var=self.dummy_input))

        self.G_weights = LL.get_output(self.G_weights_layer, None, deterministic=True)
        self.D_weights = LL.get_output(self.D_weights_layer, None, deterministic=True)

        self.Disc_weights_entropy = T.sum((-1./self.args.nd) * T.log(self.D_weights + 0.000001), [0,1])
        self.Gen_weights_entropy = T.sum((-1./self.args.ng) * T.log(self.G_weights + 0.000001), [0,1]) 

        for i in range(self.args.ng):
            gen_layers_i, gen_x_i = self.get_generator(self.meanx, self.z, self.y_1hot)
            self.G_layers.append(gen_layers_i)
            self.Gen_x_list.append(gen_x_i)
        self.Gen_x = T.concatenate(self.Gen_x_list, axis=0)

        for i in range(self.args.nd):
            disc_layers_i, disc_layer_adv_i, disc_layer_z_recon_i = self.get_discriminator()
            self.D_layers.append(disc_layers_i)
            self.D_layer_adv.append(disc_layer_adv_i)
            self.D_layer_z_recon.append(disc_layer_z_recon_i)
            #T.set_subtensor(self.Gen_x[i*self.args.batch_size:(i+1)*self.args.batch_size], gen_x_i)

            #self.samplers.append(self.sampler(self.z[i], self.y))
        ''' forward pass '''
        loss_gen0_cond_list = []
        loss_disc0_class_list = []
        loss_disc0_adv_list = []
        loss_gen0_ent_list = []
        loss_gen0_adv_list = []
        #loss_disc_list
        
        for i in range(self.args.ng):
            self.y_recon_list.append(LL.get_output(self.enc_layer_fc4, self.Gen_x_list[i], deterministic=True)) # encoder's class predictions for the generated images

        for i in range(self.args.ng):
            #loss_gen0_cond = T.mean((recon_fc3_list[i] - self.real_fc3)**2) # feature loss, euclidean distance in feature space
            loss_gen0_cond = T.mean(T.nnet.categorical_crossentropy(self.y_recon_list[i], self.y))
            loss_disc0_class = 0
            loss_disc0_adv = 0
            loss_gen0_ent = 0
            loss_gen0_adv = 0
            for j in range(self.args.nd):
                output_before_softmax_real0 = LL.get_output(self.D_layer_adv[j], self.x, deterministic=False) 
                output_before_softmax_gen0, recon_z0 = LL.get_output([self.D_layer_adv[j], self.D_layer_z_recon[j]], self.Gen_x_list[i], deterministic=False) # discriminator's predicted probability that gen_x is real
                ''' loss for discriminator and Q '''
                l_lab0 = output_before_softmax_real0[T.arange(self.args.batch_size),self.y]
                l_unl0 = nn.log_sum_exp(output_before_softmax_real0)
                l_gen0 = nn.log_sum_exp(output_before_softmax_gen0)
                loss_disc0_class += T.dot(self.D_weights[0,j], -T.mean(l_lab0) + T.mean(T.mean(nn.log_sum_exp(output_before_softmax_real0)))) # loss for not correctly classifying the category of real images
                loss_real0 = -T.mean(l_unl0) + T.mean(T.nnet.softplus(l_unl0)) # loss for classifying real as fake
                loss_fake0 = T.mean(T.nnet.softplus(l_gen0)) # loss for classifying fake as real
                loss_disc0_adv += T.dot(self.D_weights[0,j], 0.5*loss_real0 + 0.5*loss_fake0)
                loss_gen0_ent += T.dot(self.D_weights[0,j], T.mean((recon_z0 - self.z)**2))
                #loss_gen0_ent = T.mean((recon_z0 - self.z)**2)
                ''' loss for generator '''
                loss_gen0_adv += T.dot(self.D_weights[0,j], -T.mean(T.nnet.softplus(l_gen0)))
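                # Why loss_real0 / loss_fake0 take this form: with class logits l_1..l_K
                # the implicit "real" probability is D(x) = Z(x) / (Z(x) + 1), where
                # Z(x) = sum_k exp(l_k) = exp(log_sum_exp(l)).  Therefore
                #   -log D(x)      = -log_sum_exp(l) + softplus(log_sum_exp(l))
                #   -log(1 - D(x)) =  softplus(log_sum_exp(l)),
                # which are exactly the two unsupervised terms accumulated above.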

            loss_gen0_cond_list.append(T.dot(self.G_weights[0,i], loss_gen0_cond))
            loss_disc0_class_list.append(T.dot(self.G_weights[0,i], loss_disc0_class))
            loss_disc0_adv_list.append(T.dot(self.G_weights[0,i], loss_disc0_adv))
            loss_gen0_ent_list.append(T.dot(self.G_weights[0,i], loss_gen0_ent))
            loss_gen0_adv_list.append(T.dot(self.G_weights[0,i], loss_gen0_adv))

        self.loss_gen0_cond = sum(loss_gen0_cond_list)
        self.loss_disc0_class = sum(loss_disc0_class_list)
        self.loss_disc0_adv = sum(loss_disc0_adv_list)
        self.loss_gen0_ent = sum(loss_gen0_ent_list)
        self.loss_gen0_adv = sum(loss_gen0_adv_list)

        self.loss_disc = self.args.labloss_weight * self.loss_disc0_class + self.args.advloss_weight * self.loss_disc0_adv + self.args.entloss_weight * self.loss_gen0_ent + self.args.mix_entloss_weight * self.Disc_weights_entropy
        self.loss_gen = self.args.advloss_weight * self.loss_gen0_adv + self.args.condloss_weight * self.loss_gen0_cond + self.args.entloss_weight * self.loss_gen0_ent + self.args.mix_entloss_weight * self.Gen_weights_entropy

        if self.args.load_epoch is not None:
            print("loading model")
            self.load_model(self.args.load_epoch)
            print("success")

        ''' collect parameter updates for discriminators '''
        Disc_params = LL.get_all_params(self.D_weights_layer, trainable=True)
        Disc_bn_updates = []
        Disc_bn_params = []

        self.threshold = self.mincost + self.args.labloss_weight * self.loss_disc0_class + self.args.entloss_weight * self.loss_gen0_ent + self.args.mix_entloss_weight * self.Disc_weights_entropy
        #threshold = mincost + self.args.labloss_weight * self.loss_disc0_class + self.args.entloss_weight * self.loss_gen0_ent

        for i in range(self.args.nd):
            Disc_params.extend(LL.get_all_params(self.D_layers[i], trainable=True))
            Disc_bn_updates.extend([u for l in LL.get_all_layers(self.D_layers[i][-1]) for u in getattr(l,'bn_updates',[])])
            for l in LL.get_all_layers(self.D_layers[i][-1]):
                if hasattr(l, 'avg_batch_mean'):
                    Disc_bn_params.append(l.avg_batch_mean)
                    Disc_bn_params.append(l.avg_batch_var)
        Disc_param_updates = nn.adam_conditional_updates(Disc_params, self.loss_disc, mincost=self.threshold, lr=self.disc_lr, mom1=0.5) # if loss_disc_x < mincost, don't update the discriminator
        Disc_param_avg = [th.shared(np.cast[th.config.floatX](0.*p.get_value())) for p in Disc_params] # initialized with 0
        Disc_avg_updates = [(a,a+0.0001*(p-a)) for p,a in zip(Disc_params, Disc_param_avg)] # online update of historical parameters
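        # A minimal sketch (an illustrative assumption, not nn.adam_conditional_updates
        # itself) of how such gating can be expressed in Theano: each parameter keeps
        # its current value while the discriminator loss is already below the threshold,
        # and otherwise takes a step (plain SGD here; the real helper uses Adam state).
        from theano.ifelse import ifelse
        gated_updates_sketch = [
            (p, ifelse(self.loss_disc < self.threshold,
                       p, T.cast(p - self.disc_lr * g, p.dtype)))
            for p, g in zip(Disc_params,
                            T.grad(self.loss_disc, Disc_params, disconnected_inputs='warn'))
        ]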

        """
        #Disc_param_updates = nn.adam_updates(Disc_params, self.loss_disc, lr=self.lr, mom1=0.5) 
        # collect parameters
        #Disc_params = LL.get_all_params(self.D_layers[-1], trainable=True)
        Disc_params = LL.get_all_params(self.D_layers, trainable=True)
        #Disc_param_updates = nn.adam_updates(Disc_params, loss_disc_x, lr=lr, mom1=0.5) # loss for discriminator = supervised_loss + unsupervised loss
        Disc_param_updates = nn.adam_conditional_updates(Disc_params, self.loss_disc, mincost=threshold, lr=self.disc_lr, mom1=0.5) # if loss_disc_x < mincost, don't update the discriminator
        Disc_param_avg = [th.shared(np.cast[th.config.floatX](0.*p.get_value())) for p in Disc_params] # initialized with 0
        Disc_avg_updates = [(a,a+0.0001*(p-a)) for p,a in zip(Disc_params,Disc_param_avg)] # online update of historical parameters
        #Disc_avg_givens = [(p,a) for p,a in zip(Disc_params,Disc_param_avg)]
        Disc_bn_updates = [u for l in LL.get_all_layers(self.D_layers[-1]) for u in getattr(l,'bn_updates',[])]
        Disc_bn_params = []
        for l in LL.get_all_layers(self.D_layers[-1]):
            if hasattr(l, 'avg_batch_mean'):
                Disc_bn_params.append(l.avg_batch_mean)
                Disc_bn_params.append(l.avg_batch_var)
        """


        ''' collect parameter updates for generators '''
        Gen_params = LL.get_all_params(self.G_weights_layer, trainable=True)
        Gen_params_updates = []
        Gen_bn_updates = []
        Gen_bn_params = []

        for i in range(self.args.ng):
            Gen_params.extend(LL.get_all_params(self.G_layers[i][-1], trainable=True))
            Gen_bn_updates.extend([u for l in LL.get_all_layers(self.G_layers[i][-1]) for u in getattr(l,'bn_updates',[])])
            for l in LL.get_all_layers(self.G_layers[i][-1]):
                if hasattr(l, 'avg_batch_mean'):
                    Gen_bn_params.append(l.avg_batch_mean)
                    Gen_bn_params.append(l.avg_batch_var)
        Gen_param_updates = nn.adam_updates(Gen_params, self.loss_gen, lr=self.gen_lr, mom1=0.5) 
        """
        #print(Gen_params)
        #train_batch_gen = th.function(inputs=[self.x, self.meanx, self.z, self.y_1hot, self.lr], outputs=[self.loss_gen], on_unused_input='warn')
        #theano.printing.debugprint(train_batch_gen) 
        Gen_param_updates = nn.adam_updates(Gen_params, self.loss_gen, lr=self.lr, mom1=0.5) 
        Gen_params = LL.get_all_params(self.G_layers[-1], trainable=True)
        Gen_param_updates = nn.adam_updates(Gen_params, self.loss_gen, lr=self.gen_lr, mom1=0.5)
        Gen_bn_updates = [u for l in LL.get_all_layers(self.G_layers[-1]) for u in getattr(l,'bn_updates',[])]
        Gen_bn_params = []
        for l in LL.get_all_layers(self.G_layers[-1]):
            if hasattr(l, 'avg_batch_mean'):
                Gen_bn_params.append(l.avg_batch_mean)
                Gen_bn_params.append(l.avg_batch_var)
         """

        ''' define training and testing functions '''
        #train_batch_disc = th.function(inputs=[x, meanx, y, lr], outputs=[loss_disc0_class, loss_disc0_adv, gen_x, x], 
        #    updates=disc0_param_updates+disc0_bn_updates) 
        #th.printing.debugprint(self.loss_disc)  
        train_batch_disc = th.function(inputs=[self.dummy_input, self.meanx, self.x, self.y, self.y_1hot, self.mincost, self.disc_lr], outputs=[self.loss_disc0_class, self.loss_disc0_adv], updates=Disc_param_updates+Disc_bn_updates+Disc_avg_updates) 
        #th.printing.pydotprint(train_batch_disc, outfile="logreg_pydotprint_prediction.png", var_with_name_simple=True)  
        #train_batch_gen = th.function(inputs=[x, meanx, y_1hot, lr], outputs=[loss_gen0_adv, loss_gen0_cond, loss_gen0_ent], 
        #    updates=gen0_param_updates+gen0_bn_updates)
        #train_batch_gen = th.function(inputs=gen_inputs, outputs=gen_outputs, updates=gen0_param_updates+gen0_bn_updates)
        #train_batch_gen = th.function(inputs=[self.dummy_input, self.x, self.meanx, self.z, self.y_1hot, self.lr], outputs=[self.loss_gen0_adv, self.loss_gen0_cond, self.loss_gen0_ent], updates=Gen_param_updates+Gen_bn_updates)
        train_batch_gen = th.function(inputs=[self.dummy_input, self.meanx, self.y, self.y_1hot, self.gen_lr], outputs=[self.loss_gen0_adv, self.loss_gen0_cond, self.loss_gen0_ent], updates=Gen_param_updates+Gen_bn_updates)
        

        # samplefun = th.function(inputs=[meanx, y_1hot], outputs=gen_x_joint)   # sample function: generating images by stacking all generators
        reconfun = th.function(inputs=[self.meanx, self.y_1hot], outputs=self.Gen_x)       # reconstruction function: generate images from all
                                                                # generators, conditioned on the one-hot class labels
        mix_weights = th.function(inputs=[self.dummy_input], outputs=[self.D_weights, self.Disc_weights_entropy, self.G_weights, self.Gen_weights_entropy])

        ''' load data '''
        print("Loading data...")
        meanimg, data = load_cifar_data(self.args.data_dir)
        trainx = data['X_train']
        trainy = data['Y_train']
        nr_batches_train = int(trainx.shape[0]/self.args.batch_size)
        # testx = data['X_test']
        # testy = data['Y_test']
        # nr_batches_test = int(testx.shape[0]/self.args.batch_size)

        ''' perform training  ''' 
        #logs = {'loss_gen0_adv': [], 'loss_gen0_cond': [], 'loss_gen0_ent': [], 'loss_disc0_class': [], 'var_gen0': [], 'var_real0': []} # training logs
        logs = {'loss_gen0_adv': [], 'loss_gen0_cond': [], 'loss_gen0_ent': [], 'loss_disc0_class': []} # training logs
        for epoch in range(self.args.load_epoch+1, self.args.num_epoch):
            begin = time.time()

            ''' shuffling '''
            inds = rng.permutation(trainx.shape[0])
            trainx = trainx[inds]
            trainy = trainy[inds]

            for t in range(nr_batches_train):
            #for t in range(1):
                ''' construct minibatch '''
                #batchz = np.random.uniform(size=(self.args.batch_size, self.args.z0dim)).astype(np.float32)
                batchx = trainx[t*self.args.batch_size:(t+1)*self.args.batch_size]
                batchy = trainy[t*self.args.batch_size:(t+1)*self.args.batch_size]
                batchy_1hot = np.zeros((self.args.batch_size, 10), dtype=np.float32)
                batchy_1hot[np.arange(self.args.batch_size), batchy] = 1 # convert to one-hot label
                # randomy = np.random.randint(10, size = (self.args.batch_size,))
                # randomy_1hot = np.zeros((self.args.batch_size, 10),dtype=np.float32)
                # randomy_1hot[np.arange(self.args.batch_size), randomy] = 1

                ''' train discriminators '''
                l_disc0_class, l_disc0_adv = train_batch_disc(0.0, meanimg, batchx, batchy, batchy_1hot, self.args.mincost, self.args.disc_lr)

                ''' train generators '''
                #prob_gen0 = np.exp()
                if l_disc0_adv > 0.65:
                    n_iter = 1
                elif l_disc0_adv > 0.5:
                    n_iter = 3
                elif l_disc0_adv > 0.3:
                    n_iter = 5
                else:
                    n_iter = 7
                for i in range(n_iter):
                    #l_gen0_adv, l_gen0_cond, l_gen0_ent = train_batch_gen(0.0, batchx, meanimg, batchz, batchy_1hot, self.args.gen_lr)
                    l_gen0_adv, l_gen0_cond, l_gen0_ent = train_batch_gen(0.0, meanimg, batchy, batchy_1hot, self.args.gen_lr)

                d_mix_weights, d_entloss, g_mix_weights, g_entloss = mix_weights(0.0)


                ''' store log information '''
                # logs['loss_gen1_adv'].append(l_gen1_adv)
                # logs['loss_gen1_cond'].append(l_gen1_cond)
                # logs['loss_gen1_ent'].append(l_gen1_ent)
                # logs['loss_disc1_class'].append(l_disc1_class)
                # logs['var_gen1'].append(np.var(np.array(g1)))
                # logs['var_real1'].append(np.var(np.array(r1)))

                logs['loss_gen0_adv'].append(l_gen0_adv)
                logs['loss_gen0_cond'].append(l_gen0_cond)
                logs['loss_gen0_ent'].append(l_gen0_ent)
                logs['loss_disc0_class'].append(l_disc0_class)
                #logs['var_gen0'].append(np.var(np.array(g0)))
                #logs['var_real0'].append(np.var(np.array(r0)))
                
                print("---Epoch %d, time = %ds" % (epoch, time.time()-begin))
                print("D_weights=[%.6f, %.6f, %.6f, %.6f, %.6f] loss = %0.6f" % (d_mix_weights[0,0], d_mix_weights[0,1], d_mix_weights[0,2], d_mix_weights[0,3], d_mix_weights[0,4], d_entloss))
                print("G_weights=[%.6f, %.6f, %.6f, %.6f, %.6f] loss = %0.6f" % (g_mix_weights[0,0], g_mix_weights[0,1], g_mix_weights[0,2], g_mix_weights[0,3], g_mix_weights[0,4], g_entloss))
                #print("G_weights=[%.6f]" % (g_mix_weights[0,0]))
                print("loss_disc0_adv = %.4f, loss_gen0_adv = %.4f,  loss_gen0_cond = %.4f, loss_gen0_ent = %.4f, loss_disc0_class = %.4f" % (l_disc0_adv, l_gen0_adv, l_gen0_cond, l_gen0_ent, l_disc0_class))
            # ''' sample images by stacking all generators'''
            # imgs = samplefun(meanimg, refy_1hot)
            # imgs = np.transpose(np.reshape(imgs[:100,], (100, 3, 32, 32)), (0, 2, 3, 1))
            # imgs = [imgs[i] for i in range(100)]
            # rows = []
            # for i in range(10):
            #     rows.append(np.concatenate(imgs[i::10], 1))
            # imgs = np.concatenate(rows, 0)
            # scipy.misc.imsave(self.args.out_dir + "/mnist_sample_epoch{}.png".format(epoch), imgs)

            """
            ''' original images in the training set'''
            orix = np.transpose(np.reshape(batchx[:100,], (100, 3, 32, 32)), (0, 2, 3, 1))
            orix = [orix[i] for i in range(100)]
            rows = []
            for i in range(10):
                rows.append(np.concatenate(orix[i::10], 1))
            orix = np.concatenate(rows, 0)
            scipy.misc.imsave(self.args.out_dir + "/mnist_ori_epoch{}.png".format(epoch), orix)
            """

            if epoch%self.args.save_interval==0:
                # np.savez(self.args.out_dir + "/disc1_params_epoch{}.npz".format(epoch), *LL.get_all_param_values(disc1_layers[-1]))
                # np.savez(self.args.out_dir + '/gen1_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(gen1_layers[-1]))
                #np.savez(self.args.out_dir + "/disc0_params_epoch{}.npz".format(epoch), *LL.get_all_param_values(disc0_layers))
                #np.savez(self.args.out_dir + '/gen0_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(gen0_layers))
                np.savez(self.args.out_dir + '/Dweights_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(self.D_weights_layer))
                np.savez(self.args.out_dir + '/Gweights_params_epoch{}.npz'.format(epoch), *LL.get_all_param_values(self.G_weights_layer))
                for i in range(self.args.ng):
                    np.savez(self.args.out_dir + ("/disc%d_params_epoch%d.npz" % (i,epoch)), *LL.get_all_param_values(self.D_layers[i]))
                    np.savez(self.args.out_dir + ("/gen%d_params_epoch%d.npz" % (i,epoch)), *LL.get_all_param_values(self.G_layers[i]))
                np.save(self.args.out_dir + '/logs.npy',logs)

            ''' reconstruct images '''
            reconx = reconfun(meanimg, batchy_1hot) + meanimg
            width = np.round(np.sqrt(self.args.batch_size)).astype(int)
            for i in range(self.args.ng):
                reconx_i = np.transpose(np.reshape(reconx[i*self.args.batch_size:(i+1)*self.args.batch_size], (self.args.batch_size, 3, 32, 32)), (0, 2, 3, 1))
                reconx_i = [reconx_i[j] for j in range(self.args.batch_size)]
                rows = []
                for j in range(width):
                    rows.append(np.concatenate(reconx_i[j::width], 1))
                reconx_i = np.concatenate(rows, 0)
                scipy.misc.imsave(self.args.out_dir + ("/cifar_recon_%d_epoch%d.png"%(i,epoch)), reconx_i) 
Beispiel #51
0
    def __init__(self, train_list_raw, test_list_raw, png_folder, batch_size,
                 dropout, l2, mode, batch_norm, **kwargs):

        print("==> not used params in DMN class:", kwargs.keys())
        self.train_list_raw = train_list_raw
        self.test_list_raw = test_list_raw
        self.png_folder = png_folder
        self.batch_size = batch_size
        self.dropout = dropout
        self.l2 = l2
        self.mode = mode
        self.batch_norm = batch_norm

        self.input_var = T.tensor4('input_var')
        self.answer_var = T.ivector('answer_var')

        print("==> building network")
        example = np.random.uniform(size=(self.batch_size, 1, 128, 768),
                                    low=0.0,
                                    high=1.0).astype(np.float32)  #########
        answer = np.random.randint(low=0, high=176,
                                   size=(self.batch_size, ))  #########

        network = layers.InputLayer(shape=(None, 1, 128, 768),
                                    input_var=self.input_var)
        print(layers.get_output(network).eval({self.input_var: example}).shape)

        # CONV-RELU-POOL 1
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=16,
                                     filter_size=(7, 7),
                                     stride=1,
                                     nonlinearity=rectify)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=2,
                                        pad=2)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 2
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(5, 5),
                                     stride=1,
                                     nonlinearity=rectify)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=2,
                                        pad=2)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 3
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=32,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=2,
                                        pad=2)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 4
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=64,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=2,
                                        pad=2)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # CONV-RELU-POOL 5
        network = layers.Conv2DLayer(incoming=network,
                                     num_filters=64,
                                     filter_size=(3, 3),
                                     stride=1,
                                     nonlinearity=rectify)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        network = layers.MaxPool2DLayer(incoming=network,
                                        pool_size=(3, 3),
                                        stride=2,
                                        pad=2)
        print(layers.get_output(network).eval({self.input_var: example}).shape)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)

        # DENSE 1
        #network = layers.DenseLayer(incoming=network, num_units=256, nonlinearity=rectify)
        network = layers.DenseLayer(incoming=network,
                                    num_units=6144,
                                    nonlinearity=rectify)
        if (self.batch_norm):
            network = layers.BatchNormLayer(incoming=network)
        if (self.dropout > 0):
            network = layers.dropout(network, self.dropout)
        print(layers.get_output(network).eval({self.input_var: example}).shape)

        # Last layer: classification
        network = layers.DenseLayer(incoming=network,
                                    num_units=176,
                                    nonlinearity=softmax)
        print(layers.get_output(network).eval({self.input_var: example}).shape)

        self.params = layers.get_all_params(network, trainable=True)
        self.prediction = layers.get_output(network)
        self.test_prediction = layers.get_output(network, deterministic=True)

        print("==> param shapes", [x.eval().shape for x in self.params])

        def get_loss(prediction):
            loss_ce = lasagne.objectives.categorical_crossentropy(
                prediction, self.answer_var).mean()
            if (self.l2 > 0):
                loss_l2 = self.l2 * lasagne.regularization.regularize_network_params(
                    network, lasagne.regularization.l2)
            else:
                loss_l2 = 0
            return loss_ce + loss_l2

        self.loss = get_loss(self.prediction)
        self.test_loss = get_loss(self.test_prediction)

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        updates = lasagne.updates.momentum(self.loss,
                                           self.params,
                                           learning_rate=0.003)

        if self.mode == 'train':
            print("==> compiling train_fn")
            self.train_fn = theano.function(
                inputs=[self.input_var, self.answer_var],
                outputs=[self.prediction, self.loss],
                updates=updates)

        print("==> compiling test_fn")
        # deterministic version
        #self.test_fn = theano.function(inputs=[self.input_var, self.answer_var],
        #                               outputs=[self.test_prediction, self.test_loss])

        # non deterministic version, as train_fn
        self.test_fn = theano.function(
            inputs=[self.input_var, self.answer_var],
            outputs=[self.prediction, self.loss])
Beispiel #52
0
# compiling theano functions:
evaluate_generator = theano.function([z_var],
                                     get_output(generator),
                                     allow_input_downcast=True)

sample_generator = theano.function(
    [batchsize_var],
    samples_from_grenerator,
    allow_input_downcast=True,
)

sample_prior = theano.function([prior_variance_var, batchsize_var],
                               samples_from_prior,
                               allow_input_downcast=True)

params_D = get_all_params(discriminator['norm'], trainable=True)

updates_D = adam(loss_D, params_D, learning_rate=learningrate_var)

train_D = theano.function(
    [learningrate_var, batchsize_var, prior_variance_var],
    loss_D,
    updates=updates_D,
    allow_input_downcast=True)

params_G = get_all_params(generator, trainable=True)

updates_G = adam(loss_G, params_G, learning_rate=learningrate_var)

train_G = theano.function([x_var, y_var, learningrate_var, batchsize_var],
                          loss_G,
Beispiel #53
0
# target theano variable indicating the index a vertex should be mapped to wrt the latent space
target = T.ivector('idxs')

# to work with logit predictions, better behaved numerically
cla = utils_lasagne.categorical_crossentropy_logdomain(output, target,
                                                       nclasses).mean()
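# A plausible reading of the helper above (an assumption, not the verbatim
# utils_lasagne implementation): with log-domain predictions -- "output" holding
# log-softmax scores of shape (n, nclasses) -- the per-example cross-entropy is
# the negative log-probability assigned to the target class.
from theano.tensor.extra_ops import to_one_hot

def categorical_crossentropy_logdomain_sketch(log_predictions, targets, nclasses):
    targets_one_hot = to_one_hot(targets, nclasses)            # (n, nclasses)
    return -T.sum(targets_one_hot * log_predictions, axis=1)   # (n,)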
acc = LO.categorical_accuracy(pred, target).mean()

# a bit of regularization is commonly used
regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2)

cost = cla + l2_weight * regL2
''' Define the update rule, how to train '''

params = LL.get_all_params(ffn, trainable=True)
grads = T.grad(cost, params)
# computes the L2 norm of the gradient to better inspect training
grads_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grads]), 2)

# Adam turned out to be a very good choice for correspondence
updates = L.updates.adam(grads, params, learning_rate=0.001)
''' Compile '''

funcs = dict()
funcs['train'] = theano.function(
    [inp.input_var, patch_op.input_var, target],
    [cost, cla, l2_weight * regL2, grads_norm, acc],
    updates=updates,
    on_unused_input='warn')
funcs['acc_loss'] = theano.function(
Beispiel #54
0
def main():
    parser = argparse.ArgumentParser(
        description='Tuning with bi-directional LSTM-CNN-CRF')
    parser.add_argument('--fine_tune',
                        action='store_true',
                        help='Fine tune the word embeddings')
    parser.add_argument(
        '--embedding',
        choices=['word2vec', 'glove', 'senna', 'random', 'polyglot'],
        help='Embedding for words',
        required=True)
    parser.add_argument('--embedding_dict',
                        default=None,
                        help='path for embedding dict')
    parser.add_argument('--batch_size',
                        type=int,
                        default=10,
                        help='Number of sentences in each batch')
    parser.add_argument('--num_units',
                        type=int,
                        default=100,
                        help='Number of hidden units in LSTM')
    parser.add_argument('--num_filters',
                        type=int,
                        default=20,
                        help='Number of filters in CNN')
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help='Learning rate')
    parser.add_argument('--decay_rate',
                        type=float,
                        default=0.1,
                        help='Decay rate of learning rate')
    parser.add_argument('--grad_clipping',
                        type=float,
                        default=0,
                        help='Gradient clipping')
    parser.add_argument('--gamma',
                        type=float,
                        default=1e-6,
                        help='weight for regularization')
    parser.add_argument('--peepholes',
                        action='store_true',
                        help='Peepholes for LSTM')
    parser.add_argument('--oov',
                        choices=['random', 'embedding'],
                        help='Embedding for oov word',
                        required=True)
    parser.add_argument(
        '--update',
        choices=['sgd', 'momentum', 'nesterov', 'adadelta', 'adam'],
        help='update algorithm',
        default='sgd')
    parser.add_argument('--regular',
                        choices=['none', 'l2'],
                        help='regularization for training',
                        required=True)
    parser.add_argument('--dropout',
                        action='store_true',
                        help='Apply dropout layers')
    parser.add_argument('--patience',
                        type=int,
                        default=5,
                        help='Patience for early stopping')
    parser.add_argument('--output_prediction',
                        action='store_true',
                        help='Output predictions to temp files')
    parser.add_argument('--train')
    parser.add_argument('--dev')
    parser.add_argument('--test')
    parser.add_argument('--exp_dir')
    parser.add_argument('--adv', type=float, default=0)
    parser.add_argument('--seed', type=int, default=1234)
    parser.add_argument('--reload', default=None, help='path for reloading')

    args = parser.parse_args()
    np.random.seed(args.seed)
    lasagne.random.set_rng(np.random)

    def construct_input_layer():
        if fine_tune:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length),
                                                    input_var=input_var,
                                                    name='input')
            layer_embedding = Normalized_EmbeddingLayer(
                layer_input,
                input_size=alphabet_size,
                output_size=embedd_dim,
                vocab_freqs=word_freqs,
                W=embedd_table,
                name='embedding')
            raw_layer = layer_embedding
        else:
            layer_input = lasagne.layers.InputLayer(shape=(None, max_length,
                                                           embedd_dim),
                                                    input_var=input_var,
                                                    name='input')
            raw_layer = layer_input

        return raw_layer  # [batch, max_sent_length, embedd_dim]

    def construct_char_input_layer():
        layer_char_input = lasagne.layers.InputLayer(shape=(None,
                                                            max_sent_length,
                                                            max_char_length),
                                                     input_var=char_input_var,
                                                     name='char-input')
        layer_char_input = lasagne.layers.reshape(
            layer_char_input,
            (-1, [2]))  # [batch * max_sent_length, max_char_length]
        layer_char_embedding = Normalized_EmbeddingLayer(
            layer_char_input,
            input_size=char_alphabet_size,
            output_size=char_embedd_dim,
            vocab_freqs=char_freqs,
            W=char_embedd_table,
            name='char_embedding'
        )  # [n_examples, max_char_length, char_embedd_dim]

        #layer_char_input = lasagne.layers.DimshuffleLayer(layer_char_embedding, pattern=(0, 2, 1)) # [n_examples, char_embedd_dim, max_char_length]
        return layer_char_embedding

    logger = utils.get_logger("BiLSTM-BiLSTM-CRF")
    fine_tune = args.fine_tune
    oov = args.oov
    regular = args.regular
    embedding = args.embedding
    embedding_path = args.embedding_dict
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    update_algo = args.update
    grad_clipping = args.grad_clipping
    peepholes = args.peepholes
    gamma = args.gamma
    output_predict = args.output_prediction
    dropout = args.dropout

    exp_dir = args.exp_dir
    if not os.path.isdir(exp_dir): os.mkdir(exp_dir)
    exp_name = exp_dir.split('/')[-1]
    exp_mode = exp_name.split('_')[0]  # 'pos' or 'ner', etc.

    save_dir = os.path.join(exp_dir, 'save')
    eval_dir = os.path.join(exp_dir, 'eval')
    if not os.path.isdir(save_dir): os.mkdir(save_dir)
    if not os.path.isdir(eval_dir): os.mkdir(eval_dir)
    eval_script = "./conlleval"

    if exp_mode == 'pos':
        (word_col_in_data, label_col_in_data) = (0, 1)
    elif exp_mode == 'ner':
        (word_col_in_data, label_col_in_data) = (0, 3)
    elif exp_mode == 'chunk':
        (word_col_in_data, label_col_in_data) = (0, 2)
    else:
        (word_col_in_data, label_col_in_data) = (1, 3)  # assume CoNLL-U style

    # load data
    X_train, Y_train, mask_train, X_dev, Y_dev, mask_dev, X_test, Y_test, mask_test, \
    (embedd_table, word_freqs), label_alphabet, \
    C_train, C_dev, C_test, (char_embedd_table, char_freqs) = data_processor.load_dataset_sequence_labeling(train_path, dev_path,
                test_path, word_col_in_data, label_col_in_data,
                label_name=exp_mode, oov=oov,
                fine_tune=True,
                embedding=embedding, embedding_path=embedding_path,
                use_character=True)
    num_labels = label_alphabet.size() - 1

    logger.info("constructing network...")
    # create variables
    target_var = T.imatrix(name='targets')
    mask_var = T.matrix(name='masks', dtype=theano.config.floatX)
    num_tokens = mask_var.sum(dtype=theano.config.floatX)
    if fine_tune:
        input_var = T.imatrix(name='inputs')
        num_data, max_length = X_train.shape
        alphabet_size, embedd_dim = embedd_table.shape
    else:
        input_var = T.tensor3(name='inputs', dtype=theano.config.floatX)
        num_data, max_length, embedd_dim = X_train.shape
    char_input_var = T.itensor3(name='char-inputs')
    num_data_char, max_sent_length, max_char_length = C_train.shape
    char_alphabet_size, char_embedd_dim = char_embedd_table.shape
    assert (max_length == max_sent_length)
    assert (num_data == num_data_char)

    # prepare initial input layer and embeddings
    char_layer = construct_char_input_layer()
    word_layer = construct_input_layer()
    char_emb = Lyrs.get_output(char_layer)
    word_emb = Lyrs.get_output(word_layer)

    # construct input and mask layers
    char_in_layer = Lyrs.InputLayer(shape=(None, max_char_length,
                                           char_embedd_dim))
    word_in_layer = Lyrs.InputLayer(shape=(None, max_length, embedd_dim))

    layer_mask = lasagne.layers.InputLayer(shape=(None, max_length),
                                           input_var=mask_var,
                                           name='mask')

    # construct bilstm_bilstm_crf
    num_units = args.num_units
    num_filters = args.num_filters
    logger.info("Network structure: hidden=%d, filter=%d" %
                (num_units, num_filters))

    bilstm_bilstm_crf = build_BiLSTM_BiLSTM_CRF(char_in_layer,
                                                word_in_layer,
                                                num_units,
                                                num_labels,
                                                mask=layer_mask,
                                                grad_clipping=grad_clipping,
                                                peepholes=peepholes,
                                                num_filters=num_filters,
                                                dropout=dropout)

    # compute loss
    def loss_from_embedding(char_emb,
                            word_emb,
                            deterministic=False,
                            return_all=True):
        # get output of bi-lstm-cnn-crf, shape [batch, length, num_labels, num_labels]
        energies = Lyrs.get_output(bilstm_bilstm_crf,
                                   inputs={
                                       char_in_layer: char_emb,
                                       word_in_layer: word_emb
                                   },
                                   deterministic=deterministic)
        loss = crf_loss(energies, target_var, mask_var).mean()
        if return_all:
            predict, corr = crf_accuracy(energies, target_var)
            corr = (corr * mask_var).sum(dtype=theano.config.floatX)
            return loss, predict, corr
        else:
            return loss

    loss_eval, prediction_eval, corr_eval = loss_from_embedding(
        char_emb, word_emb, deterministic=True)
    loss_train_ori, _, corr_train = loss_from_embedding(char_emb, word_emb)

    if args.adv:
        logger.info('Preparing adversarial training...')
        loss_train_adv = adversarial_loss(char_emb,
                                          word_emb,
                                          loss_from_embedding,
                                          loss_train_ori,
                                          perturb_scale=args.adv)
        loss_train = (loss_train_ori + loss_train_adv) / 2.0
    else:
        loss_train_adv = T.as_tensor_variable(
            np.asarray(0.0, dtype=theano.config.floatX))
        loss_train = loss_train_ori + loss_train_adv
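    # A minimal sketch of what the adversarial_loss helper above is assumed to do
    # (embedding-level adversarial training in the spirit of Miyato et al.; the names
    # below are illustrative, not the project's implementation): perturb the embeddings
    # along the gradient of the clean loss, treating that direction as a constant, and
    # re-evaluate the loss at the perturbed point.
    def _adversarial_loss_sketch(char_e, word_e, loss_fn, clean_loss, perturb_scale):
        g_char, g_word = T.grad(clean_loss, [char_e, word_e])
        g_char = theano.gradient.disconnected_grad(g_char)  # no gradient through the direction
        g_word = theano.gradient.disconnected_grad(g_word)
        char_adv = char_e + perturb_scale * g_char / (T.sqrt(T.sum(g_char ** 2)) + 1e-12)
        word_adv = word_e + perturb_scale * g_word / (T.sqrt(T.sum(g_word ** 2)) + 1e-12)
        return loss_fn(char_adv, word_adv, return_all=False)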

    # l2 regularization?
    if regular == 'l2':
        l2_penalty = lasagne.regularization.regularize_network_params(
            bilstm_bilstm_crf, lasagne.regularization.l2)
        loss_train = loss_train + gamma * l2_penalty

    # Create update expressions for training.
    # hyper parameters to tune: learning rate, momentum, regularization.
    batch_size = args.batch_size
    learning_rate = 1.0 if update_algo == 'adadelta' else args.learning_rate
    decay_rate = args.decay_rate
    momentum = 0.9
    params = Lyrs.get_all_params(
        bilstm_bilstm_crf, trainable=True) + Lyrs.get_all_params(
            char_layer, trainable=True) + Lyrs.get_all_params(word_layer,
                                                              trainable=True)
    updates = utils.create_updates(loss_train,
                                   params,
                                   update_algo,
                                   learning_rate,
                                   momentum=momentum)

    # Compile a function performing a training step on a mini-batch
    train_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_train_ori, loss_train_adv, corr_train, num_tokens],
        updates=updates)
    # Compile a second function evaluating the loss and accuracy of network
    eval_fn = theano.function(
        [input_var, target_var, mask_var, char_input_var],
        [loss_eval, corr_eval, num_tokens, prediction_eval])

    # reload saved model
    if args.reload is not None:
        logger.info('Reloading saved parameters from %s ...\n' % args.reload)
        with np.load(args.reload) as f:
            param_values = [f['arr_%d' % j] for j in range(len(f.files))]
        Lyrs.set_all_param_values(word_layer, param_values[0:1])
        Lyrs.set_all_param_values(char_layer, param_values[1:2])
        Lyrs.set_all_param_values(bilstm_bilstm_crf, param_values[2:])

    # Finally, launch the training loop.
    logger.info(
        "Start training: %s with regularization: %s(%f), dropout: %s, fine tune: %s (#training data: %d, batch size: %d, clip: %.1f, peepholes: %s) ..." \
        % (
            update_algo, regular, (0.0 if regular == 'none' else gamma), dropout, fine_tune, num_data, batch_size,
            grad_clipping,
            peepholes))
    num_batches = num_data / batch_size
    num_epochs = 1000
    best_acc = np.array([0.0, 0.0, 0.0])
    best_epoch_acc = np.array([0, 0, 0])
    best_acc_test_err = np.array([0.0, 0.0, 0.0])
    best_acc_test_corr = np.array([0.0, 0.0, 0.0])
    stop_count = 0
    lr = learning_rate
    patience = args.patience
    for epoch in range(1, num_epochs + 1):
        print
        print 'Epoch %d (learning rate=%.7f, decay rate=%.4f): ' % (epoch, lr,
                                                                    decay_rate)
        train_err_ori = 0.0
        train_err_adv = 0.0
        train_corr = 0.0
        train_total = 0
        train_inst = 0
        start_time = time.time()
        num_back = 0
        train_batches = 0

        epoch_save_dir = os.path.join(save_dir, 'epoch%d' % epoch)
        os.mkdir(epoch_save_dir)

        for batch in utils.iterate_minibatches(X_train,
                                               Y_train,
                                               masks=mask_train,
                                               char_inputs=C_train,
                                               batch_size=batch_size,
                                               shuffle=True):
            inputs, targets, masks, char_inputs = batch
            err_ori, err_adv, corr, num = train_fn(inputs, targets, masks,
                                                   char_inputs)
            train_err_ori += err_ori * inputs.shape[0]
            train_err_adv += err_adv * inputs.shape[0]
            train_corr += corr
            train_total += num
            train_inst += inputs.shape[0]
            train_batches += 1
            time_ave = (time.time() - start_time) / train_batches
            time_left = (num_batches - train_batches) * time_ave

            # update log
            if train_batches % (num_batches // 10) == 0:
                log_info = 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time left: %.2fs\n' % (
                    min(train_batches * batch_size, num_data), num_data,
                    train_err_ori / train_inst, train_err_adv / train_inst,
                    train_corr * 100 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()

                # save the parameter values
                #param_values = Lyrs.get_all_param_values(bilstm_bilstm_crf)
                #np.savez(epoch_save_dir + '/iter%d.npz' % train_batches, *param_values)

        # save the parameter values
        param_values = Lyrs.get_all_param_values(
            word_layer) + Lyrs.get_all_param_values(
                char_layer) + Lyrs.get_all_param_values(bilstm_bilstm_crf)
        np.savez(epoch_save_dir + '/final.npz', *param_values)

        # update training log after each epoch
        assert train_inst == num_data
        print 'train: %d/%d L_ori: %.4f, L_adv: %.4f, acc: %.2f%%, time: %.2fs' % (
            min(train_batches * batch_size, num_data), num_data,
            train_err_ori / train_inst, train_err_adv / train_inst,
            train_corr * 100 / train_total, time.time() - start_time)

        # evaluate performance on dev data
        dev_err = 0.0
        dev_corr = 0.0
        dev_total = 0
        dev_inst = 0
        for batch in utils.iterate_minibatches(X_dev,
                                               Y_dev,
                                               masks=mask_dev,
                                               char_inputs=C_dev,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                  char_inputs)
            dev_err += err * inputs.shape[0]
            dev_corr += corr
            dev_total += num
            dev_inst += inputs.shape[0]
            if output_predict:
                output_file = eval_dir + '/dev%d' % epoch
                utils.output_predictions(predictions,
                                         targets,
                                         masks,
                                         output_file,
                                         label_alphabet,
                                         is_flattened=False)

        print 'dev loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            dev_err / dev_inst, dev_corr, dev_total,
            dev_corr * 100 / dev_total)

        #update_loss = False
        update_acc = False
        if best_acc.min() > dev_corr / dev_total:
            stop_count += 1
        else:
            stop_count = 0
            if best_acc.min() < dev_corr / dev_total:
                update_acc = True
                idx_to_update = best_acc.argmin()
                best_acc[idx_to_update] = dev_corr / dev_total
                best_epoch_acc[idx_to_update] = epoch

        # evaluate on test data
        test_err = 0.0
        test_corr = 0.0
        test_total = 0
        test_inst = 0
        for batch in utils.iterate_minibatches(X_test,
                                               Y_test,
                                               masks=mask_test,
                                               char_inputs=C_test,
                                               batch_size=batch_size):
            inputs, targets, masks, char_inputs = batch
            err, corr, num, predictions = eval_fn(inputs, targets, masks,
                                                  char_inputs)
            test_err += err * inputs.shape[0]
            test_corr += corr
            test_total += num
            test_inst += inputs.shape[0]
            if output_predict:
                output_file = eval_dir + '/test%d' % epoch
                utils.output_predictions(predictions,
                                         targets,
                                         masks,
                                         output_file,
                                         label_alphabet,
                                         is_flattened=False)

        # print out test result
        if stop_count > 0:
            print '(cf.',
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            test_err / test_inst, test_corr, test_total,
            test_corr * 100 / test_total),
        if output_predict and exp_mode in ['ner', 'chunk']:
            stdout = subprocess.check_output([eval_script],
                                             stdin=open(output_file))
            f1_score = stdout.split("\n")[1].split()[7]  # note: still a string at this point
            print ", f1:", f1_score
        else:
            print
        sys.stdout.flush()

        if update_acc:
            best_acc_test_err[idx_to_update] = test_err
            best_acc_test_corr[idx_to_update] = test_corr

        # stop early if dev accuracy has not improved for 'patience' epochs in a row.
        if stop_count == patience:
            break

        # re-compile a function with new learning rate for training
        if update_algo not in ['adam', 'adadelta']:
            if decay_rate >= 0:
                lr = learning_rate / (1.0 + epoch * decay_rate)
            else:
                if stop_count > 0 and stop_count % 3 == 0:
                    learning_rate /= 2.0
                    lr = learning_rate
            updates = utils.create_updates(loss_train,
                                           params,
                                           update_algo,
                                           lr,
                                           momentum=momentum)
            train_fn = theano.function(
                [input_var, target_var, mask_var, char_input_var],
                [loss_train_ori, loss_train_adv, corr_train, num_tokens],
                updates=updates)
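            # Worked example of the inverse-time decay above (illustrative
            # numbers, not from the original run): with learning_rate = 0.01
            # and decay_rate = 0.05,
            #   epoch 0  -> lr = 0.01 / (1 +  0 * 0.05) = 0.0100
            #   epoch 10 -> lr = 0.01 / (1 + 10 * 0.05) ~ 0.0067
            #   epoch 20 -> lr = 0.01 / (1 + 20 * 0.05) = 0.0050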

    # print best performance on test data.
    for i in range(len(best_epoch_acc)):
        logger.info("final best acc test performance (at epoch %d)" %
                    best_epoch_acc[i])
        print 'test loss: %.4f, corr: %d, total: %d, acc: %.2f%%' % (
            best_acc_test_err[i] / test_inst, best_acc_test_corr[i],
            test_total, best_acc_test_corr[i] * 100 / test_total)
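
# Reload sketch (not from the original script; load_final_params is a
# hypothetical helper name). It assumes np and Lyrs are the numpy and
# lasagne.layers aliases used above and that the same word_layer / char_layer /
# bilstm_bilstm_crf graph has been rebuilt: the arrays saved to final.npz can
# be pushed back with set_all_param_values, split in the same order in which
# they were concatenated.
def load_final_params(epoch_save_dir, word_layer, char_layer, bilstm_bilstm_crf):
    f = np.load(epoch_save_dir + '/final.npz')
    saved = [f['arr_%d' % i] for i in range(len(f.files))]
    n_word = len(Lyrs.get_all_params(word_layer))
    n_char = len(Lyrs.get_all_params(char_layer))
    Lyrs.set_all_param_values(word_layer, saved[:n_word])
    Lyrs.set_all_param_values(char_layer, saved[n_word:n_word + n_char])
    Lyrs.set_all_param_values(bilstm_bilstm_crf, saved[n_word + n_char:])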
Beispiel #55
    def build_treatment_model(self, n_vars, **kwargs):

        input_vars = TT.matrix()
        instrument_vars = TT.matrix()
        targets = TT.vector()

        inputs = layers.InputLayer((None, n_vars), input_vars)
        inputs = layers.DropoutLayer(inputs, p=0.2)

        dense_layer = layers.DenseLayer(inputs,
                                        2 * kwargs['dense_size'],
                                        nonlinearity=nonlinearities.rectify)
        dense_layer = layers.batch_norm(dense_layer)
        dense_layer = layers.DropoutLayer(dense_layer, p=0.2)

        for _ in xrange(kwargs['n_dense_layers'] - 1):
            dense_layer = layers.DenseLayer(
                dense_layer,
                kwargs['dense_size'],
                nonlinearity=nonlinearities.rectify)
            dense_layer = layers.batch_norm(dense_layer)

        self.treatment_output = layers.DenseLayer(
            dense_layer, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.treatment_output)

        prediction = layers.get_output(self.treatment_output,
                                       deterministic=False)
        test_prediction = layers.get_output(self.treatment_output,
                                            deterministic=True)

        l2_cost = regularization.regularize_network_params(
            self.treatment_output, regularization.l2)
        loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

        params = layers.get_all_params(self.treatment_output, trainable=True)
        param_updates = updates.adadelta(loss, params)

        self._train_fn = theano.function(
            [input_vars, targets, instrument_vars],
            loss,
            updates=param_updates)

        self._loss_fn = theano.function(
            [input_vars, targets, instrument_vars],
            loss)

        self._output_fn = theano.function(
            [input_vars],
            test_prediction)

        return init_params
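
# gmm_loss above is a helper from the surrounding project; below is a minimal
# hypothetical stand-in (not the repo's implementation) built on the usual
# instrumental-variable moment condition E[z * (y - f(x))] = 0 with an
# identity weighting matrix.
def gmm_loss_sketch(prediction, targets, instruments):
    # residuals of the treatment model, one per example
    residuals = targets - prediction.flatten()
    # one sample moment per instrument column: mean_i z_ij * e_i
    moments = TT.mean(instruments * residuals.dimshuffle(0, 'x'), axis=0)
    # identity-weighted GMM objective: sum of squared moments
    return TT.sum(moments ** 2)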
Beispiel #56
    def params(self):
        return get_all_params(self.l_out, trainable=True)

    def build_graph(self):
        print 'building models...'
        y_gold = T.lvector('y_gold')  # index of the correct action from oracle
        z_gold = T.lvector('z_gold')  # index of the correct label from oracle
        sidx = T.lvector('sidx')  # sentence ids of the batch
        tidx = T.lmatrix('tidx')  # token ids in each sentence of the batch
        valid = T.fmatrix('valid')  # valid action mask

        self.step = theano.shared(np.array(0.).astype(theano.config.floatX),
                                  name='step')

        lr = self.args.learn_rate * self.args.decay**T.floor(self.step / 2000.)

        self.actor, self.labeler, self.taggers = self.get_actor(sidx,
                                                                tidx,
                                                                valid,
                                                                avg=False)
        self.actor_avg, self.labeler_avg, self.taggers_avg = self.get_actor(
            sidx, tidx, valid, avg=True)

        # averaged model for prediction
        actor_prob, labeler_prob = L.get_output(
            [self.actor_avg, self.labeler_avg], deterministic=True)
        actor_rest = actor_prob * valid  # mask the probabilities of invalid actions to 0
        actor_rest_normalized = actor_rest / T.sum(
            actor_rest, axis=1, keepdims=True)

        preds_avg = []
        if self.args.aux_tagger:
            for name in self.args.target_feats:
                prob_avg = L.get_output(self.taggers_avg[name],
                                        deterministic=True)  # (100, 25)
                pred_avg = T.argmax(prob_avg, axis=1)  # (100, )
                preds_avg.append(pred_avg)

        self.actor_predict_avg = theano.function(
            [sidx, tidx, valid],
            [actor_rest_normalized, labeler_prob] + preds_avg,
            on_unused_input='ignore',
            allow_input_downcast=True)

        # training
        # only compile if in training mode (has training data)
        if self.args.train:
            # parser objectives
            y_prob, z_prob = L.get_output([self.actor, self.labeler],
                                          deterministic=False)
            y_xent = categorical_crossentropy(y_prob, y_gold)
            z_xent = categorical_crossentropy(z_prob, z_gold)

            y_pred = T.argmax(y_prob, 1)
            z_pred = T.argmax(z_prob, 1)
            z_mask = T.eq(y_pred, y_gold) & T.lt(y_gold, self.args.idsh)

            acc_y = T.mean(T.cast(T.eq(y_pred, y_gold), theano.config.floatX))
            acc_z = T.cast(T.sum(T.eq(z_pred, z_gold) * z_mask) + 1., theano.config.floatX)\
                        / T.cast(T.sum(z_mask) + 1., theano.config.floatX)

            cost = T.mean(y_xent) + T.mean(z_xent * z_mask)

            params = L.get_all_params([self.actor, self.labeler] +
                                      self.taggers.values(),
                                      trainable=True)
            avg_params = L.get_all_params([self.actor_avg, self.labeler_avg] +
                                          self.taggers_avg.values(),
                                          trainable=True)

            # accuracy of all auxiliary tasks
            acc_w = acc_y - acc_y  # symbolic zero of matching dtype; accumulates the mean aux-tagger accuracy below
            # joint objective for aux tagger
            if self.args.aux_tagger:
                # tags of s0 are the targets
                for name in self.args.target_feats:
                    w_gold = self.manager.feats[name].data[
                        sidx.dimshuffle(0, 'x'), tidx][:, 0]  # (100, )
                    w_prob = L.get_output(self.taggers[name],
                                          deterministic=False)
                    w_xent = categorical_crossentropy(w_prob, w_gold)
                    w_mask = T.neq(w_gold, 0)
                    cost += self.args.aux_ratio * T.mean(w_xent * w_mask)

                    w_pred = T.argmax(w_prob, axis=1)
                    acc = T.cast(T.sum(T.eq(w_pred, w_gold) * w_mask) + 1., theano.config.floatX)\
                            / T.cast(T.sum(w_mask) + 1., theano.config.floatX)
                    acc_w += acc / len(self.args.target_feats)

            reg = regularize_network_params(
                L.get_all_layers([self.actor, self.labeler] +
                                 self.taggers.values()), l2)
            cost += self.args.reg_rate * reg

            updates = lasagne.updates.momentum(cost, params, lr,
                                               self.args.momentum)
            updates = apply_moving_average(params, avg_params, updates,
                                           self.step, 0.9999)

            self.train_parser = theano.function(
                [y_gold, z_gold, sidx, tidx, valid],
                [acc_y, acc_z, acc_w, cost],
                updates=updates,
                on_unused_input='ignore',
                allow_input_downcast=True)
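
# apply_moving_average above is a project helper; below is a hedged sketch of
# the Polyak/exponential moving average it presumably applies (hypothetical:
# the helper name, the in-place step increment and the exact blending are
# assumptions, not the original code).
from collections import OrderedDict

def apply_moving_average_sketch(params, avg_params, updates, step, decay):
    new_updates = OrderedDict(updates)
    for p, avg_p in zip(params, avg_params):
        # blend the freshly updated value into its running average:
        # avg <- decay * avg + (1 - decay) * new_value
        new_updates[avg_p] = decay * avg_p + (1.0 - decay) * new_updates[p]
    new_updates[step] = step + 1.0  # advance the shared step counter used for lr decay
    return new_updates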
Beispiel #58
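# sigmoid_cross_entropy_with_logits_v1 is defined elsewhere in this project;
# below is a minimal hypothetical stand-in (not the original) using the
# standard numerically stable form, where `logits` are the pre-sigmoid
# discriminator outputs and `target` is 0 (fake) or 1 (real).
import theano.tensor as T

def sigmoid_cross_entropy_with_logits_v1(logits, target):
    # elementwise: max(x, 0) - x * z + log(1 + exp(-|x|))
    return T.maximum(logits, 0) - logits * target + T.log(1 + T.exp(-abs(logits)))
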
##### On passing real images to the discriminator
D_loss_real = sigmoid_cross_entropy_with_logits_v1(D_real, 1)
##### On passing fake images to the discriminator
D_loss_fake = sigmoid_cross_entropy_with_logits_v1(D_fake, 0)

#### Taking mean of the two
D_loss = D_loss_real.mean() + D_loss_fake.mean()

# Generator loss (the discriminator is frozen, i.e. not updated)

#### On passing fake images to the discriminator
G_loss_fake = sigmoid_cross_entropy_with_logits_v1(D_fake, 1)
G_loss = G_loss_fake.mean()

# Find the parameters to train
D_theta = LL.get_all_params(dis_layers[-1], trainable=True)
G_theta = LL.get_all_params(gen_layers[-1], trainable=True)

# Update the parameters with the Adam optimizer (default hyperparameters)
D_solver = lasagne.updates.adam(D_loss, D_theta)
G_solver = lasagne.updates.adam(G_loss, G_theta)

##### Training functions for both networks
D_train_fn = th.function(inputs=[Dis_input, Gen_input, input_labels],
                         outputs=[D_loss],
                         updates=D_solver)
G_train_fn = th.function(inputs=[Gen_input, input_labels],
                         outputs=[G_loss],
                         updates=G_solver)

i = 0
Beispiel #59
    input_data = T.ftensor3('input_data')
    input_mask = T.fmatrix('input_mask')
    target_data = T.imatrix('target_data')
    target_mask = T.fmatrix('target_mask')
    network_output = deep_prj_lstm_model_v1(input_var=input_data,
                                            mask_var=input_mask,
                                            num_inputs=input_dim,
                                            num_outputs=output_dim,
                                            num_layers=args.num_layers,
                                            num_units=args.num_units,
                                            num_prjs=args.num_prjs,
                                            grad_clipping=args.grad_clipping,
                                            dropout=args.dropout)

    network = network_output
    network_params = get_all_params(network, trainable=True)
    network_reg_params = get_all_params(network,
                                        trainable=True,
                                        regularizable=True)
    param_count = count_params(network, trainable=True)
    print('Number of parameters of the network: {:.2f}M'.format(
        float(param_count) / 1000000))
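    # Hedged sketch (not in the original snippet): the regularizable
    # parameters collected above are typically folded into the training cost
    # as an L2 weight-decay term; the 1e-4 coefficient here is illustrative.
    l2_penalty = 1e-4 * sum(T.sum(p ** 2) for p in network_reg_params)
    # ... l2_penalty would then be added to the training loss defined later.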

    ######################
    # reload model param #
    ######################
    if args.reload_model:
        print('Loading model: {}'.format(args.reload_model))
        with open(args.reload_model, 'rb') as f:
            [
                pretrain_network_params_val, pretrain_update_params_val,
Beispiel #60
def train_class(ds, paths, funcs, cla, updates, param_arch, param_cost, param_updates, param_train):

    # creates a log file containing the training behaviour,
    # saves it to file
    formatter = logging.Formatter('%(asctime)s %(message)s', "%Y-%m-%d %H:%M:%S")
    logger = logging.getLogger('log_training')
    if 'start_from_epoch' in param_train:
        name_tmp = 'training_from_epoch=%04d.log' % (param_train['start_from_epoch'])
    else:
        name_tmp = 'training.log'
    path_tmp = os.path.join(paths['exp'], name_tmp)
    if not os.path.isfile(path_tmp):
        file_handler = logging.FileHandler(path_tmp, mode='w')
    else:
        raise Exception('[e] the log file %s already exists!' % name_tmp)
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)
    # and shows it to screen
    console_handler = logging.StreamHandler()
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    logger.setLevel(logging.INFO)
    logger.info("Training stats:")
    
    cost_train_avg = []
    cla_train_avg = []
    reg_train_avg = []
    grad_norm_train_avg = []
    acc_train_avg = []
    cost_test_avg = []
    grad_norm_test_avg = []
    acc_test_avg = []

    for i_ in xrange(param_train['n_epochs']):

        if 'start_from_epoch' in param_train:
            i = i_ + param_train['start_from_epoch']
        else:
            i = i_
        
        cost_train = []
        cla_train = []
        reg_train = []
        cost_test = []
        grad_norm_train = []
        grad_norm_test = []
        acc_train = []
        acc_test = []

        tic = time.time()
        
        for x in ds.train_iter():
            tmp = funcs['train'](*x)
            cost_train.append(tmp[0])
            cla_train.append(tmp[1])
            reg_train.append(tmp[2])
            grad_norm_train.append(tmp[3])
            acc_train.append(tmp[4])

        if ((i + 1) % param_train['freq_viz_train']) == 0:
            cost_train_avg.append(np.mean(cost_train))
            cla_train_avg.append(np.mean(cla_train))
            reg_train_avg.append(np.mean(reg_train))
            grad_norm_train_avg.append(np.mean(grad_norm_train))
            acc_train_avg.append(np.mean(acc_train))
            string = "[TRN] epoch = %03i, cost = %3.2e (cla = %3.2e, reg = %3.2e), |grad| = %.2e, acc = %02.2f%% (%03.2fs)" % \
                (i + 1, cost_train_avg[-1], cla_train_avg[-1], reg_train_avg[-1], grad_norm_train_avg[-1], 100.*acc_train_avg[-1], time.time() - tic)
            logger.info(string)
            
        if ((i + 1) % param_train['freq_viz_test']) == 0:

            tic = time.time()

            for x in ds.test_fwd():
                tmp = funcs['fwd'](*x)
                cost_test.append(tmp[0])
                grad_norm_test.append(tmp[1])
                acc_test.append(tmp[2])
          
            cost_test_avg.append(np.mean(cost_test))
            grad_norm_test_avg.append(np.mean(grad_norm_test))
            acc_test_avg.append(np.mean(acc_test))
            string = "[TST] epoch = %03i, cost = %3.2e, |grad| = %.2e, acc = %02.2f%% (%03.2fs)" % \
                (i + 1, cost_test_avg[-1],  grad_norm_test_avg[-1],  100.*acc_test_avg[-1],  time.time() - tic)
            logger.info(string)
            
        if param_train['flag_save_pkls']:
            if ((i + 1) % param_train['freq_save_pkls']) == 0:
                if not os.path.isdir(paths['pkls']):
                    os.makedirs(paths['pkls'])
                name_dump = "%s/epoch=%04d.pkl" % (paths['pkls'], i + 1)
                keys_net = LL.get_all_params(cla)
                values_net = LL.get_all_param_values(cla)  # all param values, aligned with keys_net above
                keys_updates = [k for k in updates.keys()]
                values_updates = [k.get_value() for k in updates.keys()]
                tmp = [paths, param_arch, param_cost, param_updates, param_train,
                       cost_train_avg, acc_train_avg, cost_test_avg, acc_test_avg,
                       keys_net, values_net, keys_updates, values_updates]
                with open(name_dump, 'wb') as f:
                    cPickle.dump(tmp, f)
            
        if param_train['flag_save_preds']:
            if ((i + 1) % param_train['freq_save_preds']) == 0:
                for j, k in enumerate(ds.test_fwd()):
                    path_dump = os.path.join(paths['preds'], "epoch=%04d" % (i + 1))
                    if not os.path.isdir(path_dump):
                        os.makedirs(path_dump)
                    name_dump = os.path.join(path_dump, ds.names_test[j])
                    tmp = funcs['pred'](*k)
                    scipy.io.savemat(name_dump, {'pred': tmp[0]})
    
    return cost_train_avg, acc_train_avg, cost_test_avg, acc_test_avg
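
# Restore sketch (not part of the original train_class; restore_checkpoint is
# a hypothetical helper): assuming the same `cla` network and `updates`
# dictionary have been rebuilt, a checkpoint written by the block above can be
# loaded back roughly as follows.
def restore_checkpoint(name_dump, cla, updates):
    with open(name_dump, 'rb') as f:
        state = cPickle.load(f)
    values_net = state[10]      # saved network parameter values
    values_updates = state[12]  # saved optimizer shared-variable values
    LL.set_all_param_values(cla, values_net)
    for shared, value in zip(updates.keys(), values_updates):
        shared.set_value(value)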