Example #1
File: model.py Project: xenx/speech
    def __init__(self, sound_shape, num_units, main_layer_class, loss_func, updates_func):
        # input tensor (batch size, number of recordings, time, frequency)
        input_X = T.tensor4("X")

        # network
        input_layer = InputLayer(shape=(None, 3) + sound_shape, input_var=input_X.swapaxes(2, 3))
        all_output = main_layer_class(input_layer, sound_shape, num_units)  # for loss
        vector_output = ReshapeLayer(all_output, (-1, 1, num_units))  # for use

        # network predictions
        all_predicted = get_output(all_output)  # for loss
        vector_predicted = get_output(vector_output)  # for use

        # loss function
        loss = loss_func(all_predicted)

        # compute the updated weights with one gradient step
        trainable_weights = get_all_params(all_output, trainable=True)
        updates_sgd = updates_func(loss, trainable_weights)

        # function that trains the network for one step and returns the loss value
        self.fit = theano.function([input_X], loss, updates=updates_sgd)

        # function that returns the voice vector
        self.predict = theano.function([input_X], vector_predicted)

        self.all_output = all_output
        self.vector_output = vector_output
        self.all_predicted = all_predicted
        self.vector_predicted = vector_predicted
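
A minimal usage sketch for the wrapper above, hedged: the class name Model and the builder build_voice_layers are assumptions, not part of the snippet.

# Hypothetical usage; argument values and helper names are illustrative only.
import numpy as np
import lasagne

def build_voice_layers(incoming, sound_shape, num_units):
    # hypothetical stand-in for main_layer_class: one dense layer per record
    # (num_leading_axes=2 keeps the batch and record axes; requires Lasagne >= 0.2.dev)
    return lasagne.layers.DenseLayer(incoming, num_units=num_units, num_leading_axes=2)

model = Model(                                     # assumed name of the class whose __init__ is shown above
    sound_shape=(40, 100),                         # (frequency, time), since the input is swapaxes(2, 3) first
    num_units=64,
    main_layer_class=build_voice_layers,
    loss_func=lambda predicted: predicted.mean(),  # placeholder loss on the network output
    updates_func=lambda loss, params: lasagne.updates.sgd(loss, params, learning_rate=0.01),
)

batch = np.zeros((8, 3, 100, 40), dtype='float32')  # (batch, records, time, frequency), as commented above
step_loss = model.fit(batch)            # one training step, returns the loss value
voice_vectors = model.predict(batch)    # per-record voice vectors
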
Example #2
def create_iter_funcs_valid(l_out, bs=None, N=50, mc_dropout=False):
    X = T.tensor4('X')
    y = T.ivector('y')
    X_batch = T.tensor4('X_batch')
    y_batch = T.ivector('y_batch')

    if not mc_dropout:
        y_hat = layers.get_output(l_out, X, deterministic=True)
    else:
        if bs is None:
            raise ValueError('a fixed batch size is required for mc dropout')
        X_repeat = T.extra_ops.repeat(X, N, axis=0)
        y_sample = layers.get_output(
            l_out, X_repeat, deterministic=False)

        sizes = [X_repeat.shape[0] / X.shape[0]] * bs
        y_sample_split = T.as_tensor_variable(
            T.split(y_sample, sizes, bs, axis=0))
        y_hat = T.mean(y_sample_split, axis=1)

    valid_loss = T.mean(
        T.nnet.categorical_crossentropy(y_hat, y))
    valid_acc = T.mean(
        T.eq(y_hat.argmax(axis=1), y))

    valid_iter = theano.function(
        inputs=[theano.Param(X_batch), theano.Param(y_batch)],
        outputs=[valid_loss, valid_acc],
        givens={
            X: X_batch,
            y: y_batch,
        },
    )

    return valid_iter
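
A brief usage sketch, hedged: the toy network below is only illustrative; create_iter_funcs_valid just needs some Lasagne output layer l_out.

# Illustrative usage; the toy network here is an assumption, not part of the original project.
import numpy as np
import lasagne
from lasagne import layers

l_in = layers.InputLayer((None, 1, 28, 28))
l_out = layers.DenseLayer(layers.dropout(l_in, p=0.5), num_units=10,
                          nonlinearity=lasagne.nonlinearities.softmax)

valid_iter = create_iter_funcs_valid(l_out)                             # deterministic pass, dropout off
mc_valid_iter = create_iter_funcs_valid(l_out, bs=32, mc_dropout=True)  # averages N stochastic passes

X_batch = np.random.rand(32, 1, 28, 28).astype('float32')
y_batch = np.random.randint(0, 10, size=32).astype('int32')
loss, acc = valid_iter(X_batch, y_batch)
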
Example #3
def test_slice_layer():
    from lasagne.layers import SliceLayer, InputLayer, get_output_shape,\
        get_output
    from numpy.testing import assert_array_almost_equal as aeq
    in_shp = (3, 5, 2)
    l_inp = InputLayer(in_shp)
    l_slice_ax0 = SliceLayer(l_inp, axis=0, indices=0)
    l_slice_ax1 = SliceLayer(l_inp, axis=1, indices=slice(3, 5))
    l_slice_ax2 = SliceLayer(l_inp, axis=-1, indices=-1)

    x = np.arange(np.prod(in_shp)).reshape(in_shp).astype('float32')
    x1 = x[0]
    x2 = x[:, 3:5]
    x3 = x[:, :, -1]

    assert get_output_shape(l_slice_ax0) == x1.shape
    assert get_output_shape(l_slice_ax1) == x2.shape
    assert get_output_shape(l_slice_ax2) == x3.shape

    aeq(get_output(l_slice_ax0, x).eval(), x1)
    aeq(get_output(l_slice_ax1, x).eval(), x2)
    aeq(get_output(l_slice_ax2, x).eval(), x3)

    # test slicing None dimension
    in_shp = (2, None, 2)
    l_inp = InputLayer(in_shp)
    l_slice_ax1 = SliceLayer(l_inp, axis=1, indices=slice(3, 5))
    assert get_output_shape(l_slice_ax1) == (2, None, 2)
    aeq(get_output(l_slice_ax1, x).eval(), x2)
Example #4
    def init_model(self):
        print('Initializing model...')
        ra_input_var = T.tensor3('raw_audio_input')
        mc_input_var = T.tensor3('melody_contour_input')
        target_var = T.imatrix('targets')
        network = self.build_network(ra_input_var, mc_input_var)
        prediction = layers.get_output(network)
        prediction = T.clip(prediction, 1e-7, 1.0 - 1e-7)
        loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
        loss = loss.mean()
        params = layers.get_all_params(network, trainable=True)
        updates = lasagne.updates.sgd(loss, params, learning_rate=0.02)

        test_prediction = layers.get_output(network, deterministic=True)
        test_loss = lasagne.objectives.categorical_crossentropy(test_prediction,
                                                                target_var)
        test_loss = test_loss.mean()
        test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), T.argmax(target_var, axis=1)),
                          dtype=theano.config.floatX)

        print('Building functions...')
        self.train_fn = theano.function([ra_input_var, mc_input_var, target_var], 
                                        [loss, prediction], 
                                        updates=updates, 
                                        on_unused_input='ignore')
        self.val_fn = theano.function([ra_input_var, mc_input_var, target_var], 
                                        [test_loss, test_acc, test_prediction], 
                                        on_unused_input='ignore')
        self.run_fn = theano.function([ra_input_var, mc_input_var],
                                        [prediction],
                                        on_unused_input='ignore')
Example #5
def test():
    w, h, c = 8, 8, 1
    encoder, decoder = build_convnet_deep(w=w, h=h, c=c)

    encoder = layers_from_list_to_dict(encoder)
    decoder = layers_from_list_to_dict(decoder)
    print(encoder.keys(), decoder.keys())
    x = encoder["input"].input_var

    f = theano.function([x], 
            [layers.get_output(encoder["z_mean"], x),
             layers.get_output(encoder["z_log_sigma"], x)]
    )
    X = np.random.uniform(size=(1, c, w, h)).astype(np.float32)
    m, s = f(X)
    print(m.shape, s.shape)


    z = decoder["input"].input_var
    D = (decoder["input"].output_shape)[1]
    Z = np.random.uniform(size=(1, D)).astype(np.float32)
    f = theano.function([z], layers.get_output(decoder["output"], z))
    print(f(Z).shape)

    z = layers.get_output(encoder["z_mean"], x)
    f = theano.function([x], 
            layers.get_output(decoder["output"], {encoder["input"]: x, decoder["input"]: z}),
            givens={decoder["input"].input_var: z},
            on_unused_input='ignore')
    print(f(X).shape)
Example #6
    def __init__(self, dims, nonlinearities=None, dropouts=None,
                 update_fn=None, batch_norm=False,
                 loss_type='cosine_margin', margin=0.8):
        """Initialize a Siamese neural network

        Parameters:
        -----------
        update_fn: theano function with 2 arguments (loss, params)
            Update scheme, defaults to adadelta
        batch_norm: bool
            Do batch normalisation on the first layer, defaults to False
        """
        assert len(dims) >= 3, 'Not enough dimensions'
        if dropouts is not None:
            dropouts = copy.copy(dropouts)
            assert len(dropouts) == len(dims) - 1
            dropouts.append(0)
        else:
            dropouts = [0] * len(dims)
        if nonlinearities is None:
            nonlinearities = [nl.sigmoid] * (len(dims) - 1)
        else:
            assert len(nonlinearities) == len(dims) - 1
        if update_fn is None:
            update_fn = lasagne.updates.adadelta
        self.input_var1 = T.matrix('inputs1')
        self.input_var2 = T.matrix('inputs2')
        self.target_var = T.ivector('targets')
        # input layer
        network1 = layers.InputLayer((None, dims[0]), input_var=self.input_var1)
        network2 = layers.InputLayer((None, dims[0]), input_var=self.input_var2)
        if dropouts[0]:
            network1 = layers.DropoutLayer(network1, p=dropouts[0])
            network2 = layers.DropoutLayer(network2, p=dropouts[0])
        # hidden layers
        for dim, dropout, nonlin in zip(dims[1:], dropouts[1:], nonlinearities):
            network1 = layers.DenseLayer(network1, num_units=dim,
                                         W=lasagne.init.GlorotUniform(),
                                         nonlinearity=nonlin)
            network2 = layers.DenseLayer(network2, num_units=dim,
                                         W=network1.W, b=network1.b,
                                         nonlinearity=nonlin)
            if batch_norm:
                network1 = layers.batch_norm(network1)
                network2 = layers.batch_norm(network2)
            if dropout:
                network1 = layers.DropoutLayer(network1, p=dropout)
                network2 = layers.DropoutLayer(network2, p=dropout)
        self.network = [network1, network2]
        self.params = layers.get_all_params(network1, trainable=True)

        # util functions, completely stolen from Lasagne example
        self.prediction1 = layers.get_output(network1)
        self.prediction2 = layers.get_output(network2)
        # if non-deterministic:
        self.test_prediction1 = layers.get_output(network1, deterministic=True)
        self.test_prediction2 = layers.get_output(network2, deterministic=True)

        self.change_loss(loss_type, margin)
        self.change_update(update_fn)
def generate_theano_func(args, network, penalty, input_dict, target_var):

    prediction = get_output(network, input_dict)

    # loss = T.mean( target_var * ( T.log(target_var) - prediction ))
    loss = T.mean(categorical_crossentropy(prediction, target_var))
    # loss += 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(network) )
    # penalty = sum ( T.sum(lstm_param**2) for lstm_param in lstm_params )
    # penalty = regularize_layer_params(l_forward_1_lstm, l2)
    # penalty = T.sum(lstm_param**2 for lstm_param in lstm_params)
    # penalty = 0.0001 * sum (T.sum(layer_params ** 2) for layer_params in get_all_params(l_forward_1) )

    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"

    test_prediction = get_output(network, input_dict, deterministic=True)
    # test_prediction = get_output(network, deterministic=True)
    # test_loss = T.mean( target_var * ( T.log(target_var) - test_prediction))
    test_loss = T.mean(categorical_crossentropy(test_prediction, target_var))

    train_fn = theano.function(
        [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
        loss,
        updates=updates,
        allow_input_downcast=True,
    )

    if args.task == "sts":
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_prediction],
            allow_input_downcast=True,
        )

    elif args.task == "ent":
        # test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var), dtype=theano.config.floatX)
        test_acc = T.mean(categorical_accuracy(test_prediction, target_var))
        val_fn = theano.function(
            [input1_var, input1_mask_var, input2_var, input2_mask_var, target_var],
            [test_loss, test_acc],
            allow_input_downcast=True,
        )

    return train_fn, val_fn
Example #8
    def __init__(self):
        self.input_X = theano.tensor.tensor4("X")
        # self.X_reshaped = self.input_X.dimshuffle([0, 3, 1, 2])
        self.target_y = theano.tensor.vector("target Y", dtype='int32')

        # Architecture
        in_0 = Input(shape=[None, 1, 64, 64], input_var=self.input_X)
        in_downsample = Pool(in_0, [4, 4])
        conv_0 = Conv(in_downsample, 64, (2, 2), nonlinearity=sigmoid)
        pool_0 = Pool(conv_0, (3, 3))
        self.out = Dense(pool_0, num_units=7, nonlinearity=softmax)

        # load the last saved state (if the file exists)
        self.path = "{}/{}.npz".format(os.getcwd(), self.__class__.__name__)
        if os.path.exists(self.path):
            with np.load(self.path) as f:
                param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            lasagne.layers.set_all_param_values(self.out, param_values)

        self.predict_net = theano.compile.function([self.input_X], get_output(self.out))

        # For training (can be removed)
        self.all_weights = lasagne.layers.get_all_params(self.out)

        self.y_predicted = get_output(self.out)
        self.loss = lasagne.objectives.categorical_crossentropy(self.y_predicted, self.target_y).mean()
        self.accuracy = lasagne.objectives.categorical_accuracy(self.y_predicted, self.target_y).mean()

        self.updates = lasagne.updates.adadelta(self.loss, self.all_weights, learning_rate=0.01)
        self.train_fun = theano.function([self.input_X, self.target_y], [self.loss, self.accuracy], updates=self.updates, allow_input_downcast=True)
        self.accuracy_fun = theano.function([self.input_X, self.target_y], self.accuracy, allow_input_downcast=True)
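
A hedged usage sketch; only __init__ is shown above, so the class name Net below is an assumption.

# Hypothetical usage; the class name is assumed.
import numpy as np

net = Net()
X = np.zeros((16, 1, 64, 64), dtype='float32')         # matches the Input layer shape [None, 1, 64, 64]
y = np.random.randint(0, 7, size=16).astype('int32')   # 7 classes, per the final softmax Dense layer

step_loss, step_acc = net.train_fun(X, y)   # one adadelta update
probs = net.predict_net(X)                  # class probabilities
acc = net.accuracy_fun(X, y)
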
Example #9
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer     = InputLayer(shape=(None, 12, 64, 64), input_var=input_var) #InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    layer     = DimshuffleLayer(layer, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution, then batch normalisation, then activation layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = Conv3DDNNLayer(incoming=layer, num_filters=1, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=sigmoid)
    layer_prediction  = layer

    # Loss
    prediction           = get_output(layer_prediction)
    loss                 = binary_crossentropy(prediction[:,0,:,:,:], target_var).mean()

    #Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True)
    test_loss            = binary_crossentropy(test_prediction[:,0,:,:,:], target_var).mean()

    return test_prediction, prediction, loss, params
    def dist_info_sym(self, obs_var, state_info_vars):
        n_batches, n_steps = obs_var.shape[:2]
        obs_var = obs_var.reshape((n_batches, n_steps, -1))
        if self.state_include_action:
            prev_action_var = state_info_vars["prev_action"]
            all_input_var = TT.concatenate(
                [obs_var, prev_action_var],
                axis=2
            )
        else:
            all_input_var = obs_var

        if self.feature_network is None:
            return dict(
                prob=L.get_output(
                    self.prob_network.output_layer,
                    {self.l_input: all_input_var}
                )
            )
        else:
            flat_input_var = TT.reshape(all_input_var, (-1, self.input_dim))
            return dict(
                prob=L.get_output(
                    self.prob_network.output_layer,
                    {self.l_input: all_input_var, self.feature_network.input_layer: flat_input_var}
                )
            )
def test_reapply():
    
    l_in1 = InputLayer([None,10],T.zeros([5,10]))
    l_d1 = DenseLayer(l_in1,20)
    l_d2 = DenseLayer(l_in1,30)
    l_cat = ConcatLayer([l_d1,l_d2])
    l_d3 = DenseLayer(l_cat,20)
    
    l_in2 = InputLayer([None,10],T.zeros([5,10]))
    
    new_l_d3 = reapply(l_d3, {l_in1: l_in2})  # reapply the whole network to a new input layer
    get_output(new_l_d3).eval()

    l_in3 = InputLayer([None,30],T.zeros([5,30]))
    
    new_l_d3 = reapply(l_d3,{l_d2:l_in3})
    
    #multiple inputs
    new_l_cat = reapply(l_cat,{l_d2:ConcatLayer([l_d1,l_d2]),l_in1:l_in2})
    get_output(new_l_cat).eval()
    
    #multiple layers
    l1,l2 = reapply([l_d3,l_d2],{l_in1:l_in2})
    outs = reapply({'d3':l_d3,'d2':l_d2},{l_in1:l_in2})
    
    assert isinstance(outs,dict)
    
    outs['d3'],outs['d2']
    def __init__(self, network_description):

        signal.signal(signal.SIGINT, self.signal_handler)
        self.name = network_description['name']
        netbuilder = NetworkBuilder(network_description)
        self.shouldStopNow  = False
        # Get the Lasagne network using the network builder class, which creates an autoencoder with the specified architecture
        self.network = netbuilder.buildNetwork()
        self.encode_layer, self.encode_size = netbuilder.getEncodeLayerAndSize()
        self.t_input, self.t_target = netbuilder.getInputAndTargetVars()
        self.input_type = netbuilder.getInputType()
        self.batch_size = netbuilder.getBatchSize()
        rootLogger.info("Network: " + self.networkToStr())
        # Reconstruction is just output of the network
        recon_prediction_expression = layers.get_output(self.network)
        # Latent/Encoded space is the output of the bottleneck/encode layer
        encode_prediction_expression = layers.get_output(self.encode_layer, deterministic=True)
        # Loss for autoencoder = reconstruction loss + weight decay regularizer
        loss = self.getReconstructionLossExpression(recon_prediction_expression, self.t_target)
        weightsl2 = lasagne.regularization.regularize_network_params(self.network, lasagne.regularization.l2)
        loss += (5e-5 * weightsl2)
        params = lasagne.layers.get_all_params(self.network, trainable=True)
        # SGD with momentum + Decaying learning rate
        self.learning_rate = theano.shared(lasagne.utils.floatX(0.01))
        updates = lasagne.updates.nesterov_momentum(loss, params, learning_rate=self.learning_rate)
        # Theano functions for calculating loss, predicting reconstruction, encoding
        self.trainAutoencoder = theano.function([self.t_input, self.t_target], loss, updates=updates)
        self.predictReconstruction = theano.function([self.t_input], recon_prediction_expression)
        self.predictEncoding = theano.function([self.t_input], encode_prediction_expression)
Example #13
def get_model(input_images, input_position, input_mult, target_var):

    # number of SAX and distance between SAX slices
    #indexes = []
    #for i in range(input_position.shape[0]):
    #    indexes.append(numpy.where(input_position[i][:,0] == 0.)[0][0])
    
    # input layer with unspecified batch size
    layer     = InputLayer(shape=(None, 22, 30, 64, 64), input_var=input_images) #InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    
    # Z-score?

    # Convolution, then batch normalisation, then activation layer, with residual (shortcut) connections
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    shortcut      = layer
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = ElemwiseSumLayer([layer, shortcut])
    layer         = batch_norm(Conv3DDNNLayer(incoming=layer, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=rectify))
    layer         = Conv3DDNNLayer(incoming=layer, num_filters=22, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=sigmoid)

    layer_max     = ExpressionLayer(layer, lambda X: X.max(1), output_shape='auto')
    layer_min     = ExpressionLayer(layer, lambda X: X.min(1), output_shape='auto')
    
    layer_prediction = layer
    # image prediction
    prediction           = get_output(layer_prediction)
        
    loss                 = binary_crossentropy(prediction, target_var).mean()

    #Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True)
    test_loss            = binary_crossentropy(test_prediction, target_var).mean()

    return test_prediction, prediction, loss, params
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer((None, seq_len, input_dim),
                      input_var=theano.shared(np.random.normal(size=[batch_size, seq_len, input_dim])),
                      name='input seq')

    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')

    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell, s_prev_hid, s_in, name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({
        s_lstm_cell: s_prev_cell,
        s_lstm_hid: s_prev_hid,
        s_gru: s_prev_gru}),
        input_sequences={s_in: l_in},
        unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print (old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True), key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True), key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print (old.name, '<-', new.name)
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
    def _create_iter_funcs(self, layers, objective, update, output_type):
        y_batch = output_type('y_batch')

        output_layer = list(layers.values())[-1]
        objective_params = self._get_params_for('objective')
        obj = objective(output_layer, **objective_params)
        if not hasattr(obj, 'layers'):
            # XXX breaking the Lasagne interface a little:
            obj.layers = layers

        loss_train = obj.get_loss(None, y_batch)
        loss_eval = obj.get_loss(None, y_batch, deterministic=True)
        predict_proba = get_output(output_layer, None, deterministic=True)

        try:
            transform = get_output([v for k, v in layers.items() 
                                   if 'rmspool' in k or 'maxpool' in k][-1],
                                   None, deterministic=True)
        except IndexError:
            transform = get_output(list(layers.values())[-2], None,
                                   deterministic=True)

        if not self.regression:
            predict = predict_proba.argmax(axis=1)
            accuracy = T.mean(T.eq(predict, y_batch))
        else:
            accuracy = loss_eval

        all_params = self.get_all_params(trainable=True)
        update_params = self._get_params_for('update')
        updates = update(loss_train, all_params, **update_params)

        input_layers = [layer for layer in layers.values()
                        if isinstance(layer, InputLayer)]

        X_inputs = [theano.Param(input_layer.input_var, name=input_layer.name)
                    for input_layer in input_layers]
        inputs = X_inputs + [theano.Param(y_batch, name="y")]

        train_iter = theano.function(
            inputs=inputs,
            outputs=[loss_train],
            updates=updates,
            )
        eval_iter = theano.function(
            inputs=inputs,
            outputs=[loss_eval, accuracy],
            )
        predict_iter = theano.function(
            inputs=X_inputs,
            outputs=predict_proba,
            )
        transform_iter = theano.function(
            inputs=X_inputs,
            outputs=transform,
            )
        return train_iter, eval_iter, predict_iter, transform_iter
    def doClusteringWithKMeansLoss(self, dataset, epochs):
        '''
        Trains the autoencoder with combined kMeans loss and reconstruction loss
        At the moment does not give good results
        :param dataset: Data on which the autoencoder is trained
        :param epochs: Number of training epochs
        :return: None - (side effect) saves the trained network params and latent space in appropriate location
        '''
        batch_size = self.batch_size
        # Load the inputs in latent space produced by the pretrained autoencoder and use it to initialize cluster centers
        Z = np.load('saved_params/%s/z_%s.npy' % (dataset.name, self.name))
        quality_desc, cluster_centers = evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), 'Initial')
        rootLogger.info(quality_desc)
        # Load network parameters - code borrowed from mnist lasagne example
        with np.load('saved_params/%s/m_%s.npz' % (dataset.name, self.name)) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
            lasagne.layers.set_all_param_values(self.network, param_values, trainable=True)
        # reconstruction loss is just rms loss between input and reconstructed input
        reconstruction_loss = self.getReconstructionLossExpression(layers.get_output(self.network), self.t_target)
        # extend the network to do soft cluster assignments
        clustering_network = ClusteringLayer(self.encode_layer, dataset.getClusterCount(), cluster_centers, batch_size, self.encode_size)
        soft_assignments = layers.get_output(clustering_network)
        # k-means loss is the sum of distances from the cluster centers weighted by the soft assignments to the clusters
        kmeansLoss = self.getKMeansLoss(layers.get_output(self.encode_layer), soft_assignments, clustering_network.W, dataset.getClusterCount(), self.encode_size, batch_size)
        params = lasagne.layers.get_all_params(self.network, trainable=True)
        # total loss = reconstruction loss + lambda * kmeans loss
        weight_reconstruction = 1
        weight_kmeans = 0.1
        total_loss = weight_kmeans * kmeansLoss + weight_reconstruction * reconstruction_loss
        updates = lasagne.updates.nesterov_momentum(total_loss, params, learning_rate=0.01)
        trainKMeansWithAE = theano.function([self.t_input, self.t_target], total_loss, updates=updates)
        for epoch in range(epochs):
            error = 0
            total_batches = 0
            for batch in dataset.iterate_minibatches(self.input_type, batch_size, shuffle=True):
                inputs, targets = batch
                error += trainKMeansWithAE(inputs, targets)
                total_batches += 1
            # For every 10th epoch, update the cluster centers and print the clustering accuracy and nmi - for checking if the network
            # is actually doing something meaningful - the labels are never used for training
            if (epoch + 1) % 10 == 0:
                for i, batch in enumerate(dataset.iterate_minibatches(self.input_type, batch_size, shuffle=False)):
                    Z[i * batch_size:(i + 1) * batch_size] = self.predictEncoding(batch[0])
                quality_desc, cluster_centers = evaluateKMeans(Z, dataset.labels, dataset.getClusterCount(), "%d/%d [%.4f]" % (epoch + 1, epochs, error / total_batches))
                rootLogger.info(quality_desc)
            else:
                # Just print the training loss
                rootLogger.info("%-30s     %8s     %8s" % ("%d/%d [%.4f]" % (epoch + 1, epochs, error / total_batches), "", ""))
            if self.shouldStopNow:
                break

        # Save the inputs in latent space and the network parameters
        for i, batch in enumerate(dataset.iterate_minibatches(self.input_type, batch_size, shuffle=False)):
            Z[i * batch_size:(i + 1) * batch_size] = self.predictEncoding(batch[0])
        np.save('saved_params/%s/pc_km_z_%s.npy' % (dataset.name, self.name), Z)
        np.savez('saved_params/%s/pc_km_m_%s.npz' % (dataset.name, self.name),
                 *lasagne.layers.get_all_param_values(self.network, trainable=True))
Example #17
    def build_treatment_model(self, n_vars, **kwargs):

        input_vars = TT.matrix()
        instrument_vars = TT.matrix()
        targets = TT.vector()

        inputs = layers.InputLayer((None, n_vars), input_vars)
        inputs = layers.DropoutLayer(inputs, p=0.2)

        dense_layer = layers.DenseLayer(inputs, 2 * kwargs['dense_size'], nonlinearity=nonlinearities.rectify)
        dense_layer = layers.batch_norm(dense_layer)
        dense_layer = layers.DropoutLayer(dense_layer, p=0.2)

        for _ in xrange(kwargs['n_dense_layers'] - 1):
            dense_layer = layers.DenseLayer(dense_layer, kwargs['dense_size'], nonlinearity=nonlinearities.rectify)
            dense_layer = layers.batch_norm(dense_layer)

        self.treatment_output = layers.DenseLayer(dense_layer, 1, nonlinearity=nonlinearities.linear)
        init_params = layers.get_all_param_values(self.treatment_output)

        prediction = layers.get_output(self.treatment_output, deterministic=False)
        test_prediction = layers.get_output(self.treatment_output, deterministic=True)

        l2_cost = regularization.regularize_network_params(self.treatment_output, regularization.l2)
        loss = gmm_loss(prediction, targets, instrument_vars) + 1e-4 * l2_cost

        params = layers.get_all_params(self.treatment_output, trainable=True)
        param_updates = updates.adadelta(loss, params)

        self._train_fn = theano.function(
            [
                input_vars,
                targets,
                instrument_vars,
            ],
            loss,
            updates=param_updates
        )

        self._loss_fn = theano.function(
            [
                input_vars,
                targets,
                instrument_vars,
            ],
            loss,
        )

        self._output_fn = theano.function(
            [
                input_vars,
            ],
            test_prediction,
        )

        return init_params
Example #18
    def _create_iter_funcs(self, layers, objective, update, output_type):
        y_batch = output_type('y_batch')

        output_layer = layers[-1]
        objective_kw = self._get_params_for('objective')

        loss_train = objective(
            layers, target=y_batch, **objective_kw)
        loss_eval = objective(
            layers, target=y_batch, deterministic=True, **objective_kw)
        predict_proba = get_output(output_layer, None, deterministic=True)
        if not self.regression:
            predict = predict_proba.argmax(axis=1)
            accuracy = T.mean(T.eq(predict, y_batch))
        else:
            accuracy = loss_eval

        all_params = self.get_all_params(trainable=True)
        update_params = self._get_params_for('update')
        updates = update(loss_train, all_params, **update_params)

        input_layers = [layer for layer in layers.values()
                        if isinstance(layer, InputLayer)]

        X_inputs = [theano.Param(input_layer.input_var, name=input_layer.name)
                    for input_layer in input_layers]
        inputs = X_inputs + [theano.Param(y_batch, name="y")]

        train_iter = theano.function(
            inputs=inputs,
            outputs=[loss_train],
            updates=updates,
            allow_input_downcast=True,
            )
        eval_iter = theano.function(
            inputs=inputs,
            outputs=[loss_eval, accuracy],
            allow_input_downcast=True,
            )
        predict_iter = theano.function(
            inputs=X_inputs,
            outputs=predict_proba,
            allow_input_downcast=True,
            )
        
        #Ido addition:
        h_predict = get_output(layers[self.hiddenLayer_to_output], None, deterministic=True)
  
        output_last_hidden_layer_ = theano.function(
            inputs=X_inputs,
            outputs=h_predict,
            allow_input_downcast=True,
            )
        
        return train_iter, eval_iter, predict_iter, output_last_hidden_layer_
Example #19
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer_input     = InputLayer(shape=(None, 30, 80, 80), input_var=input_var) #InputLayer(shape=(None, 1, 30, 64, 64), input_var=input_var)
    layer_0         = DimshuffleLayer(layer_input, (0, 'x', 1, 2, 3))

    # Z-score?

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_1         = batch_norm(Conv3DDNNLayer(incoming=layer_0, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_2         = batch_norm(Conv3DDNNLayer(incoming=layer_1, num_filters=16, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_3         = MaxPool3DDNNLayer(layer_2, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_4         = DropoutLayer(layer_3, p=0.25)

    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_5         = batch_norm(Conv3DDNNLayer(incoming=layer_4, num_filters=32, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_6         = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=32, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_7         = MaxPool3DDNNLayer(layer_6, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_8         = DropoutLayer(layer_7, p=0.25)
    
    # Convolution then batchNormalisation then activation layer, then zero padding layer followed by a dropout layer
    layer_5         = batch_norm(Conv3DDNNLayer(incoming=layer_8, num_filters=64, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_6         = batch_norm(Conv3DDNNLayer(incoming=layer_5, num_filters=64, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_7         = batch_norm(Conv3DDNNLayer(incoming=layer_6, num_filters=64, filter_size=(3,3,3), stride=(1,1,1), pad='same', nonlinearity=leaky_rectify))
    layer_8         = MaxPool3DDNNLayer(layer_7, pool_size=(2, 2, 2), stride=(2, 2, 2), pad=(1, 1, 1))
    layer_9         = DropoutLayer(layer_8, p=0.25)

    # LSTM
    layer         = DimshuffleLayer(layer_9, (0,2,1,3,4))
#    layer_prediction  = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True, cell=Gate(linear))
    layer = LSTMLayer(layer, num_units=2, only_return_final=True, learn_init=True)
    layer_prediction = DenseLayer(layer, 2, nonlinearity=linear)

    # Output Layer
    # layer_hidden         = DenseLayer(layer_flatten, 500, nonlinearity=linear)
    # layer_prediction     = DenseLayer(layer_hidden, 2, nonlinearity=linear)

    # Loss
    prediction           = get_output(layer_prediction) / multiply_var**2
    loss                 = T.abs_(prediction - target_var)
    loss                 = loss.mean()

    #Updates : Stochastic Gradient Descent (SGD) with Nesterov momentum
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True) / multiply_var**2
    test_loss            = T.abs_(test_prediction - target_var)
    test_loss            = test_loss.mean()

    # crps estimate
    crps                 = T.abs_(test_prediction - target_var).mean()/600
    
    return test_prediction, crps, loss, params
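
The Nesterov-momentum updates mentioned in the comment are presumably built by the caller; a hedged sketch of that step (variable types and hyperparameters below are assumptions):

# Hypothetical caller-side compilation; get_model above only returns symbolic expressions.
import theano
import theano.tensor as T
from lasagne.updates import nesterov_momentum

input_var = T.tensor4('inputs')      # (batch, 30, 80, 80), per the InputLayer above
target_var = T.matrix('targets')     # assumed two-column targets
multiply_var = T.scalar('multiply')  # assumed scalar scaling factor

test_prediction, crps, loss, params = get_model(input_var, target_var, multiply_var)

updates = nesterov_momentum(loss, params, learning_rate=0.001, momentum=0.9)
train_fn = theano.function([input_var, target_var, multiply_var], loss, updates=updates)
val_fn = theano.function([input_var, target_var, multiply_var], [crps, test_prediction])
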
Example #20
    def _compile(self):
        rc = self.rc

        # actor gradient step
        O = self.net.O
        V = ll.get_output(self.net.critic)
        params = self.net.actor_params
        regl_params = ll.get_all_params(self.net.actor, regularizable=True)
        regl = 0.5*rc['l2_actor']*tt.sum([tt.sum(p**2) for p in regl_params])
        updates = rc['gradient_updates'](V.mean()+regl, params, learning_rate=rc['lr_actor'])
        self.update_actor = th.function([O], [V.mean()], updates=updates)

        # critic bellman error (test version, doesn't update parameters)
        U = tt.matrix()
        Q = ll.get_output(self.net.critic, inputs={self.net.actor: U})
        Y = tt.matrix()
        J = 0.5*tt.mean((Y-Q)**2)
        self.J = th.function([O, U, Y], J)

        # critic bellman error (train version, does update parameters)
        regl_params = [p for p in ll.get_all_params(self.net.critic, regularizable=True)
                if p not in ll.get_all_params(self.net.actor)]
        regl = 0.5*rc['l2_critic']*tt.sum([tt.sum(p**2) for p in regl_params])
        params = self.net.critic_params
        updates = rc['gradient_updates'](J+regl, params, learning_rate=rc['lr_critic'])
        self.update_critic = th.function([O, U, Y], J, updates=updates)

        # target network update
        updates = []
        tau = rc['tau']
        for p,tgt_p in zip(self.net.all_params, self.target_net.all_params):
            updates.append( (tgt_p, tau*p + (1-tau)*tgt_p) )
        self.update_target = th.function([], [], updates=updates)

        # build cost function
        # TODO: handle this better through rc
        x = tt.vector()
        u = tt.vector()
        site_xpos = tt.matrix()

        # L2 costs
        c = 0.5*rc['l2_q']*tt.sum(x[:self.model['nq']]**2)
        c += 0.5*rc['l2_v']*tt.sum(x[-self.model['nv']:]**2)
        c += 0.5*rc['l2_u']*tt.sum(u**2)

        # Huber costs
        if rc['huber_site'] is not None:
            a = rc['huber_alpha']
            d = site_xpos[0] - site_xpos[1]
            c += rc['huber_site']*(tt.sqrt(tt.sum(d**2) + a**2) - a)

        # compile cost function
        # TODO: remove need for 'on_unused_input'
        self.cost = th.function([x, u, site_xpos], c, on_unused_input='ignore')
Example #21
def triplet_loss_iter(embedder, update_params={}):
    X_triplets = {
            'anchor':T.tensor4(),
            'positive':T.tensor4(),
            'negative':T.tensor4(),
            } # each will be a batch of images

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k:ll.get_output(embedder, X)[-1] for k, X in X_triplets.items()}
    predicted_embeds_valid = {k:ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_triplets.items()}

    # each output should be batch_size x embed_size

    # should give us a vector of batch_size of distances btw anchor and positive
    alpha = 0.2 # FaceNet alpha
    triplet_pos = lambda pred: (pred['anchor'] - pred['positive']).norm(2,axis=1)
    triplet_neg = lambda pred: (pred['anchor'] - pred['negative']).norm(2,axis=1)
    triplet_distances = lambda pred: (triplet_pos(pred) - triplet_neg(pred) + alpha).clip(0, np.inf)
    triplet_failed = lambda pred: T.mean(triplet_distances(pred) > alpha)
    triplet_loss = lambda pred: T.sum(triplet_distances(pred))

    decay = 0.001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: triplet_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'TL' # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True) # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_triplets['anchor'], X_triplets['positive'], X_triplets['negative']], [triplet_loss(predicted_embeds_valid),
                                                                                                          losses_reg(predicted_embeds_valid),
                                                                                                          triplet_failed(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
Example #22
def contrastive_loss_iter(embedder, update_params={}):
    X_pairs = {
            'img1':T.tensor4(),
            'img2':T.tensor4(),
            }
    y = T.ivector() # basically class labels

    final_emb_layer = embedder[-1]
    all_layers = ll.get_all_layers(embedder)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_embeds_train = {k:ll.get_output(embedder, X)[-1] for k, X in X_pairs.items()}
    predicted_embeds_valid = {k:ll.get_output(final_emb_layer, X, deterministic=True) for k, X in X_pairs.items()}

    margin = 1

    # if distance is 0 that's bad
    distance = lambda pred: (pred['img1'] - pred['img2'] + 1e-7).norm(2, axis=1)
    contrastive_loss = lambda pred: T.mean(y*(distance(pred)) + (1 - y)*(margin - distance(pred)).clip(0,np.inf))
    failed_matches = lambda pred: T.switch(T.eq(T.sum(y),0), 0, T.sum((y*distance(pred)) > margin) / T.sum(y))
    failed_nonmatches = lambda pred: T.switch(T.eq(T.sum(1-y),0), 0, T.sum((1-y*distance(pred)) < margin) / T.sum(1-y))
    failed_pairs = lambda pred: 0.5*failed_matches(pred) + 0.5*failed_nonmatches(pred)

    decay = 0.0001
    reg = regularize_network_params(final_emb_layer, l2) * decay
    losses_reg = lambda pred: contrastive_loss(pred) + reg
    loss_train = losses_reg(predicted_embeds_train)
    loss_train.name = 'CL' # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in embedder]))
    all_params = ll.get_all_params(embedder, trainable=True) # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X_pairs['img1'], X_pairs['img2'], y], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X_pairs['img1'], X_pairs['img2'], y], [
                                    contrastive_loss(predicted_embeds_valid),
                                    losses_reg(predicted_embeds_valid),
                                    failed_pairs(predicted_embeds_valid)])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
Example #23
def create_generator_func(layers, apply_updates=False):
    X = T.fmatrix('X')
    X_batch = T.fmatrix('X_batch')

    # no need to pass an input to l_prior_in here
    generator_outputs = get_output(
        layers['l_encoder_out'], X, deterministic=False)

    # so pass the output of the generator as the output of the concat layer
    discriminator_outputs = get_output(
        layers['l_discriminator_out'],
        inputs={
            layers['l_prior_encoder_concat']: generator_outputs,
        },
        deterministic=False
    )

    # the discriminator learns to predict 1 for q(z|x),
    # so the generator should fool it into predicting 0
    generator_targets = T.zeros_like(X_batch.shape[0])

    # so the generator needs to push the discriminator's output to 0
    generator_loss = T.mean(
        T.nnet.binary_crossentropy(
            discriminator_outputs,
            generator_targets,
        )
    )

    if apply_updates:
        # only layers that are part of the generator (i.e., encoder)
        # should be updated
        generator_params = get_all_params(
            layers['l_discriminator_out'], trainable=True, generator=True)

        generator_updates = nesterov_momentum(
            generator_loss, generator_params, 0.1, 0.0)
    else:
        generator_updates = None

    generator_func = theano.function(
        inputs=[
            theano.In(X_batch),
        ],
        outputs=generator_loss,
        updates=generator_updates,
        givens={
            X: X_batch,
        },
    )

    return generator_func
Example #24
def loss_iter(segmenter, update_params={}):
    X = T.tensor4()
    y = T.tensor4()
    pixel_weights = T.tensor3()

    final_pred_layer = segmenter[-1]
    all_layers = ll.get_all_layers(segmenter)
    imwrite_architecture(all_layers, './layer_rep.png')
    # assume we get a list of predictions (e.g. for jet architecture, but should work w/just one pred)
    # another assumption (which must hold when the network is being made)
    # the last prediction layer is a) the end of the network and b) what we ultimately care about
    # however the other prediction layers will be incorporated into the training loss
    predicted_masks_train = ll.get_output(segmenter, X)
    predicted_mask_valid = ll.get_output(final_pred_layer, X, deterministic=True)

    thresh = 0.5
    accuracy = lambda pred: T.mean(T.eq(T.argmax(pred, axis=1), T.argmax(y, axis=1)))
    true_pos = lambda pred: T.sum((pred[:,0,:,:] > thresh) * (y[:,0,:,:] > thresh))
    false_pos = lambda pred: T.sum((pred[:,0,:,:] > thresh) - (y[:,0,:,:] > thresh))
    precision = lambda pred: (true_pos(pred) / (true_pos(pred) + false_pos(pred)))

    pixel_weights_1d = pixel_weights.flatten(ndim=1)
    losses = lambda pred: T.mean(crossentropy_flat(pred + 1e-7, y + 1e-7) * pixel_weights_1d)

    decay = 0.0001
    reg = regularize_network_params(final_pred_layer, l2) * decay
    losses_reg = lambda pred: losses(pred) + reg
    loss_train = T.sum([losses_reg(mask) for mask in predicted_masks_train])
    loss_train.name = 'CE' # for the names
    #all_params = list(chain(*[ll.get_all_params(pred) for pred in segmenter]))
    all_params = ll.get_all_params(segmenter, trainable=True) # this should work with multiple 'roots'
    grads = T.grad(loss_train, all_params, add_names=True)
    updates = adam(grads, all_params)
    #updates = nesterov_momentum(grads, all_params, update_params['l_r'], momentum=update_params['momentum'])
    acc_train = accuracy(predicted_masks_train[-1])
    acc_valid = accuracy(predicted_mask_valid)
    prec_train = precision(predicted_masks_train[-1])
    prec_valid = precision(predicted_mask_valid)

    print("Compiling network for training")
    tic = time.time()
    train_iter = theano.function([X, y, pixel_weights], [loss_train] + grads, updates=updates)
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)
    #theano.printing.pydotprint(loss, outfile='./loss_graph.png',var_with_name_simple=True)
    print("Compiling network for validation")
    tic = time.time()
    valid_iter = theano.function([X, y, pixel_weights], [losses(predicted_mask_valid), losses_reg(predicted_mask_valid), prec_valid])
    toc = time.time() - tic
    print("Took %0.2f seconds" % toc)

    return {'train':train_iter, 'valid':valid_iter, 'gradnames':[g.name for g in grads]}
Example #25
def get_network(model):

    input_data = tensor.dmatrix('x')
    targets_var = tensor.dmatrix('y')

    network = layers.InputLayer((model['batch_size'], model['input_vars']), input_data)

    nonlin = nonlinearities.rectify
    if model['hidden_nonlinearity'] != 'ReLu':
        nonlin = nonlinearities.tanh

    prev_layer = network

    for l in range(model['nlayers']):
        fc = layers.DenseLayer(prev_layer, model['units'], nonlinearity=nonlin)
        if model['dropout']:
            fc = layers.DropoutLayer(fc, 0.5)
        prev_layer = fc

    output_lin = None
    if model['output_mode'] == OUTPUT_LOG:
        output_lin = nonlinearities.tanh
    output_layer = layers.DenseLayer(prev_layer, 1, nonlinearity=output_lin)

    predictions = layers.get_output(output_layer)

    if model['output_mode'] == OUTPUT_BOUNDED:
        (minth, maxth) = model['maxmin'][model['control']]
        maxt = theano.shared(np.ones((model['batch_size'], 1)) * maxth)
        mint = theano.shared(np.ones((model['batch_size'], 1)) * minth)
        predictions = tensor.min(tensor.concatenate([maxt, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))
        predictions = tensor.max(tensor.concatenate([mint, predictions], axis=1), axis=1)
        predictions = tensor.reshape(predictions, (model['batch_size'], 1))

    loss = objectives.squared_error(predictions, targets_var)
    loss = objectives.aggregate(loss, mode='mean')

    params = layers.get_all_params(output_layer)

    test_prediction = layers.get_output(output_layer, deterministic=True)
    test_loss = objectives.squared_error(test_prediction,  targets_var)
    test_loss = test_loss.mean()

    updates_sgd = updates.sgd(loss, params, learning_rate=model['lr'])
    ups = updates.apply_momentum(updates_sgd, params, momentum=0.9)

    train_fn = theano.function([input_data, targets_var], loss, updates=ups)
    pred_fn = theano.function([input_data], predictions)
    val_fn = theano.function([input_data, targets_var], test_loss)

    return {'train': train_fn, 'eval': val_fn, 'pred': pred_fn, 'layers': output_layer}
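
A short usage sketch of the returned function dictionary; the configuration values below are assumptions (only the keys are taken from the code above).

# Hypothetical configuration and a single training step; keys mirror those read inside get_network.
import numpy as np

model = {
    'batch_size': 32, 'input_vars': 10, 'units': 64, 'nlayers': 2,
    'dropout': True, 'hidden_nonlinearity': 'ReLu',
    'output_mode': OUTPUT_LOG,   # one of the module's OUTPUT_* constants
    'lr': 0.01, 'maxmin': {}, 'control': None,
}

fns = get_network(model)
X = np.zeros((32, 10))   # input_data is a dmatrix, so float64 is fine
y = np.zeros((32, 1))
train_loss = fns['train'](X, y)
val_loss = fns['eval'](X, y)
preds = fns['pred'](X)
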
Example #26
def get_model(input_var, target_var, multiply_var):

    # input layer with unspecified batch size
    layer_both_0         = InputLayer(shape=(None, 30, 64, 64), input_var=input_var)

    # Z-score?

    # Convolution then batchNormalisation then activation layer, twice, then zero padding layer followed by a dropout layer
    layer_both_1         = batch_norm(Conv2DLayer(layer_both_0, 64, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_2         = batch_norm(Conv2DLayer(layer_both_1, 64, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_3         = MaxPool2DLayer(layer_both_2, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_4         = DropoutLayer(layer_both_3, p=0.25)

    # Convolution then batchNormalisation then activation layer, twice, then zero padding layer followed by a dropout layer
    layer_both_5         = batch_norm(Conv2DLayer(layer_both_4, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_6         = batch_norm(Conv2DLayer(layer_both_5, 128, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_7         = MaxPool2DLayer(layer_both_6, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_8         = DropoutLayer(layer_both_7, p=0.25)

    # Convolution then batchNormalisation then activation layer, twice, then zero padding layer followed by a dropout layer
    layer_both_9         = batch_norm(Conv2DLayer(layer_both_8, 256, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_10        = batch_norm(Conv2DLayer(layer_both_9, 256, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_11        = batch_norm(Conv2DLayer(layer_both_10, 256, (3, 3), pad='same', nonlinearity=leaky_rectify))
    layer_both_12        = MaxPool2DLayer(layer_both_11, pool_size=(2, 2), stride=(2, 2), pad=(1, 1))
    layer_both_13        = DropoutLayer(layer_both_12, p=0.25)

    # Flatten
    layer_flatten        = FlattenLayer(layer_both_13)

    # Prediction
    layer_hidden         = DenseLayer(layer_flatten, 500, nonlinearity=sigmoid)
    layer_prediction     = DenseLayer(layer_hidden, 2, nonlinearity=linear)

    # Loss
    prediction           = get_output(layer_prediction) / multiply_var
    loss                 = squared_error(prediction, target_var)
    loss                 = loss.mean()


    # Trainable parameters; the caller is expected to build the SGD-with-Nesterov-momentum updates from these
    params               = get_all_params(layer_prediction, trainable=True)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network, disabling dropout layers.
    test_prediction      = get_output(layer_prediction, deterministic=True) / multiply_var
    test_loss            = squared_error(test_prediction, target_var)
    test_loss            = test_loss.mean()

    # crps estimate
    crps                 = T.abs_(test_prediction - target_var).mean()/600

    return test_prediction, crps, loss, params
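A minimal caller sketch (hypothetical, assuming input_var, target_var and multiply_var are Theano symbolic variables) turning the returned loss and params into the SGD-with-Nesterov-momentum updates referred to above:

import theano
from lasagne.updates import nesterov_momentum

test_prediction, crps, loss, params = get_model(input_var, target_var, multiply_var)
updates = nesterov_momentum(loss, params, learning_rate=0.001, momentum=0.9)
train_fn = theano.function([input_var, target_var, multiply_var], loss, updates=updates)
crps_fn = theano.function([input_var, target_var, multiply_var], crps)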
Ejemplo n.º 27
0
 def get_train_loss(self, target_vars, params):
     assert len(target_vars) == 1
     prediction = get_output(self.l_out)
     mean_loss = self.loss(prediction, target_vars[0]).mean()
     monitored = [('loss', mean_loss)]
     grads = T.grad(mean_loss, params)
     if self.options.monitor_grads:
         for p, grad in zip(params, grads):
             monitored.append(('grad/' + p.name, grad))
     if self.options.monitor_activations:
         for name, layer in get_named_layers(self.l_out).iteritems():
             monitored.append(('activation/' + name, get_output(layer)))
     return OrderedDict(monitored), grads, []
Ejemplo n.º 28
0
    def _init_model(self, in_size, out_size, n_hid=10, learning_rate_sl=0.005, \
            learning_rate_rl=0.005, batch_size=32, ment=0.1):
        # 2-layer MLP
        self.in_size = in_size # x and y coordinate
        self.out_size = out_size # up, down, right, left
        self.batch_size = batch_size
        self.learning_rate = learning_rate_rl
        self.n_hid = n_hid

        input_var, turn_mask, act_mask, reward_var = T.ftensor3('in'), T.imatrix('tm'), \
                T.itensor3('am'), T.fvector('r')

        in_var = T.reshape(input_var, (input_var.shape[0]*input_var.shape[1],self.in_size))

        l_mask_in = L.InputLayer(shape=(None,None), input_var=turn_mask)

        pol_in = T.fmatrix('pol-h')
        l_in = L.InputLayer(shape=(None,None,self.in_size), input_var=input_var)
        l_pol_rnn = L.GRULayer(l_in, n_hid, hid_init=pol_in, mask_input=l_mask_in) # B x H x D
        pol_out = L.get_output(l_pol_rnn)[:,-1,:]
        l_den_in = L.ReshapeLayer(l_pol_rnn, (turn_mask.shape[0]*turn_mask.shape[1], n_hid)) # BH x D
        l_out = L.DenseLayer(l_den_in, self.out_size, nonlinearity=lasagne.nonlinearities.softmax)

        self.network = l_out
        self.params = L.get_all_params(self.network)

        # rl
        probs = L.get_output(self.network) # BH x A
        out_probs = T.reshape(probs, (input_var.shape[0],input_var.shape[1],self.out_size)) # B x H x A
        log_probs = T.log(out_probs)
        act_probs = (log_probs*act_mask).sum(axis=2) # B x H
        ep_probs = (act_probs*turn_mask).sum(axis=1) # B
        H_probs = -T.sum(T.sum(out_probs*log_probs,axis=2),axis=1) # B
        self.loss = 0.-T.mean(ep_probs*reward_var + ment*H_probs)

        updates = lasagne.updates.rmsprop(self.loss, self.params, learning_rate=learning_rate_rl, \
                epsilon=1e-4)

        self.inps = [input_var, turn_mask, act_mask, reward_var, pol_in]
        self.train_fn = theano.function(self.inps, self.loss, updates=updates)
        self.obj_fn = theano.function(self.inps, self.loss)
        self.act_fn = theano.function([input_var, turn_mask, pol_in], [out_probs, pol_out])

        # sl
        sl_loss = 0.-T.mean(ep_probs)
        sl_updates = lasagne.updates.rmsprop(sl_loss, self.params, learning_rate=learning_rate_sl, \
                epsilon=1e-4)

        self.sl_train_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss, \
                updates=sl_updates)
        self.sl_obj_fn = theano.function([input_var, turn_mask, act_mask, pol_in], sl_loss)
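A minimal sketch of querying the compiled action function (the agent instance, the single-turn horizon and the zero initial hidden state are assumptions):

import numpy as np

B, H = agent.batch_size, 1                                  # hypothetical: one dialogue turn per call
obs = np.zeros((B, H, agent.in_size), dtype='float32')      # input_var: B x H x in_size
turns = np.ones((B, H), dtype='int32')                      # turn_mask: B x H
pol_h = np.zeros((B, agent.n_hid), dtype='float32')         # initial GRU hidden state (pol_in)
act_probs, pol_h = agent.act_fn(obs, turns, pol_h)          # B x H x A probabilities, updated hidden state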
Ejemplo n.º 29
0
def build_optimizer(network, placeholders, optimization, learning_rate):

	# build loss function 
	
	if optimization['objective'] == 'lower_bound':
		if 'binary' in optimization:
			binary = optimization['binary']
		else:
			binary = False

		loss, prediction = variational_lower_bound(network, placeholders['inputs'], 
													deterministic=False, binary=binary)

		# regularize parameters
		loss += regularization(network['X'], optimization)	

		params = layers.get_all_params(network['X'], trainable=True)

	else:
		prediction = layers.get_output(network['output'], deterministic=False)
		loss = build_loss(placeholders['targets'], prediction, optimization)

		# regularize parameters
		loss += regularization(network['output'], optimization)

		params = layers.get_all_params(network['output'], trainable=True)    


	# calculate and clip gradients
	if "weight_norm" in optimization:
		weight_norm = optimization['weight_norm']
	else:
		weight_norm = None
	grad = calculate_gradient(loss, params, weight_norm=weight_norm)
	  
	# setup parameter updates
	update_op = build_updates(grad, params, optimization, learning_rate)

	# test/validation set 
	if optimization['objective'] == 'lower_bound':
		test_loss, test_prediction = variational_lower_bound(network, placeholders['inputs'], deterministic=False, binary=binary)	
	else:
		test_prediction = layers.get_output(network['output'], deterministic=True)
		test_loss = build_loss(placeholders['targets'], test_prediction, optimization)
			
	# create theano function
	train_fun = theano.function(list(placeholders.values()), [loss, prediction], updates=update_op)
	test_fun = theano.function(list(placeholders.values()), [test_loss, test_prediction])

	return train_fun, test_fun
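A minimal usage sketch (the network/placeholders/optimization objects and the minibatch arrays are hypothetical; the argument order follows list(placeholders.values()), here assumed to be inputs then targets):

train_fun, test_fun = build_optimizer(network, placeholders, optimization, learning_rate=0.001)
batch_loss, batch_pred = train_fun(X_batch, y_batch)   # one update step on a minibatch
valid_loss, valid_pred = test_fun(X_valid, y_valid)    # deterministic evaluation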
Ejemplo n.º 30
0
Archivo: bidnn.py Proyecto: v-v/BiDNN
    def __init__(self, conf):
        self.conf = conf

        if self.conf.act == "linear":
            self.conf.act = linear
        elif self.conf.act == "sigmoid":
            self.conf.act = sigmoid
        elif self.conf.act == "relu":
            self.conf.act = rectify
        elif self.conf.act == "tanh":
            self.conf.act = tanh
        else:
            raise ValueError("Unknown activation function", self.conf.act)

        input_var_first   = T.matrix('inputs1')
        input_var_second  = T.matrix('inputs2')
        target_var        = T.matrix('targets')

        # create network        
        self.autoencoder, encoder_first, encoder_second = self.__create_toplogy__(input_var_first, input_var_second)
        
        self.out = get_output(self.autoencoder)
        
        loss = squared_error(self.out, target_var)
        loss = loss.mean()
        
        params = get_all_params(self.autoencoder, trainable=True)
        updates = nesterov_momentum(loss, params, learning_rate=self.conf.lr, momentum=self.conf.momentum)
        
        # training function
        self.train_fn = theano.function([input_var_first, input_var_second, target_var], loss, updates=updates)
        
        # function to reconstruct
        test_reconstruction = get_output(self.autoencoder, deterministic=True)
        self.reconstruction_fn = theano.function([input_var_first, input_var_second], test_reconstruction)
        
        # encoding function
        test_encode = get_output([encoder_first, encoder_second], deterministic=True)
        self.encoding_fn = theano.function([input_var_first, input_var_second], test_encode)

        # utils
        blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
        self.blas_nrm2 = blas('nrm2', np.array([], dtype=float))
        self.blas_scal = blas('scal', np.array([], dtype=float))

        # load weights if necessary
        if self.conf.load_model is not None:
            self.load_model()
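The BLAS helpers set up above are the usual ingredients for unit-normalising vectors; a minimal sketch of such a helper (hypothetical, in the style of gensim's unitvec; assumes float64 input):

import numpy as np
import scipy.linalg

blas = lambda name, ndarray: scipy.linalg.get_blas_funcs((name,), (ndarray,))[0]
blas_nrm2 = blas('nrm2', np.array([], dtype=float))
blas_scal = blas('scal', np.array([], dtype=float))

def unitvec(vec):
    # scale vec to unit L2 norm; leave zero vectors untouched
    length = blas_nrm2(vec)
    return blas_scal(1.0 / length, vec) if length > 0.0 else vec

unitvec(np.array([3.0, 4.0]))   # -> array([0.6, 0.8])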
Ejemplo n.º 31
0
    word_index, tokenizer = data_process.get_tokenizer(data, MAX_NB_WORDS,
                                                       MAX_SEQUENCE_LENGTH)
    train_x, train_y, train_le, train_labels, _, tr_ids = data_process.get_dev_data_with_id(
        train_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
    dev_x, dev_y, dev_le, dev_labels, _, dev_ids = data_process.get_dev_data_with_id(
        dev_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
    test_x, test_y, test_le, test_labels, _, test_ids = data_process.get_dev_data_with_id(
        test_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)
    domain_test_x, domain_test_y, domain_test_le, domain_test_labels, _, domain_ids = data_process.get_dev_data_with_id(
        domain_test_file, tokenizer, MAX_SEQUENCE_LENGTH, delim)

    x_sym = T.imatrix('inputs1')
    l_in = lasagne.layers.InputLayer((None, MAX_SEQUENCE_LENGTH), x_sym)
    model2 = build_convpool_max(l_in, emb_model, word_index, MAX_NB_WORDS,
                                EMBEDDING_DIM, MAX_SEQUENCE_LENGTH)
    output = get_output(model2, x_sym)
    #
    cnn = theano.function([x_sym], output)
    train_x = cnn(train_x)
    print(train_x.shape)
    dev_x = cnn(dev_x)
    print(dev_x.shape)
    test_x = cnn(test_x)
    print(test_x.shape)
    R, C = train_x.shape
    train_y = train_y.astype('int32')
    dev_y = dev_y.astype('int32')
    domain_test_x = cnn(domain_test_x)

    print(train_x.shape)
    print(dev_x.shape)
Ejemplo n.º 32
0
def ff(input_data, input_mask, network):
    predict_data = get_output(network, deterministic=True)
    predict_fn = theano.function(inputs=[input_data, input_mask],
                                 outputs=[predict_data])

    return predict_fn
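A minimal sketch of wiring ff up, assuming a toy network in which both the data and the mask feed the graph (all names and shapes below are hypothetical):

import numpy as np
import theano.tensor as T
from lasagne.layers import InputLayer, ElemwiseMergeLayer, DenseLayer
from lasagne.nonlinearities import softmax

input_data = T.matrix('input_data')
input_mask = T.matrix('input_mask')
l_data = InputLayer((None, 10), input_var=input_data)
l_mask = InputLayer((None, 10), input_var=input_mask)
l_masked = ElemwiseMergeLayer([l_data, l_mask], T.mul)      # apply the mask elementwise
network = DenseLayer(l_masked, num_units=2, nonlinearity=softmax)

predict_fn = ff(input_data, input_mask, network)
probs, = predict_fn(np.random.rand(4, 10).astype('float32'),
                    np.ones((4, 10), dtype='float32'))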
Ejemplo n.º 33
0
def event_span_classifier(args, input_var, target_var, wordEmbeddings, seqlen,
                          num_feats):

    print("Building model with 1D Convolution")

    vocab_size = wordEmbeddings.shape[1]
    wordDim = wordEmbeddings.shape[0]

    kw = 2
    num_filters = seqlen - kw + 1
    stride = 1

    #important context words as channels

    #CNN_sentence config
    filter_size = wordDim
    pool_size = seqlen - filter_size + 1

    input = InputLayer((None, seqlen, num_feats), input_var=input_var)
    batchsize, _, _ = input.input_var.shape
    emb = EmbeddingLayer(input,
                         input_size=vocab_size,
                         output_size=wordDim,
                         W=wordEmbeddings.T)
    #emb.params[emb.W].remove('trainable') #(batchsize, seqlen, wordDim)

    #print get_output_shape(emb)
    reshape = ReshapeLayer(emb, (batchsize, seqlen, num_feats * wordDim))
    #print get_output_shape(reshape)

    conv1d = Conv1DLayer(reshape,
                         num_filters=num_filters,
                         filter_size=wordDim,
                         stride=1,
                         nonlinearity=tanh,
                         W=GlorotUniform())  #nOutputFrame = num_flters,
    #nOutputFrameSize = (num_feats*wordDim-filter_size)/stride +1

    #print get_output_shape(conv1d)

    conv1d = DimshuffleLayer(conv1d, (0, 2, 1))

    #print get_output_shape(conv1d)

    pool_size = num_filters

    maxpool = MaxPool1DLayer(conv1d, pool_size=pool_size)

    #print get_output_shape(maxpool)

    #forward = FlattenLayer(maxpool)

    #print get_output_shape(forward)

    hid = DenseLayer(maxpool, num_units=args.hiddenDim, nonlinearity=sigmoid)

    network = DenseLayer(hid, num_units=2, nonlinearity=softmax)

    prediction = get_output(network)

    loss = T.mean(binary_crossentropy(prediction, target_var))
    lambda_val = 0.5 * 1e-4

    layers = {
        emb: lambda_val,
        conv1d: lambda_val,
        hid: lambda_val,
        network: lambda_val
    }
    penalty = regularize_layer_params_weighted(layers, l2)
    loss = loss + penalty

    params = get_all_params(network, trainable=True)

    if args.optimizer == "sgd":
        updates = sgd(loss, params, learning_rate=args.step)
    elif args.optimizer == "adagrad":
        updates = adagrad(loss, params, learning_rate=args.step)
    elif args.optimizer == "adadelta":
        updates = adadelta(loss, params, learning_rate=args.step)
    elif args.optimizer == "nesterov":
        updates = nesterov_momentum(loss, params, learning_rate=args.step)
    elif args.optimizer == "rms":
        updates = rmsprop(loss, params, learning_rate=args.step)
    elif args.optimizer == "adam":
        updates = adam(loss, params, learning_rate=args.step)
    else:
        raise "Need set optimizer correctly"

    test_prediction = get_output(network, deterministic=True)
    test_loss = T.mean(binary_crossentropy(test_prediction, target_var))

    train_fn = theano.function([input_var, target_var],
                               loss,
                               updates=updates,
                               allow_input_downcast=True)

    test_acc = T.mean(binary_accuracy(test_prediction, target_var))
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc],
                             allow_input_downcast=True)

    return train_fn, val_fn, network
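A minimal training-step sketch for the returned functions (the minibatch arrays are hypothetical; inputs are int32 word indices, targets the 0/1 labels):

train_loss = train_fn(X_batch, y_batch)        # one optimizer step
val_loss, val_acc = val_fn(X_val, y_val)       # deterministic loss and accuracy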
Ejemplo n.º 34
0
 def log_likelihood_sym(self, x_var, y_var):
     normalized_xs_var = (x_var - self._x_mean_var) / self._x_std_var
     prob = L.get_output(
         self._l_prob, {self._prob_network.input_layer: normalized_xs_var})
     return self._dist.log_likelihood_sym(TT.cast(y_var, 'int32'),
                                          dict(prob=prob))
Ejemplo n.º 35
0
'''
# zca
whitener = ZCA(x=x_unlabelled)
sym_x_l_zca = whitener.apply(sym_x_l)
sym_x_eval_zca = whitener.apply(sym_x_eval)
sym_x_u_zca = whitener.apply(sym_x_u)
sym_x_u_rep_zca = whitener.apply(sym_x_u_rep)
sym_x_u_d_zca = whitener.apply(sym_x_u_d)

# init
lasagne.layers.get_output(classifier, sym_x_u_zca, init=True)
init_updates = [u for l in lasagne.layers.get_all_layers(classifier) for u in getattr(l, 'init_updates', [])]
init_fn = theano.function([sym_x_u], [], updates=init_updates)

# outputs
gen_out_x = ll.get_output(gen_layers[-1], {gen_in_y:sym_y_g, gen_in_z:sym_z_rand}, deterministic=False)
gen_out_x_zca = whitener.apply(gen_out_x)
cla_out_y_l = ll.get_output(classifier, sym_x_l_zca, deterministic=False)
cla_out_y_eval = ll.get_output(classifier, sym_x_eval_zca, deterministic=True)
cla_out_y = ll.get_output(classifier, sym_x_u_zca, deterministic=False)
cla_out_y_rep = ll.get_output(classifier, sym_x_u_rep_zca, deterministic=False)
bn_updates = [u for l in lasagne.layers.get_all_layers(classifier) for u in getattr(l, 'bn_updates', [])]

cla_out_y_d = ll.get_output(classifier, sym_x_u_d_zca, deterministic=False)
cla_out_y_d_hard = cla_out_y_d.argmax(axis=1)
cla_out_y_g = ll.get_output(classifier, gen_out_x_zca, deterministic=False)

dis_out_p = ll.get_output(dis_layers[-1], {dis_in_x:T.concatenate([sym_x_l,sym_x_u_d], axis=0),dis_in_y:T.concatenate([sym_y,cla_out_y_d_hard], axis=0)}, deterministic=False)
dis_out_p_g = ll.get_output(dis_layers[-1], {dis_in_x:gen_out_x,dis_in_y:sym_y_g}, deterministic=False)
# argmax
cla_out_y_hard = cla_out_y.argmax(axis=1)
Ejemplo n.º 36
0
    def __init__(
        self,
        env_spec,
        env,
        pkl_path=None,
        json_path=None,
        npz_path=None,
        trainable_snn=True,
        ##CF - latents units at the input
        latent_dim=3,  # we keep all these as the dim of the output of the other MLP and others that we will need!
        latent_name='categorical',
        bilinear_integration=False,  # again, needs to match!
        resample=False,  # this can change: frequency of resampling the latent?
        hidden_sizes_snn=(32, 32),
        hidden_sizes_selector=(10, 10),
        external_latent=False,
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        self.latent_dim = latent_dim  ## could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes_snn = hidden_sizes_snn
        self.hidden_sizes_selector = hidden_sizes_selector

        self.pre_fix_latent = np.array(
            []
        )  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array(
            [])  # this will hold the latents variable sampled in reset()
        self.shared_latent_var = theano.shared(
            self.latent_fix)  # this is for external lat! update that
        self._set_std_to_0 = False

        self.trainable_snn = trainable_snn
        self.external_latent = external_latent
        self.pkl_path = pkl_path
        self.json_path = json_path
        self.npz_path = npz_path
        self.old_policy = None

        if self.json_path:  # there is another one after defining all the NN to warm-start the params of the SNN
            data = json.load(
                open(os.path.join(config.PROJECT_PATH, self.json_path),
                     'r'))  # I should do this with the json file
            self.old_policy_json = data['json_args']["policy"]
            self.latent_dim = self.old_policy_json['latent_dim']
            self.latent_name = self.old_policy_json['latent_name']
            self.bilinear_integration = self.old_policy_json[
                'bilinear_integration']
            self.resample = self.old_policy_json[
                'resample']  # this may not be needed...
            self.min_std = self.old_policy_json['min_std']
            self.hidden_sizes_snn = self.old_policy_json['hidden_sizes']
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            self.old_policy = data["policy"]
            self.latent_dim = self.old_policy.latent_dim
            self.latent_name = self.old_policy.latent_name
            self.bilinear_integration = self.old_policy.bilinear_integration
            self.resample = self.old_policy.resample  # this may not be needed...
            self.min_std = self.old_policy.min_std
            self.hidden_sizes_snn = self.old_policy.hidden_sizes

        if self.latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif self.latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif self.latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(
                    self.latent_dim))  # this is an empty array
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        # retrieve dimensions and check consistency
        if isinstance(env, MazeEnv) or isinstance(env, GatherEnv):
            self.obs_robot_dim = env.robot_observation_space.flat_dim
            self.obs_maze_dim = env.maze_observation_space.flat_dim
        elif isinstance(env, NormalizedEnv):
            if isinstance(env.wrapped_env, MazeEnv) or isinstance(
                    env.wrapped_env, GatherEnv):
                self.obs_robot_dim = env.wrapped_env.robot_observation_space.flat_dim
                self.obs_maze_dim = env.wrapped_env.maze_observation_space.flat_dim
            else:
                self.obs_robot_dim = env.wrapped_env.observation_space.flat_dim
                self.obs_maze_dim = 0
        else:
            self.obs_robot_dim = env.observation_space.flat_dim
            self.obs_maze_dim = 0
        # print("the dims of the env are(rob/maze): ", self.obs_robot_dim, self.obs_maze_dim)
        all_obs_dim = env_spec.observation_space.flat_dim
        assert all_obs_dim == self.obs_robot_dim + self.obs_maze_dim

        if self.external_latent:  # in case we want to fix the latent externally
            l_all_obs_var = L.InputLayer(
                shape=(None, ) + (self.obs_robot_dim + self.obs_maze_dim, ))
            all_obs_var = l_all_obs_var.input_var
            # l_selection = ConstOutputLayer(incoming=l_all_obs_var, output_var=self.shared_latent_var)
            l_selection = ParamLayer(incoming=l_all_obs_var,
                                     num_units=self.latent_dim,
                                     param=self.shared_latent_var,
                                     trainable=False)
            selection_var = L.get_output(l_selection)

        else:
            # create network with softmax output: it will be the latent 'selector'!
            latent_selection_network = MLP(
                input_shape=(self.obs_robot_dim + self.obs_maze_dim, ),
                output_dim=self.latent_dim,
                hidden_sizes=self.hidden_sizes_selector,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )
            l_all_obs_var = latent_selection_network.input_layer
            all_obs_var = latent_selection_network.input_layer.input_var

            # collect the output to select the behavior of the robot controller (equivalent to latents)
            l_selection = latent_selection_network.output_layer
            selection_var = L.get_output(l_selection)

        # split all_obs into the robot and the maze obs --> ROBOT goes first!!
        l_obs_robot = CropLayer(l_all_obs_var,
                                start_index=None,
                                end_index=self.obs_robot_dim)
        l_obs_maze = CropLayer(l_all_obs_var,
                               start_index=self.obs_robot_dim,
                               end_index=None)

        obs_robot_var = all_obs_var[:, :self.obs_robot_dim]
        obs_maze_var = all_obs_var[:, self.obs_robot_dim:]

        # Enlarge obs with the selectors (or latents). Here just computing the final input dim
        if self.bilinear_integration:
            l_obs_snn = BilinearIntegrationLayer([l_obs_robot, l_selection])
        else:
            l_obs_snn = L.ConcatLayer([l_obs_robot, l_selection])

        action_dim = env_spec.action_space.flat_dim

        # create the action network
        mean_network = MLP(
            input_layer=l_obs_snn,  # input: the layer that handles the integration of the selector
            output_dim=action_dim,
            hidden_sizes=self.hidden_sizes_snn,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer

        if adaptive_std:
            log_std_network = MLP(input_layer=l_obs_snn,
                                  output_dim=action_dim,
                                  hidden_sizes=std_hidden_sizes,
                                  hidden_nonlinearity=std_hidden_nonlinearity,
                                  output_nonlinearity=None,
                                  name="log_stdMLP")
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                incoming=mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]

        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers

        if not self.trainable_snn:
            for layer in self._layers_snn:
                for param, tags in layer.params.items():  # layer.params is an OrderedDict: key = the shared var, value = set of tags
                    tags.remove("trainable")

        if self.json_path and self.npz_path:
            warm_params_dict = dict(
                np.load(os.path.join(config.PROJECT_PATH, self.npz_path)))
            # keys = list(param_dict.keys())
            self.set_params_snn(warm_params_dict)
        elif self.pkl_path:
            data = joblib.load(os.path.join(config.PROJECT_PATH,
                                            self.pkl_path))
            warm_params = data['policy'].get_params_internal()
            self.set_params_snn(warm_params)

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn_hier, self).__init__(env_spec)

        # debug
        obs_snn_var = L.get_output(l_obs_snn)
        self._l_obs_snn = ext.compile_function(
            inputs=[all_obs_var],
            outputs=obs_snn_var,
        )
        # self._log_std = ext.compile_function(
        #     inputs=[all_obs_var],
        #     outputs=log_std_var,
        # )
        self._mean = ext.compile_function(
            inputs=[all_obs_var],
            outputs=mean_var,
        )

        self._f_dist = ext.compile_function(
            inputs=[all_obs_var],
            outputs=[mean_var, log_std_var],
        )
        # if I want to monitor the selector output
        self._f_select = ext.compile_function(
            inputs=[all_obs_var],
            outputs=selection_var,
        )
Ejemplo n.º 37
0
    def __init__(self, babi_train_raw, babi_test_raw, word2vec,
                 word_vector_size, sent_vector_size, dim, mode, answer_module,
                 input_mask_mode, memory_hops, l2, normalize_attention,
                 batch_norm, dropout, dropout_in, **kwargs):

        print "==> not used params in DMN class:", kwargs.keys()
        self.vocab = {None: 0}
        self.ivocab = {0: None}

        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        self.sent_vector_size = sent_vector_size
        self.dim = dim
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.dropout_in = dropout_in

        self.max_inp_sent_len = 0
        self.max_q_len = 0
        """
        #To Use All Vocab
        self.vocab = {None: 0, 'jason': 134.0, 'office': 14.0, 'yellow': 78.0, 'bedroom': 24.0, 'go': 108.0, 'yes': 15.0, 'antoine': 138.0, 'milk': 139.0, 'before': 46.0, 'grabbed': 128.0, 'fit': 100.0, 'how': 105.0, 'swan': 73.0, 'than': 96.0, 'to': 13.0, 'does': 99.0, 's,e': 110.0, 'east': 102.0, 'rectangle': 82.0, 'gave': 149.0, 'then': 39.0, 'evening': 48.0, 'triangle': 79.0, 'garden': 37.0, 'get': 131.0, 'football,apple,milk': 179.0, 'they': 41.0, 'not': 178.0, 'bigger': 95.0, 'gray': 77.0, 'school': 6.0, 'apple': 142.0, 'did': 127.0, 'morning': 44.0, 'discarded': 146.0, 'julius': 72.0, 'she': 29.0, 'went': 11.0, 'where': 30.0, 'jeff': 152.0, 'square': 84.0, 'who': 153.0, 'tired': 124.0, 'there': 130.0, 'back': 12.0, 'lion': 70.0, 'are': 50.0, 'picked': 143.0, 'e,e': 119.0, 'pajamas': 129.0, 'Mary': 157.0, 'blue': 83.0, 'what': 63.0, 'container': 98.0, 'rhino': 76.0, 'daniel': 31.0, 'bernhard': 67.0, 'milk,football': 172.0, 'above': 80.0, 'got': 136.0, 'emily': 60.0, 'red': 88.0, 'either': 3.0, 'sheep': 58.0, 'football': 137.0, 'jessica': 61.0, 'do': 106.0, 'Bill': 155.0, 'football,apple': 168.0, 'fred': 1.0, 'winona': 59.0, 'objects': 161.0, 'put': 147.0, 'kitchen': 17.0, 'box': 90.0, 'received': 154.0, 'journeyed': 25.0, 'of': 52.0, 'wolf': 62.0, 'afternoon': 47.0, 'or': 7.0, 'south': 112.0, 's,w': 114.0, 'afterwards': 32.0, 'sumit': 123.0, 'color': 75.0, 'julie': 23.0, 'one': 163.0, 'down': 148.0, 'nothing': 167.0, 'n,n': 113.0, 'right': 86.0, 's,s': 116.0, 'gertrude': 54.0, 'bathroom': 26.0, 'from': 109.0, 'west': 104.0, 'chocolates': 91.0, 'two': 165.0, 'frog': 66.0, '.': 9.0, 'cats': 57.0, 'apple,milk,football': 175.0, 'passed': 158.0, 'apple,football,milk': 176.0, 'white': 71.0, 'john': 35.0, 'was': 45.0, 'mary': 10.0, 'apple,football': 170.0, 'north': 103.0, 'n,w': 111.0, 'that': 28.0, 'park': 8.0, 'took': 141.0, 'chocolate': 101.0, 'carrying': 162.0, 'n,e': 120.0, 'mice': 49.0, 'travelled': 22.0, 'he': 33.0, 'none': 164.0, 'bored': 133.0, 'e,n': 117.0, None: 0, 'Jeff': 159.0, 'this': 43.0, 'inside': 93.0, 'bill': 16.0, 'up': 144.0, 'cat': 64.0, 'will': 125.0, 'below': 87.0, 'greg': 74.0, 'three': 166.0, 'suitcase': 97.0, 'following': 36.0, 'e,s': 115.0, 'and': 40.0, 'thirsty': 135.0, 'cinema': 19.0, 'is': 2.0, 'moved': 18.0, 'yann': 132.0, 'sphere': 89.0, 'dropped': 145.0, 'in': 4.0, 'mouse': 56.0, 'football,milk': 171.0, 'pink': 81.0, 'afraid': 51.0, 'no': 20.0, 'Fred': 156.0, 'w,s': 121.0, 'handed': 151.0, 'w,w': 118.0, 'brian': 69.0, 'chest': 94.0, 'w,n': 122.0, 'you': 107.0, 'many': 160.0, 'lily': 65.0, 'hallway': 34.0, 'why': 126.0, 'after': 27.0, 'yesterday': 42.0, 'sandra': 38.0, 'fits': 92.0, 'milk,football,apple': 173.0, 'the': 5.0, 'milk,apple': 169.0, 'a': 55.0, 'give': 150.0, 'longer': 177.0, 'maybe': 21.0, 'hungry': 140.0, 'apple,milk': 174.0, 'green': 68.0, 'wolves': 53.0, 'left': 85.0}
        self.ivocab = {0: None, 1: 'fred', 2: 'is', 3: 'either', 4: 'in', 5: 'the', 6: 'school', 7: 'or', 8: 'park', 9: '.', 10: 'mary', 11: 'went', 12: 'back', 13: 'to', 14: 'office', 15: 'yes', 16: 'bill', 17: 'kitchen', 18: 'moved', 19: 'cinema', 20: 'no', 21: 'maybe', 22: 'travelled', 23: 'julie', 24: 'bedroom', 25: 'journeyed', 26: 'bathroom', 27: 'after', 28: 'that', 29: 'she', 30: 'where', 31: 'daniel', 32: 'afterwards', 33: 'he', 34: 'hallway', 35: 'john', 36: 'following', 37: 'garden', 38: 'sandra', 39: 'then', 40: 'and', 41: 'they', 42: 'yesterday', 43: 'this', 44: 'morning', 45: 'was', 46: 'before', 47: 'afternoon', 48: 'evening', 49: 'mice', 50: 'are', 51: 'afraid', 52: 'of', 53: 'wolves', 54: 'gertrude', 55: 'a', 56: 'mouse', 57: 'cats', 58: 'sheep', 59: 'winona', 60: 'emily', 61: 'jessica', 62: 'wolf', 63: 'what', 64: 'cat', 65: 'lily', 66: 'frog', 67: 'bernhard', 68: 'green', 69: 'brian', 70: 'lion', 71: 'white', 72: 'julius', 73: 'swan', 74: 'greg', 75: 'color', 76: 'rhino', 77: 'gray', 78: 'yellow', 79: 'triangle', 80: 'above', 81: 'pink', 82: 'rectangle', 83: 'blue', 84: 'square', 85: 'left', 86: 'right', 87: 'below', 88: 'red', 89: 'sphere', 90: 'box', 91: 'chocolates', 92: 'fits', 93: 'inside', 94: 'chest', 95: 'bigger', 96: 'than', 97: 'suitcase', 98: 'container', 99: 'does', 100: 'fit', 101: 'chocolate', 102: 'east', 103: 'north', 104: 'west', 105: 'how', 106: 'do', 107: 'you', 108: 'go', 109: 'from', 110: 's,e', 111: 'n,w', 112: 'south', 113: 'n,n', 114: 's,w', 115: 'e,s', 116: 's,s', 117: 'e,n', 118: 'w,w', 119: 'e,e', 120: 'n,e', 121: 'w,s', 122: 'w,n', 123: 'sumit', 124: 'tired', 125: 'will', 126: 'why', 127: 'did', 128: 'grabbed', 129: 'pajamas', 130: 'there', 131: 'get', 132: 'yann', 133: 'bored', 134: 'jason', 135: 'thirsty', 136: 'got', 137: 'football', 138: 'antoine', 139: 'milk', 140: 'hungry', 141: 'took', 142: 'apple', 143: 'picked', 144: 'up', 145: 'dropped', 146: 'discarded', 147: 'put', 148: 'down', 149: 'gave', 150: 'give', 151: 'handed', 152: 'jeff', 153: 'who', 154: 'received', 155: 'Bill', 156: 'Fred', 157: 'Mary', 158: 'passed', 159: 'Jeff', 160: 'many', 161: 'objects', 162: 'carrying', 163: 'one', 164: 'none', 165: 'two', 166: 'three', 167: 'nothing', 168: 'football,apple', 169: 'milk,apple', 170: 'apple,football', 171: 'football,milk', 172: 'milk,football', 173: 'milk,football,apple', 174: 'apple,milk', 175: 'apple,milk,football', 176: 'apple,football,milk', 177: 'longer', 178: 'not', 179: 'football,apple,milk'}
        #"""

        self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(
            babi_train_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(
            babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.imatrix('input_var')
        self.q_var = T.ivector('question_var')
        self.answer_var = T.iscalar('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')

        self.attentions = []

        self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len)
        self.pe_matrix_q = self.pe_matrix(self.max_q_len)

        print "==> building input module"

        #positional encoder weights
        self.W_pe = nn_utils.normal_param(std=0.1,
                                          shape=(self.vocab_size, self.dim))

        #biGRU input fusion weights
        self.W_inp_res_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_res_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_upd_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_hid_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_res_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_res_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_upd_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_hid_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.inp_sent_reps, _ = theano.scan(fn=self.sum_pos_encodings_in,
                                            sequences=self.input_var)

        self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps)

        self.inp_c = self.input_module_full(self.inp_sent_reps)

        self.q_q = self.sum_pos_encodings_q(self.q_var)

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 4 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle(('x', 0))

        net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net)[0]

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # add conditional ending?
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))

            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        print "==> collecting all parameters"
        self.params = [
            self.W_pe,
            self.W_inp_res_in_fwd,
            self.W_inp_res_hid_fwd,
            self.b_inp_res_fwd,
            self.W_inp_upd_in_fwd,
            self.W_inp_upd_hid_fwd,
            self.b_inp_upd_fwd,
            self.W_inp_hid_in_fwd,
            self.W_inp_hid_hid_fwd,
            self.b_inp_hid_fwd,
            self.W_inp_res_in_bwd,
            self.W_inp_res_hid_bwd,
            self.b_inp_res_bwd,
            self.W_inp_upd_in_bwd,
            self.W_inp_upd_hid_bwd,
            self.b_inp_upd_bwd,
            self.W_inp_hid_in_bwd,
            self.W_inp_hid_hid_bwd,
            self.b_inp_hid_bwd,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(
            self.prediction.dimshuffle('x', 0), T.stack([self.answer_var]))[0]

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adam(self.loss,
                                       self.params,
                                       learning_rate=0.0001,
                                       beta1=0.5)  #from DCGAN paper

        self.attentions = T.stack(self.attentions)
        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var
                ],
                outputs=[self.prediction, self.loss, self.attentions],
                updates=updates,
                on_unused_input='warn',
                allow_input_downcast=True)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var
            ],
            outputs=[self.prediction, self.loss, self.attentions],
            on_unused_input='warn',
            allow_input_downcast=True)
Ejemplo n.º 38
0
    dense_2 = DenseLayer(dense_1, num_units=n_input, nonlinearity=tanh)

    probs = DenseLayer(dense_2, num_units=n_output, nonlinearity=softmax)

    return probs


X_state = T.fmatrix()
X_action = T.bvector()
X_reward = T.fvector()

X_action_hot = to_one_hot(X_action, n_output)

prob_values = policy_network(X_state)

policy_ = get_output(prob_values)
policy = theano.function(inputs=[X_state],
                         outputs=policy_,
                         allow_input_downcast=True)

loss = categorical_crossentropy(policy_, X_action_hot) * X_reward
loss = loss.mean()

params = get_all_params(prob_values)

updates = adam(loss, params, learning_rate=learning_rate)

update_network = theano.function(inputs=[X_state, X_action, X_reward],
                                 outputs=loss,
                                 updates=updates,
                                 allow_input_downcast=True)
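A minimal rollout-step sketch (hypothetical; n_input and n_output are the sizes assumed by the truncated policy_network above):

import numpy as np

state = np.random.rand(1, n_input).astype('float32')                       # single state, shape (1, n_input)
action_probs = policy(state)[0]                                             # softmax action distribution
action = np.random.choice(n_output, p=action_probs / action_probs.sum())   # renormalise against float32 rounding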
Ejemplo n.º 39
0
def create_network(available_actions_count):
    # Create the input variables
    s1 = tensor.tensor4("State")
    a = tensor.vector("Action", dtype="int32")
    q2 = tensor.vector("Q2")
    r = tensor.vector("Reward")
    isterminal = tensor.vector("IsTerminal", dtype="int8")

    # Create the input layer of the network.
    dqn = InputLayer(shape=[None, 1, resolution[0], resolution[1]],
                     input_var=s1)

    # Add 2 convolutional layers with ReLu activation
    dqn = Conv2DLayer(dqn,
                      num_filters=8,
                      filter_size=[6, 6],
                      nonlinearity=rectify,
                      W=HeUniform("relu"),
                      b=Constant(.1),
                      stride=3)
    dqn = Conv2DLayer(dqn,
                      num_filters=8,
                      filter_size=[3, 3],
                      nonlinearity=rectify,
                      W=HeUniform("relu"),
                      b=Constant(.1),
                      stride=2)

    # Add a single fully-connected layer.
    dqn = DenseLayer(dqn,
                     num_units=128,
                     nonlinearity=rectify,
                     W=HeUniform("relu"),
                     b=Constant(.1))

    # Add the output layer (also fully-connected).
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_count, nonlinearity=None)

    # Define the loss function
    q = get_output(dqn)
    # target differs from q only for the selected action. The following means:
    # target_Q(s,a) = r + gamma * max Q(s2,_) if isterminal else r
    target_q = tensor.set_subtensor(
        q[tensor.arange(q.shape[0]), a],
        r + discount_factor * (1 - isterminal) * q2)
    loss = squared_error(q, target_q).mean()

    # Update the parameters according to the computed gradient using RMSProp.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compile the theano functions
    print("Compiling the network ...")
    function_learn = theano.function([s1, q2, a, r, isterminal],
                                     loss,
                                     updates=updates,
                                     name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1],
                                               tensor.argmax(q),
                                               name="test_fn")
    print("Network compiled.")

    def simple_get_best_action(state):
        return function_get_best_action(
            state.reshape([1, 1, resolution[0], resolution[1]]))

    # Returns Theano objects for the net and functions.
    return dqn, function_learn, function_get_q_values, simple_get_best_action
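A minimal Q-learning step sketch (hypothetical replay-memory minibatches s1, a, r, s2, isterminal with dtypes matching the declared tensors):

import numpy as np

q2 = np.max(function_get_q_values(s2), axis=1)        # max_a' Q(s2, a'); terminal transitions are masked in the loss
batch_loss = function_learn(s1, q2, a, r, isterminal)
best_action = simple_get_best_action(single_frame)    # greedy action for one preprocessed state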
Ejemplo n.º 40
0
def test_memory_cells(batch_size=3, seq_len=50, input_dim=8, n_hidden=16):
    # lasagne way
    l_in = InputLayer(
        (None, seq_len, input_dim),
        input_var=theano.shared(
            np.random.normal(size=[batch_size, seq_len, input_dim])),
        name='input seq')

    l_lstm0 = LSTMLayer(l_in, n_hidden, name='lstm')
    l_gru0 = GRULayer(l_in, n_hidden, name='gru')

    f_predict0 = theano.function([], get_output([l_lstm0, l_gru0]))

    # agentnet way
    s_in = InputLayer((None, input_dim), name='in')

    s_prev_cell = InputLayer((None, n_hidden), name='cell')
    s_prev_hid = InputLayer((None, n_hidden), name='hid')
    s_lstm_cell, s_lstm_hid = LSTMCell(s_prev_cell,
                                       s_prev_hid,
                                       s_in,
                                       name='lstm')

    s_prev_gru = InputLayer((None, n_hidden), name='hid')
    s_gru = GRUCell(s_prev_gru, s_in, name='gru')

    rec = Recurrence(state_variables=OrderedDict({
        s_lstm_cell: s_prev_cell,
        s_lstm_hid: s_prev_hid,
        s_gru: s_prev_gru
    }),
                     input_sequences={s_in: l_in},
                     unroll_scan=False)

    state_seqs, _ = rec.get_sequence_layers()

    l_lstm1 = state_seqs[s_lstm_hid]
    l_gru1 = state_seqs[s_gru]

    f_predict1 = theano.function([], get_output([l_lstm1, l_gru1]))

    # lstm param transfer
    old_params = sorted(get_all_params(l_lstm0, trainable=True),
                        key=lambda p: p.name)
    new_params = sorted(get_all_params(s_lstm_hid, trainable=True),
                        key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print old.name, '<-', new.name
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    # gru param transfer
    old_params = sorted(get_all_params(l_gru0, trainable=True),
                        key=lambda p: p.name)
    new_params = sorted(get_all_params(s_gru, trainable=True),
                        key=lambda p: p.name)

    for old, new in zip(old_params, new_params):
        print old.name, '<-', new.name
        assert tuple(old.shape.eval()) == tuple(new.shape.eval())
        old.set_value(new.get_value())

    lstm0_out, gru0_out = f_predict0()
    lstm1_out, gru1_out = f_predict1()

    assert np.allclose(lstm0_out, lstm1_out)
    assert np.allclose(gru0_out, gru1_out)
Ejemplo n.º 41
0
    def __init__(
        self,
        env_spec,
        latent_dim=2,
        latent_name='bernoulli',
        bilinear_integration=False,
        resample=False,
        hidden_sizes=(32, 32),
        learn_std=True,
        init_std=1.0,
        adaptive_std=False,
        std_share_network=False,
        std_hidden_sizes=(32, 32),
        std_hidden_nonlinearity=NL.tanh,
        hidden_nonlinearity=NL.tanh,
        output_nonlinearity=None,
        min_std=1e-4,
    ):
        """
        :param latent_dim: dimension of the latent variables
        :param latent_name: distribution of the latent variables
        :param bilinear_integration: Boolean indicator of bilinear integration or simple concatenation
        :param resample: Boolean indicator of resampling at every step or only at the start of the rollout (or whenever
        agent is reset, which can happen several times along the rollout with rollout in utils_snn)
        """
        # for _ in range(10):
        #     print("init!")
        # print("initilizaer run!")
        self.latent_dim = latent_dim  ##could I avoid needing this self for the get_action?
        self.latent_name = latent_name
        self.bilinear_integration = bilinear_integration
        self.resample = resample
        self.min_std = min_std
        self.hidden_sizes = hidden_sizes

        self.pre_fix_latent = np.array(
            []
        )  # if this is not empty when using reset() it will use this latent
        self.latent_fix = np.array(
            [])  # this will hold the latents variable sampled in reset()
        self._set_std_to_0 = False

        if latent_name == 'normal':
            self.latent_dist = DiagonalGaussian(self.latent_dim)
            self.latent_dist_info = dict(mean=np.zeros(self.latent_dim),
                                         log_std=np.zeros(self.latent_dim))
        elif latent_name == 'bernoulli':
            self.latent_dist = Bernoulli(self.latent_dim)
            self.latent_dist_info = dict(p=0.5 * np.ones(self.latent_dim))
        elif latent_name == 'categorical':
            self.latent_dist = Categorical(self.latent_dim)
            if self.latent_dim > 0:
                self.latent_dist_info = dict(prob=1. / self.latent_dim *
                                             np.ones(self.latent_dim))
            else:
                self.latent_dist_info = dict(prob=np.ones(self.latent_dim))
        else:
            raise NotImplementedError

        Serializable.quick_init(self, locals())
        assert isinstance(env_spec.action_space, Box)

        if self.bilinear_integration:
            obs_dim = env_spec.observation_space.flat_dim + latent_dim +\
                      env_spec.observation_space.flat_dim * latent_dim
        else:
            obs_dim = env_spec.observation_space.flat_dim + latent_dim  # here only if concat.

        action_dim = env_spec.action_space.flat_dim

        mean_network = MLP(
            input_shape=(obs_dim, ),
            output_dim=action_dim,
            hidden_sizes=hidden_sizes,
            hidden_nonlinearity=hidden_nonlinearity,
            output_nonlinearity=output_nonlinearity,
            name="meanMLP",
        )

        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            l_log_std = MLP(input_shape=(obs_dim, ),
                            input_var=obs_var,
                            output_dim=action_dim,
                            hidden_sizes=std_hidden_sizes,
                            hidden_nonlinearity=std_hidden_nonlinearity,
                            output_nonlinearity=None,
                            name="log_stdMLP").output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        mean_var, log_std_var = L.get_output([l_mean, l_log_std])

        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))

        self._l_mean = l_mean
        self._l_log_std = l_log_std

        self._dist = DiagonalGaussian(action_dim)

        LasagnePowered.__init__(self, [l_mean, l_log_std])
        super(GaussianMLPPolicy_snn, self).__init__(env_spec)

        self._f_dist = ext.compile_function(
            inputs=[obs_var],
            outputs=[mean_var, log_std_var],
        )

        # Sy: load policy
        self._layers_mean = mean_network.layers
        l_mean = mean_network.output_layer
        obs_var = mean_network.input_layer.input_var

        if adaptive_std:
            log_std_network = MLP(input_shape=(obs_dim, ),
                                  input_var=obs_var,
                                  output_dim=action_dim,
                                  hidden_sizes=std_hidden_sizes,
                                  hidden_nonlinearity=std_hidden_nonlinearity,
                                  output_nonlinearity=None,
                                  name="log_stdMLP")
            l_log_std = log_std_network.output_layer
            self._layers_log_std = log_std_network.layers
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=action_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )
            self._layers_log_std = [l_log_std]
        self._layers_snn = self._layers_mean + self._layers_log_std  # this returns a list with the "snn" layers
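The bilinear_integration branch above sizes the policy input as obs_dim + latent_dim + obs_dim * latent_dim, i.e. the observation, the latent and their flattened outer product; a minimal numeric sketch (hypothetical dimensions):

import numpy as np

obs = np.array([1.0, 2.0])                  # obs_dim = 2
latent = np.array([0.0, 1.0, 0.0])          # latent_dim = 3
bilinear = np.concatenate([obs, latent, np.outer(obs, latent).flatten()])
assert bilinear.shape[0] == 2 + 3 + 2 * 3   # obs_dim + latent_dim + obs_dim * latent_dim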
Ejemplo n.º 42
0
    def __init__(self, train_raw, dev_raw, test_raw, word2vec,
                 word_vector_size, answer_module, dim, mode, input_mask_mode,
                 memory_hops, l2, normalize_attention, dropout, **kwargs):
        print "generate sentence answer for mctest"
        print "==> not used params in DMN class:", kwargs.keys()
        self.word2vec = word2vec
        self.word_vector_size = word_vector_size
        # add eng_of_sentence tag for answer generation
        #self.end_tag = len(word2vec)
        #self.vocab_size = self.end_tag+1
        self.vocab_size = len(word2vec)

        self.dim = dim  # hidden state size
        self.mode = mode
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.answer_module = answer_module
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.dropout = dropout

        self.train_input, self.train_q, self.train_answer, self.train_input_mask, self.train_max_n = self._process_input(
            train_raw)
        self.dev_input, self.dev_q, self.dev_answer, self.dev_input_mask, self.dev_max_n = self._process_input(
            dev_raw)
        self.test_input, self.test_q, self.test_answer, self.test_input_mask, self.test_max_n = self._process_input(
            test_raw)

        self.input_var = T.matrix('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.ivector('answer_var')
        self.input_mask_var = T.ivector('input_mask_var')
        self.max_n = T.iscalar('max_n')

        self.attentions = []

        print "==> building input module"
        self.W_inp_res_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_upd_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_inp_hid_in = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.word_vector_size))
        self.W_inp_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_inp_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        inp_c_history, _ = theano.scan(fn=self.input_gru_step,
                                       sequences=self.input_var,
                                       outputs_info=T.zeros_like(
                                           self.b_inp_hid))

        self.inp_c = inp_c_history.take(self.input_mask_var, axis=0)

        self.q_q, _ = theano.scan(fn=self.input_gru_step,
                                  sequences=self.q_var,
                                  outputs_info=T.zeros_like(self.b_inp_hid))

        self.q_q = self.q_q[-1]

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0, shape=(self.dim, ))

        self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
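        # The 7 * dim + 2 input size of W_1 presumably matches the DMN attention
        # feature vector z(c, m, q) = [c, m, q, c * q, c * m, |c - q|, |c - m|,
        # c^T W_b q, c^T W_b m]: seven dim-sized vectors plus two scalars.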
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.dim, 7 * self.dim + 2))
        self.W_2 = nn_utils.normal_param(std=0.1, shape=(1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0, shape=(self.dim, ))
        self.b_2 = nn_utils.constant_param(value=0.0, shape=(1, ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in, self.W_mem_res_hid,
                                self.b_mem_res, self.W_mem_upd_in,
                                self.W_mem_upd_hid, self.b_mem_upd,
                                self.W_mem_hid_in, self.W_mem_hid_hid,
                                self.b_mem_hid))

        last_mem_raw = memory[-1].dimshuffle(('x', 0))

        net = layers.InputLayer(shape=(1, self.dim), input_var=last_mem_raw)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net)[0]
        self.attentions = T.stack(self.attentions)

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1,
                                         shape=(self.vocab_size, self.dim))

        if self.answer_module == 'feedforward':
            self.prediction = nn_utils.softmax(T.dot(self.W_a, last_mem))
            self.prediction = self.prediction.dimshuffle('x', 0)

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [
                    a, y
                ]  #, theano.scan_module.until(n>=max_n)) # or argmax==self.end_tag)

            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))
            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=self.max_n)
            self.prediction = results[1]
        else:
            raise Exception("invalid answer_module")

        print "==> collecting all parameters"
        self.params = [
            self.W_inp_res_in, self.W_inp_res_hid, self.b_inp_res,
            self.W_inp_upd_in, self.W_inp_upd_hid, self.b_inp_upd,
            self.W_inp_hid_in, self.W_inp_hid_hid, self.b_inp_hid,
            self.W_mem_res_in, self.W_mem_res_hid, self.b_mem_res,
            self.W_mem_upd_in, self.W_mem_upd_hid, self.b_mem_upd,
            self.W_mem_hid_in, self.W_mem_hid_hid, self.b_mem_hid, self.W_b,
            self.W_1, self.W_2, self.b_1, self.b_2, self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(self.prediction,
                                                       self.answer_var).sum()

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = self.loss_ce + self.loss_l2

        updates = lasagne.updates.adam(self.loss, self.params)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)

        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var,
                    self.input_mask_var, self.max_n
                ],
                allow_input_downcast=True,
                outputs=[self.prediction, self.loss],
                updates=updates)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[
                self.input_var, self.q_var, self.answer_var,
                self.input_mask_var, self.max_n
            ],
            allow_input_downcast=True,
            outputs=[self.prediction, self.loss, self.attentions])
Ejemplo n.º 43
0
    def compile_theano_functions(self, data_type='2D'):
        assert self.net is not None

        ### symbolic theano input
        theano_args = OrderedDict()
        dim = len(self.cf.dim)

        if data_type == '2D':
            assert dim == 2
            theano_args['X'] = T.tensor4()
            theano_args['y'] = T.dmatrix()
            self.logger.info('Net: Working with 2D data.')

        elif data_type == '3D':
            assert dim == 3
            theano_args['X'] = T.tensor5()
            theano_args['y'] = T.ivector()
            self.logger.info('Net: Working with 3D data.')

        val_args = deepcopy(theano_args)
        train_args = deepcopy(theano_args)
        train_args['lr'] = T.scalar(name='lr')

        ### prediction functions

        # get softmax prediction of shape (b, classes)
        prediction_train = get_output(self.net[self.cf.out_layer],
                                      train_args['X'],
                                      deterministic=False)
        prediction_val = get_output(self.net[self.cf.out_layer],
                                    val_args['X'],
                                    deterministic=True)

        self.predict['train'] = theano.function([train_args['X']],
                                                prediction_train)
        self.predict['val'] = theano.function([val_args['X']], prediction_val)

        ### l2 loss
        self.loss['train'] = squared_error(prediction_train,
                                           train_args['y']).mean()
        self.loss['val'] = squared_error(prediction_val, val_args['y']).mean()

        if self.cf.use_weight_decay:
            training_loss = self.loss['train'] +\
                self.cf.weight_decay * lasagne.regularization.regularize_network_params(self.net[self.cf.out_layer],
                             lasagne.regularization.l2)
            self.logger.info('Net: Using weight decay of {}.'.format(
                self.cf.weight_decay))
        else:
            training_loss = self.loss['train']

        ### accuracy
        # train_acc = T.mean(T.eq(T.argmax(prediction_train_smax, axis=1), train_args['y']))
        # val_acc = T.mean(T.eq(T.argmax(prediction_val_smax, axis=1), val_args['y']))

        ### training functions
        params = get_all_params(self.net[self.cf.out_layer], trainable=True)
        grads = theano.grad(training_loss, params)
        updates = adam(grads, params, learning_rate=train_args['lr'])

        self.train_fn = theano.function(train_args.values(),
                                        [self.loss['train'], prediction_train],
                                        updates=updates)
        self.val_fn = theano.function(val_args.values(),
                                      [self.loss['val'], prediction_val])

        self.logger.info('Net: Compiled theano functions.')
Ejemplo n.º 44
0
Archivo: DRHN.py Proyecto: yuan776/DRHN
def main(n=5, k=12, num_epochs=50, model=None):
    # Check if cifar data exists
    print("n= ", n, " k= ", k)

    if not os.path.exists("./cifar-10-batches-py"):
        print(
            "CIFAR-10 dataset can not be found. Please download the dataset from 'https://www.cs.toronto.edu/~kriz/cifar.html'."
        )
        return

    # Load the dataset
    print("Loading data...")
    data = load_data()
    X_train = data['X_train']
    Y_train = data['Y_train']
    X_test = data['X_test']
    Y_test = data['Y_test']

    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')

    # Create neural network model
    print("Building model and compiling functions...")
    network = build_cnn(input_var, n, k)
    print("number of parameters in model: %d" %
          lasagne.layers.count_params(network, trainable=True))

    if model is None:
        # Create a loss expression for training, i.e., a scalar objective we want
        # to minimize (for our multi-class problem, it is the cross-entropy loss):
        prediction = lasagne.layers.get_output(network)
        loss = lasagne.objectives.categorical_crossentropy(
            prediction, target_var)
        loss = loss.mean()
        # add weight decay
        all_layers = lasagne.layers.get_all_layers(network)
        l2_penalty = lasagne.regularization.regularize_layer_params(
            all_layers, lasagne.regularization.l2) * 0.0001
        loss = loss + l2_penalty

        # Create update expressions for training
        # Stochastic Gradient Descent (SGD) with momentum
        params = lasagne.layers.get_all_params(network, trainable=True)
        lr = 0.1
        sh_lr = theano.shared(lasagne.utils.floatX(lr))
        updates = lasagne.updates.momentum(loss,
                                           params,
                                           learning_rate=sh_lr,
                                           momentum=0.9)

        # Compile a function performing a training step on a mini-batch (by giving
        # the updates dictionary) and returning the corresponding training loss:
        train_fn = theano.function([input_var, target_var],
                                   loss,
                                   updates=updates)

    # Create a loss expression for validation/testing
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    ###function for prediction
    predict_class = theano.function(inputs=[input_var],
                                    outputs=test_prediction)

    ####function to generate the related hash-code for the images
    layers = lasagne.layers.get_all_layers(network)
    for l in layers:
        if l.name == 'hash_layer':
            hash_layer_out = l
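
    # Binarise the hash-layer activations: values <= 0.5 map to 0, everything
    # else to 1 (this is what the T.switch expression below computes).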

    prediction_hash = T.switch(
        T.le(get_output(hash_layer_out, input_var), 0.5), 0., 1.)
    predict_hash = theano.function([input_var], prediction_hash)

    #####################1. TRAINING OR LOADING THE MODEL ########################
    if model is None:
        validation_loss = 10
        # launch the training loop
        print("Starting training...")
        # We iterate over epochs:
        for epoch in range(num_epochs):
            # shuffle training data
            train_indices = np.arange(100000)
            np.random.shuffle(train_indices)
            X_train = X_train[train_indices, :, :, :]
            Y_train = Y_train[train_indices]

            # In each epoch, we do a full pass over the training data:
            train_err = 0
            train_batches = 0
            start_time = time.time()
            for batch in iterate_minibatches(X_train,
                                             Y_train,
                                             128,
                                             shuffle=True,
                                             augment=True):
                inputs, targets = batch
                train_err += train_fn(inputs, targets)
                train_batches += 1

            # And a full pass over the validation data:
            val_err = 0
            val_acc = 0
            val_batches = 0
            for batch in iterate_minibatches(X_test,
                                             Y_test,
                                             500,
                                             shuffle=False):
                inputs, targets = batch
                err, acc = val_fn(inputs, targets)
                val_err += err
                val_acc += acc
                val_batches += 1

            # Then we print the results for this epoch:
            print("Epoch {} of {} took {:.3f}s".format(
                epoch + 1, num_epochs,
                time.time() - start_time))
            print("  training loss:\t\t{:.6f}".format(train_err /
                                                      train_batches))
            print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
            if val_err / val_batches < validation_loss:
                validation_loss = val_err / val_batches
                np.savez(
                    'cifar10_deep_residual_hashing_n' + str(n) + '_k' +
                    str(k) + '.npz',
                    *lasagne.layers.get_all_param_values(network))
                print("guardando modelo...")
            print("  validation accuracy:\t\t{:.2f} %".format(
                val_acc / val_batches * 100))

            # adjust learning rate as in paper
            # 32k and 48k iterations should be roughly equivalent to 41 and 61 epochs
            if (epoch + 1) == 41 or (epoch + 1) == 61:
                new_lr = sh_lr.get_value() * 0.1
                print("New LR:" + str(new_lr))
                sh_lr.set_value(lasagne.utils.floatX(new_lr))

        # dump the network weights to a file :
        #np.savez('cifar10_deep_residual_model.npz', *lasagne.layers.get_all_param_values(network))
    else:
        # load network weights from model file
        print('Loading MODEL pre-trained')
        with np.load(model) as f:
            param_values = [f['arr_%d' % i] for i in range(len(f.files))]
        lasagne.layers.set_all_param_values(network, param_values)

    # Calculate validation error of model (defined before its first use):
    def validation_of_the_model():
        test_err = 0
        test_acc = 0
        test_batches = 0
        for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            test_err += err
            test_acc += acc
            test_batches += 1
        print("Final results:")
        print("  test loss:\t\t\t{:.6f}".format(test_err / test_batches))
        print("  test accuracy:\t\t{:.2f} %".format(test_acc / test_batches *
                                                    100))

    if model is not None:
        # Evaluate the loaded pre-trained weights.
        validation_of_the_model()

    #####################2. GENERATION OF THE CODES ##########################
    def save_obj(obj, name):
        with open(name + '.pkl', 'wb') as f:
            pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

    def load_obj(name):
        with open(name + '.pkl', 'rb') as f:
            return pickle.load(f)

    hashes_training = []
    index = 0

    #generating codes for training images
    for batch in iterate_minibatches(X_train, Y_train, 500, shuffle=False):
        inputs, targets = batch
        pred = predict_hash(inputs).astype(int)

        for element in pred:
            hashes_training.append((index, element, Y_train[index]))
            index += 1
    save_obj(hashes_training,
             'cifar10_n' + str(n) + '_hash' + str(k) + 'k_codes')

    hashes_testing = []
    index = 0
    #generating codes for testing images
    for batch in iterate_minibatches(X_test, Y_test, 500, shuffle=False):
        inputs, targets = batch
        pred = predict_hash(inputs).astype(int)

        for element in pred:
            hashes_testing.append((index, element, Y_test[index]))
            index += 1
    save_obj(hashes_testing,
             'cifar10_n' + str(n) + '_hash' + str(k) + 'k_test_codes')

    ########################3. MAP evaluation #############################
    #for all elements in test_examples (should be a list of len=10000)
    #MAP evaluation as shown in https://github.com/kevinlin311tw/caffe-cvprw15
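    # Worked example (illustrative only, not from the original code): if
    # buffer_yes = [1, 0, 1] for the top k_ = 3 results, then
    # P = cumsum(buffer_yes) / NS = [1.0, 0.5, 0.667] and
    # AP = sum(P * buffer_yes) / sum(buffer_yes) = (1.0 + 0.667) / 2 ~= 0.83.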
    k_ = 1000
    NS = np.arange(1, k_ + 1)
    sum_tp = np.zeros(len(NS))
    QueryTimes = 10000
    AP = np.zeros(QueryTimes)

    index_of_query = 0
    for image_test in hashes_testing:
        for index in range(len(hashes_training)):
            hashes_training[index] = (
                hashes_training[index][0], hashes_training[index][1],
                hashes_training[index][2],
                np.count_nonzero(image_test[1] != hashes_training[index][1])
            )  #hamming2(image_test[1],hashes_training[index][1]))

        from operator import itemgetter
        hashes_training.sort(key=itemgetter(3))

        # begin scoring this query
        buffer_yes = np.zeros(k_)
        total_relevant = 0
        for i in range(k_):
            # if the label matches, count it as relevant
            if hashes_training[i][2] == image_test[2]:
                buffer_yes[i] = 1
                total_relevant += 1
        #print (total_relevant)
        P = np.divide(np.cumsum(buffer_yes), NS, dtype=float)
        if np.sum(buffer_yes, axis=0) == 0:
            AP[index_of_query] = 0
        else:
            AP[index_of_query] = np.sum(np.multiply(P, buffer_yes),
                                        axis=0) / np.sum(buffer_yes, axis=0)
        #print (index_of_query, AP[index_of_query])
        sum_tp = sum_tp + np.cumsum(buffer_yes)
        index_of_query += 1

    precision_at_k = np.divide(sum_tp, NS * QueryTimes)
    map_ = np.mean(AP)
    print('precision_at_k', precision_at_k)  # array of values
    save_obj(precision_at_k, 'precision_at_k_n' + str(n) + '_k' + str(k))
    print('map', map_)  # numeric value
    save_obj(map_, 'map_n' + str(n) + '_k' + str(k))
    print('n' + str(n) + 'k' + str(k))
Ejemplo n.º 45
0
# fixed random seeds
rng_data = np.random.RandomState(args.seed_data)
rng = np.random.RandomState(args.seed)
theano_rng = MRG_RandomStreams(rng.randint(2**15))
lasagne.random.set_rng(np.random.RandomState(rng.randint(2**15)))

# load CIFAR-10
test_matched, test_unmatched = get_data_patches_test(args.test_data,
                                                     args.data_dir)

trainx = get_data_patches_training(args.data_name, args.data_dir)
trainx = trainx[rng.permutation(trainx.shape[0])]

# specify generative model
gen_layers = get_generator(args.batch_size, theano_rng)
gen_dat = ll.get_output(gen_layers[-1])

# specify discriminative model
disc_layers, f_low_dim, _ = get_discriminator_brown(args.num_features)

load_model(gen_layers, args.generator_out)
load_model(disc_layers, args.discriminator_out)

x_temp = T.tensor4()

# Test generator in sampling procedure
samplefun = th.function(inputs=[], outputs=gen_dat)
sample_x = []
for k in range(20):
    sample_x.append(samplefun())
sample_x = np.concatenate(sample_x, axis=0)
Ejemplo n.º 46
0
    def __init__(self,
                 env_spec,
                 hidden_dim=32,
                 feature_network=None,
                 state_include_action=True,
                 hidden_nonlinearity=NL.tanh):
        """
        :param env_spec: A spec for the env.
        :param hidden_dim: dimension of hidden layer
        :param hidden_nonlinearity: nonlinearity used for each hidden layer
        :return:
        """
        assert isinstance(env_spec.action_space, Discrete)
        Serializable.quick_init(self, locals())
        super(CategoricalGRUPolicy, self).__init__(env_spec)

        obs_dim = env_spec.observation_space.flat_dim
        action_dim = env_spec.action_space.flat_dim

        if state_include_action:
            input_dim = obs_dim + action_dim
        else:
            input_dim = obs_dim

        l_input = L.InputLayer(shape=(None, None, input_dim), name="input")

        if feature_network is None:
            feature_dim = input_dim
            l_flat_feature = None
            l_feature = l_input
        else:
            feature_dim = feature_network.output_layer.output_shape[-1]
            l_flat_feature = feature_network.output_layer
            l_feature = OpLayer(
                l_flat_feature,
                extras=[l_input],
                name="reshape_feature",
                op=lambda flat_feature, input: TT.reshape(
                    flat_feature,
                    [input.shape[0], input.shape[1], feature_dim]),
                shape_op=lambda _, input_shape:
                (input_shape[0], input_shape[1], feature_dim))

        prob_network = GRUNetwork(input_shape=(feature_dim, ),
                                  input_layer=l_feature,
                                  output_dim=env_spec.action_space.n,
                                  hidden_dim=hidden_dim,
                                  hidden_nonlinearity=hidden_nonlinearity,
                                  output_nonlinearity=TT.nnet.softmax,
                                  name="prob_network")

        self.prob_network = prob_network
        self.feature_network = feature_network
        self.l_input = l_input
        self.state_include_action = state_include_action

        flat_input_var = TT.matrix("flat_input")
        if feature_network is None:
            feature_var = flat_input_var
        else:
            feature_var = L.get_output(
                l_flat_feature, {feature_network.input_layer: flat_input_var})

        self.f_step_prob = ext.compile_function(
            [flat_input_var, prob_network.step_prev_hidden_layer.input_var],
            L.get_output([
                prob_network.step_output_layer, prob_network.step_hidden_layer
            ], {prob_network.step_input_layer: feature_var}))

        self.input_dim = input_dim
        self.action_dim = action_dim
        self.hidden_dim = hidden_dim

        self.prev_action = None
        self.prev_hidden = None
        self.dist = RecurrentCategorical(env_spec.action_space.n)

        out_layers = [prob_network.output_layer]
        if feature_network is not None:
            out_layers.append(feature_network.output_layer)

        LasagnePowered.__init__(self, out_layers)
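A minimal construction sketch (assumed, not from the original file); `env` is a hypothetical rllab-style environment whose `spec` exposes a Discrete action space:

# Hypothetical usage of the recurrent categorical policy defined above.
policy = CategoricalGRUPolicy(env_spec=env.spec, hidden_dim=32,
                              state_include_action=True)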
Ejemplo n.º 47
0
    ffn3 = icnn3

    ffn4 = LL.ConcatLayer([inp, ffn1, ffn2, ffn3], axis=1, cropping=None)

    ffn = LL.DenseLayer(ffn4, nclasses, nonlinearity=utils_lasagne.log_softmax)
    return ffn




inp = LL.InputLayer(shape=(None, nin))
patch_op = LL.InputLayer(input_var=Tsp.csc_fmatrix('patch_op'), shape=(None, None))
print(patch_op.shape[0])
ffn = get_model(inp, patch_op)

output = LL.get_output(ffn)
pred = LL.get_output(ffn, deterministic=True)

target = T.ivector('idxs')
cla = utils_lasagne.categorical_crossentropy_logdomain(output, target, nclasses).mean()
acc = LO.categorical_accuracy(pred, target).mean()
regL2 = L.regularization.regularize_network_params(ffn, L.regularization.l2)


cost = cla + l2_weight * regL2
params = LL.get_all_params(ffn, trainable=True)
grads = T.grad(cost, params)
grads_norm = T.nlinalg.norm(T.concatenate([g.flatten() for g in grads]), 2)
updates = L.updates.adam(grads, params, learning_rate=0.001)
funcs = dict()
Ejemplo n.º 48
0
nb_valid_batch = 4
batch_size = 5

######################
# Building the model #
######################

# Symbolic variables
x = T.tensor4('x', dtype=theano.config.floatX)

# Creating the model
model = build_model2(input_var=x)
with np.load(data_path + 'best_cnn_model.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
layers.set_all_param_values(model, param_values)
output = layers.get_output(model, deterministic=True)

# Creating theano function
predict_target = theano.function(
    [x],
    output,
    allow_input_downcast=True,
)

######################
# Predict the target #
######################

for i in range(nb_valid_batch):
    input, target = get_image(data_path, valid_input_path, valid_target_path,
                              str(i))
Ejemplo n.º 49
0
    def dist_info_sym(self, obs_var, state_info_var=None):
        mean_var, log_std_var = L.get_output([self._l_mean, self._l_log_std],
                                             obs_var)
        if self.min_std is not None:
            log_std_var = TT.maximum(log_std_var, np.log(self.min_std))
        return dict(mean=mean_var, log_std=log_std_var)
Ejemplo n.º 50
0
    def __init__(self,
                 input_vars,
                 target_vars,
                 l_out,
                 loss,
                 optimizer,
                 learning_rate=0.001,
                 id=None):
        if not isinstance(input_vars, Sequence):
            raise ValueError(
                'input_vars should be a sequence, instead got %s' %
                (input_vars, ))
        if not isinstance(target_vars, Sequence):
            raise ValueError(
                'target_vars should be a sequence, instead got %s' %
                (target_vars, ))

        self.get_options()

        self.input_vars = input_vars
        self.l_out = l_out
        self.loss = loss
        self.optimizer = optimizer
        self.id = id
        id_tag = (self.id + '/') if self.id else ''
        id_tag_log = (self.id + ': ') if self.id else ''

        if self.options.verbosity >= 6:
            output_model_structure(l_out)

        params = self.params()
        (monitored, train_loss_grads,
         synth_vars) = self.get_train_loss(target_vars, params)
        self.monitored_tags = monitored.keys()

        if self.options.true_grad_clipping:
            scaled_grads = total_norm_constraint(
                train_loss_grads, self.options.true_grad_clipping)
        else:
            scaled_grads = train_loss_grads

        updates = optimizer(scaled_grads, params, learning_rate=learning_rate)
        self.optimizer_vars = [var for var in updates if var not in params]
        if not self.options.no_nan_suppression:
            # TODO: print_mode='all' somehow is always printing, even when
            # there are no NaNs. But tests are passing, even on GPU!
            updates = apply_nan_suppression(updates, print_mode='none')

        if self.options.detect_nans:
            mode = MonitorMode(post_func=detect_nan)
        else:
            mode = None

        if self.options.verbosity >= 2:
            print(id_tag_log + 'Compiling training function')
        params = input_vars + target_vars + synth_vars
        if self.options.verbosity >= 6:
            print('params = %s' % (params, ))
        self.train_fn = theano.function(params,
                                        monitored.values(),
                                        updates=updates,
                                        mode=mode,
                                        name=id_tag + 'train',
                                        on_unused_input='warn')
        if self.options.run_dir and not self.options.no_graphviz:
            self.visualize_graphs({'loss': monitored['loss']},
                                  out_dir=self.options.run_dir)

        test_prediction = get_output(l_out, deterministic=True)
        if self.options.verbosity >= 2:
            print(id_tag_log + 'Compiling prediction function')
        if self.options.verbosity >= 6:
            print('params = %s' % (input_vars, ))
        self.predict_fn = theano.function(input_vars,
                                          test_prediction,
                                          mode=mode,
                                          name=id_tag + 'predict',
                                          on_unused_input='ignore')

        if self.options.run_dir and not self.options.no_graphviz:
            self.visualize_graphs({'test_prediction': test_prediction},
                                  out_dir=self.options.run_dir)
Ejemplo n.º 51
0
    def __init__(
        self,
        input_shape,
        output_dim,
        prob_network=None,
        hidden_sizes=(32, 32),
        hidden_nonlinearity=NL.rectify,
        optimizer=None,
        use_trust_region=True,
        step_size=0.01,
        normalize_inputs=True,
        name=None,
    ):
        """
        :param input_shape: Shape of the input data.
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        """
        Serializable.quick_init(self, locals())

        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer()
            else:
                optimizer = LbfgsOptimizer()

        self.output_dim = output_dim
        self._optimizer = optimizer

        if prob_network is None:
            prob_network = MLP(
                input_shape=input_shape,
                output_dim=output_dim,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=NL.softmax,
            )

        l_prob = prob_network.output_layer

        LasagnePowered.__init__(self, [l_prob])

        xs_var = prob_network.input_layer.input_var
        ys_var = TT.imatrix("ys")
        old_prob_var = TT.matrix("old_prob")

        x_mean_var = theano.shared(np.zeros((1, ) + input_shape),
                                   name="x_mean",
                                   broadcastable=(True, ) +
                                   (False, ) * len(input_shape))
        x_std_var = theano.shared(np.ones((1, ) + input_shape),
                                  name="x_std",
                                  broadcastable=(True, ) +
                                  (False, ) * len(input_shape))

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var

        prob_var = L.get_output(l_prob,
                                {prob_network.input_layer: normalized_xs_var})

        old_info_vars = dict(prob=old_prob_var)
        info_vars = dict(prob=prob_var)

        dist = self._dist = Categorical(output_dim)

        mean_kl = TT.mean(dist.kl_sym(old_info_vars, info_vars))

        loss = -TT.mean(dist.log_likelihood_sym(ys_var, info_vars))

        predicted = special.to_onehot_sym(TT.argmax(prob_var, axis=1),
                                          output_dim)

        self._f_predict = ext.compile_function([xs_var], predicted)
        self._f_prob = ext.compile_function([xs_var], prob_var)
        self._prob_network = prob_network
        self._l_prob = l_prob

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[prob_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [xs_var, ys_var, old_prob_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
Ejemplo n.º 52
0
gen_layers.append(
    nn.batch_norm(LL.DenseLayer(gen_layers[-1],
                                num_units=500,
                                nonlinearity=T.nnet.softplus),
                  g=None))
gen_layers.append(
    nn.batch_norm(LL.DenseLayer(gen_layers[-1],
                                num_units=500,
                                nonlinearity=T.nnet.softplus),
                  g=None))
gen_layers.append(
    nn.l2normalize(
        LL.DenseLayer(gen_layers[-1],
                      num_units=28**2,
                      nonlinearity=T.nnet.sigmoid)))
gen_dat = LL.get_output(gen_layers[-1], deterministic=False)

# specify supervised model
layers = [LL.InputLayer(shape=(None, 28**2))]
layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.3))
layers.append(nn.DenseLayer(layers[-1], num_units=1000))
layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
layers.append(nn.DenseLayer(layers[-1], num_units=500))
layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
layers.append(nn.DenseLayer(layers[-1], num_units=250))
layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
layers.append(nn.DenseLayer(layers[-1], num_units=250))
layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
layers.append(nn.DenseLayer(layers[-1], num_units=250))
layers.append(nn.GaussianNoiseLayer(layers[-1], sigma=0.5))
layers.append(
Ejemplo n.º 53
0
def define_model(input_var, **kwargs):
    """ Defines the model and returns (network, validation network output)
        
    -Return layers.get_output(final_layer_name) if validation network output and 
        train network output are the same
    
    -For example, return layers.get_output(final_layer_name, deterministic = true) 
        if there is a dropout layer
            
    -Use **kwargs to pass model specific parameters
    """

    conv1_filter_count = 100
    conv1_filter_size = 5
    pool1_size = 2

    n_dense_units = 3000

    batch_size = input_var.shape[0]
    image_size = 32
    after_conv1 = image_size
    after_pool1 = (after_conv1 + pool1_size - 1) // pool1_size

    input = layers.InputLayer(shape=(None, 3, image_size, image_size),
                              input_var=input_var)

    greyscale_input = our_layers.GreyscaleLayer(
        incoming=input,
        random_greyscale=True,
    )

    conv1 = layers.Conv2DLayer(
        incoming=greyscale_input,
        num_filters=conv1_filter_count,
        filter_size=conv1_filter_size,
        stride=1,
        pad='same',
        nonlinearity=lasagne.nonlinearities.sigmoid,
    )

    pool1 = layers.MaxPool2DLayer(
        incoming=conv1,
        pool_size=pool1_size,
        stride=pool1_size,
    )

    dense1 = layers.DenseLayer(
        incoming=pool1,
        num_units=n_dense_units,
        nonlinearity=lasagne.nonlinearities.rectify,
    )

    pre_unpool1 = layers.DenseLayer(
        incoming=dense1,
        num_units=conv1_filter_count * (after_pool1**2),
        nonlinearity=lasagne.nonlinearities.linear,
    )

    pre_unpool1 = layers.ReshapeLayer(
        incoming=pre_unpool1,
        shape=(batch_size, conv1_filter_count) + (after_pool1, after_pool1),
    )

    unpool1 = our_layers.Unpool2DLayer(
        incoming=pre_unpool1,
        kernel_size=pool1_size,
    )

    deconv1 = layers.Conv2DLayer(
        incoming=unpool1,
        num_filters=3,
        filter_size=conv1_filter_size,
        stride=1,
        pad='same',
        nonlinearity=lasagne.nonlinearities.sigmoid,
    )

    output = layers.ReshapeLayer(incoming=deconv1, shape=input_var.shape)

    return (output, layers.get_output(output))
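The docstring above mentions returning a deterministic output expression when the network contains stochastic layers. A minimal sketch of that variant (hypothetical layers and sizes, not from the original project), assuming the same `layers`/`lasagne` imports:

import lasagne
from lasagne import layers


def define_model_with_dropout(input_var):
    # Hypothetical example of the convention described in the docstring above:
    # with a dropout layer in the network, return the deterministic output
    # expression so that validation is not affected by the stochastic mask.
    network = layers.InputLayer(shape=(None, 3, 32, 32), input_var=input_var)
    network = layers.DenseLayer(network, num_units=256,
                                nonlinearity=lasagne.nonlinearities.rectify)
    network = layers.DropoutLayer(network, p=0.5)
    network = layers.DenseLayer(network, num_units=10,
                                nonlinearity=lasagne.nonlinearities.softmax)
    return (network, layers.get_output(network, deterministic=True))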
Ejemplo n.º 54
0
    def __init__(self,
                 feature_shape,
                 latent_size,
                 hidden_structure,
                 reconstruction_distribution=None,
                 number_of_reconstruction_classes=None,
                 use_count_sum=False,
                 use_batch_norm=False):

        self.use_count_sum = use_count_sum and \
            (reconstruction_distribution != "bernoulli")
        # Add warm-up to the model, weighting the KL-term gradually higher for each epoch

        self.use_batch_norm = use_batch_norm

        print("Setting up model.")
        print("    feature size: {}".format(feature_shape))
        print("    latent size: {}".format(latent_size))
        print("    hidden structure: {}".format(", ".join(
            map(str, hidden_structure))))
        if type(reconstruction_distribution) == str:
            print("    reconstruction distribution: " +
                  reconstruction_distribution)
        else:
            print("    reconstruction distribution: custom")
        if number_of_reconstruction_classes > 0:
            print(
                "    reconstruction classes: {}".format(
                    number_of_reconstruction_classes), " (including 0s)")
        if self.use_count_sum:
            print("    using count sums")
        if self.use_batch_norm:
            print("    using batch normalisation of each layer.")
        print("")

        # Setup

        super(VariationalAutoEncoderForCounts, self).__init__()

        self.feature_shape = feature_shape
        self.latent_size = latent_size
        self.hidden_structure = hidden_structure

        symbolic_x = T.matrix('x')  # counts
        symbolic_z = T.matrix('z')  # latent variable

        self.number_of_epochs_trained = 0

        symbolic_learning_rate = T.scalar("epsilon")
        symbolic_warm_up_weight = T.scalar("beta")

        self.learning_curves = {
            "training": {
                "LB": [],
                "ENRE": [],
                "KL": [],
                "KL_all": []
            },
            "validation": {
                "LB": [],
                "ENRE": [],
                "KL": []
            }
        }

        if reconstruction_distribution:

            if type(reconstruction_distribution) == str:
                if number_of_reconstruction_classes > 0:
                    reconstruction_distribution = "softmax_" + \
                        reconstruction_distribution
                    self.k_max = number_of_reconstruction_classes - 1
                    reconstruction_distribution = \
                        reconstruction_distributions[reconstruction_distribution]
                    reconstruction_distribution = \
                        reconstruction_distribution(self.k_max)
                else:
                    reconstruction_distribution = \
                        reconstruction_distributions[reconstruction_distribution]

            self.x_parameters = reconstruction_distribution["parameters"]
            self.reconstruction_activation_functions = \
                reconstruction_distribution["activation functions"]

            self.expectedNegativeReconstructionError = \
                reconstruction_distribution["function"]
            self.meanOfReconstructionDistribution = reconstruction_distribution[
                "mean"]
            self.preprocess = reconstruction_distribution["preprocess"]
        else:
            reconstruction_distribution = "Gaussian (default)"

            # Use a Gaussian distribution as standard
            self.x_parameters = ["mu", "sigma"]
            self.reconstruction_activation_functions = {
                "mu": identity,
                "sigma": identity
            }
            self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \
                log_normal(x, x_theta["mu"], x_theta["sigma"], eps)
            self.meanOfReconstructionDistribution = lambda x_theta: x_theta[
                "mu"]
            self.preprocess = lambda x: x

        # if number_of_reconstruction_classes > 0:
        #
        #     self.x_parameters += ["p_k"]
        #     self.reconstruction_activation_functions["p_k"] = softmax
        #     log_distribution = self.expectedNegativeReconstructionError
        #     self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \
        #         log_cross_entropy_extended(x, x_theta,
        #             log_distribution, k_max = number_of_reconstruction_classes - 1,
        #             eps = 0.0)
        #     mean_of_distribution = self.meanOfReconstructionDistribution
        #     self.meanOfReconstructionDistribution = lambda x_theta: \
        #         meanOfCrossEntropyExtendedDistibution(x_theta,
        #             mean_of_distribution, k_max = number_of_reconstruction_classes - 1)
        #     self.k_max = number_of_reconstruction_classes - 1

        if self.use_count_sum:
            symbolic_n = T.matrix('n')  # sum of counts

        # Models

        ## Recognition model q(z|x)

        l_enc_in = InputLayer(shape=(None, feature_shape), name="ENC_INPUT")
        l_enc = l_enc_in

        for i, hidden_size in enumerate(hidden_structure):
            l_enc = DenseLayer(l_enc,
                               num_units=hidden_size,
                               nonlinearity=rectify,
                               name='ENC_DENSE{:d}'.format(i + 1))
            if self.use_batch_norm:
                l_enc = batch_norm(l_enc)

        if self.use_batch_norm:
            l_z_mu = batch_norm(
                DenseLayer(l_enc,
                           num_units=latent_size,
                           nonlinearity=None,
                           name='ENC_Z_MU'))
            l_z_log_var = batch_norm(
                DenseLayer(l_enc,
                           num_units=latent_size,
                           nonlinearity=lambda x: T.clip(x, -10, 10),
                           name='ENC_Z_LOG_VAR'))
        else:
            l_z_mu = DenseLayer(l_enc,
                                num_units=latent_size,
                                nonlinearity=None,
                                name='ENC_Z_MU')
            l_z_log_var = DenseLayer(l_enc,
                                     num_units=latent_size,
                                     nonlinearity=lambda x: T.clip(x, -10, 10),
                                     name='ENC_Z_LOG_VAR')

        # Sample a latent representation z \sim q(z|x) = N(mu(x), logvar(x))
        l_z = SimpleSampleLayer(mean=l_z_mu,
                                log_var=l_z_log_var,
                                name="ENC_SAMPLE")

        self.encoder = l_z

        ## Generative model p(x|z)

        l_dec_z_in = InputLayer(shape=(None, latent_size), name="DEC_INPUT")

        if self.use_count_sum:
            l_dec_n_in = InputLayer(shape=(None, 1), name="DEC_N_INPUT")
            l_dec = ConcatLayer([l_dec_z_in, l_dec_n_in],
                                axis=1,
                                name="DEC_MERGE_INPUT")
        else:
            l_dec = l_dec_z_in

        for i, hidden_size in enumerate(reversed(hidden_structure)):
            if self.use_batch_norm:
                l_dec = batch_norm(
                    DenseLayer(
                        l_dec,
                        num_units=hidden_size,
                        nonlinearity=rectify,
                        name='DEC_DENSE{:d}'.format(len(hidden_structure) -
                                                    i)))
            else:
                l_dec = DenseLayer(
                    l_dec,
                    num_units=hidden_size,
                    nonlinearity=rectify,
                    name='DEC_DENSE{:d}'.format(len(hidden_structure) - i))

        l_x_theta = {}

        for p in self.x_parameters:
            p_name = 'DEC_X_' + p.upper()
            if self.reconstruction_activation_functions[p] == softmax:
                if self.use_batch_norm:
                    l_dense = batch_norm(
                        DenseLayer(l_dec,
                                   num_units=feature_shape * (self.k_max + 1),
                                   nonlinearity=identity,
                                   name=p_name + "_DENSE"))
                else:
                    l_dense = DenseLayer(l_dec,
                                         num_units=feature_shape *
                                         (self.k_max + 1),
                                         nonlinearity=identity,
                                         name=p_name + "_DENSE")

                l_reshape = ReshapeLayer(l_dense, (-1, (self.k_max + 1)))

                if self.use_batch_norm:
                    l_softmax = batch_norm(
                        DenseLayer(l_reshape,
                                   num_units=(self.k_max + 1),
                                   nonlinearity=softmax,
                                   name=p_name + "_SOFTMAX"))
                else:
                    l_softmax = DenseLayer(l_reshape,
                                           num_units=(self.k_max + 1),
                                           nonlinearity=softmax,
                                           name=p_name + "_SOFTMAX")

                l_x_theta[p] = ReshapeLayer(l_softmax, (-1, feature_shape,
                                                        (self.k_max + 1)))
            else:
                if self.use_batch_norm:
                    l_x_theta[p] = batch_norm(
                        DenseLayer(l_dec,
                                   num_units=feature_shape,
                                   nonlinearity=self.
                                   reconstruction_activation_functions[p],
                                   name=p_name))
                else:
                    l_x_theta[p] = DenseLayer(
                        l_dec,
                        num_units=feature_shape,
                        nonlinearity=self.
                        reconstruction_activation_functions[p],
                        name=p_name)

        self.decoder = {p: l_x_theta[p] for p in self.x_parameters}

        ## Get outputs from models

        ## Training outputs
        z_train, z_mu_train, z_log_var_train = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x},
            deterministic=False)
        inputs = {l_dec_z_in: z_train}
        if self.use_count_sum:
            inputs[l_dec_n_in] = symbolic_n
        x_theta_train = get_output([l_x_theta[p] for p in self.x_parameters],
                                   inputs,
                                   deterministic=False)
        x_theta_train = {
            p: o
            for p, o in zip(self.x_parameters, x_theta_train)
        }

        ## Evaluation outputs
        z_eval, z_mu_eval, z_log_var_eval = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x},
            deterministic=True)
        inputs = {l_dec_z_in: z_eval}
        if self.use_count_sum:
            inputs[l_dec_n_in] = symbolic_n
        x_theta_eval = get_output([l_x_theta[p] for p in self.x_parameters],
                                  inputs,
                                  deterministic=True)
        x_theta_eval = {p: o for p, o in zip(self.x_parameters, x_theta_eval)}

        ## Sample outputs

        inputs = {l_dec_z_in: symbolic_z}
        if self.use_count_sum:
            inputs[l_dec_n_in] = symbolic_n
        x_theta_sample = get_output([l_x_theta[p] for p in self.x_parameters],
                                    inputs,
                                    deterministic=True)
        x_theta_sample = {
            p: o
            for p, o in zip(self.x_parameters, x_theta_sample)
        }

        # Likelihood

        lower_bound_train, log_p_x_train, KL__train, KL__train_all = \
            self.lowerBound(symbolic_x, x_theta_train, z_mu_train, z_log_var_train, beta=symbolic_warm_up_weight)
        lower_bound_eval, log_p_x_eval, KL__eval, KL__eval_all = \
            self.lowerBound(symbolic_x, x_theta_eval, z_mu_eval, z_log_var_eval)

        all_parameters = get_all_params(
            [l_z] + [l_x_theta[p] for p in self.x_parameters], trainable=True)

        print("Parameters to train:")
        for parameter in all_parameters:
            print("    {}: {}".format(parameter, parameter.get_value().shape))

        # Let Theano do its magic and get all the gradients we need for training
        all_gradients = T.grad(-lower_bound_train, all_parameters)

        # Set the update function for parameters. The Adam optimizer works really well with VAEs.
        update_expressions = updates.adam(all_gradients,
                                          all_parameters,
                                          learning_rate=symbolic_learning_rate)

        inputs = [symbolic_x]
        if self.use_count_sum:
            inputs.append(symbolic_n)
        inputs.append(symbolic_learning_rate)
        inputs.append(symbolic_warm_up_weight)

        self.f_train = theano.function(inputs=inputs,
                                       outputs=[
                                           lower_bound_train, log_p_x_train,
                                           KL__train, KL__train_all
                                       ],
                                       updates=update_expressions)

        inputs = [symbolic_x]
        if self.use_count_sum:
            inputs.append(symbolic_n)

        self.f_eval = theano.function(
            inputs=inputs,
            outputs=[lower_bound_eval, log_p_x_eval, KL__eval, KL__eval_all])

        self.f_z = theano.function(inputs=[symbolic_x], outputs=[z_eval])

        inputs = [symbolic_z]
        if self.use_count_sum:
            inputs.append(symbolic_n)

        self.f_sample = theano.function(
            inputs=inputs,
            outputs=[x_theta_sample[p] for p in self.x_parameters])

        inputs = [symbolic_x]
        if self.use_count_sum:
            inputs.append(symbolic_n)

        self.f_recon = theano.function(
            inputs=inputs,
            outputs=[x_theta_eval[p] for p in self.x_parameters])
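The warm-up comment near the top of this constructor refers to `symbolic_warm_up_weight` (beta), which scales the KL term in the training lower bound. A minimal sketch of a linear warm-up schedule on the calling side (assumed, not from the original project):

import numpy as np


def warm_up_weight(epoch, number_of_warm_up_epochs=50):
    # Increase beta linearly from 0 to 1 over the warm-up period, then keep it
    # at 1 so the full KL term is used afterwards.
    return float(np.clip(epoch / float(number_of_warm_up_epochs), 0.0, 1.0))

# Hypothetical training loop fragment (without count sums):
# for epoch in range(number_of_epochs):
#     beta = warm_up_weight(epoch)
#     lower_bound, log_p_x, KL, KL_all = model.f_train(x_train, learning_rate, beta)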
Ejemplo n.º 55
0
    def loss(test=False):
        return lasagne.objectives.categorical_crossentropy(
            get_output(network, deterministic=test), target_var).mean()
Ejemplo n.º 56
0
def train_model(learning_rate=0.0009, n_epochs=50, batch_size=200):
    '''
    Function that computes the training of the model.
    '''

    #######################
    # Loading the dataset #
    #######################

    print ('... Loading data')

    # Load the dataset on the CPU
    data_path = get_path()
    train_input_path = 'train_input_'
    train_target_path = 'train_target_'
    valid_input_path = 'valid_input_'
    valid_target_path = 'valid_target_'
    nb_train_batch = 9
    nb_valid_batch = 5

    # Creating symbolic variables
    batch = 200
    max_size = 25
    min_train_size = 13
    min_valid_size = 2
    input_channel = 3
    max_height = 64
    max_width = 64
    min_height = 32
    min_width = 32
    # Shape = (5000, 3, 64, 64)
    big_train_input = shared_GPU_data(shape=(batch * max_size, input_channel, max_height, max_width))
    big_valid_input = shared_GPU_data(shape=(batch * max_size, input_channel, max_height, max_width))
    # Shape = (5000, 3, 32, 32)
    big_train_target = shared_GPU_data(shape=(batch * max_size, input_channel, min_height, min_width))
    big_valid_target = shared_GPU_data(shape=(batch * max_size, input_channel, min_height, min_width))
    # Shape = (2600, 3, 64, 64)
    small_train_input = shared_GPU_data(shape=(batch * min_train_size, input_channel, max_height, max_width))
    # Shape = (2600, 3, 32, 32)
    small_train_target = shared_GPU_data(shape=(batch * min_train_size, input_channel, min_height, min_width))
    # Shape = (400, 3, 64, 64)
    small_valid_input = shared_GPU_data(shape=(batch * min_valid_size, input_channel, max_height, max_width))
    # Shape = (400, 3, 32, 32)
    small_valid_target = shared_GPU_data(shape=(batch * min_valid_size, input_channel, min_height, min_width))

    ######################
    # Building the model #
    ######################

    # Symbolic variables
    x = T.tensor4('x', dtype=theano.config.floatX)
    y = T.tensor4('y', dtype=theano.config.floatX)
    index = T.lscalar()

    # Creation of the model
    model = build_model2(input_var=x)
    output = layers.get_output(model, deterministic=True)
    params = layers.get_all_params(model, trainable=True)
    loss = T.mean(objectives.squared_error(output, y))
    updates = lasagne.updates.adam(loss, params, learning_rate=learning_rate)

    # Creation of theano functions
    train_big_model = theano.function([index], loss, updates=updates, allow_input_downcast=True,
                                      givens={x: big_train_input[index * batch_size: (index + 1) * batch_size],
                                              y: big_train_target[index * batch_size: (index + 1) * batch_size]})

    train_small_model = theano.function([index], loss, updates=updates, allow_input_downcast=True,
                                        givens={x: small_train_input[index * batch_size: (index + 1) * batch_size],
                                                y: small_train_target[index * batch_size: (index + 1) * batch_size]})

    big_valid_loss = theano.function([index], loss, allow_input_downcast=True,
                                     givens={x: big_valid_input[index * batch_size: (index + 1) * batch_size],
                                             y: big_valid_target[index * batch_size: (index + 1) * batch_size]})

    small_valid_loss = theano.function([index], loss, allow_input_downcast=True,
                                       givens={x: small_valid_input[index * batch_size: (index + 1) * batch_size],
                                               y: small_valid_target[index * batch_size: (index + 1) * batch_size]})
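    # Note: each compiled function above indexes into shared GPU buffers via
    # `givens`, so only the integer minibatch index crosses the Python/Theano
    # boundary; the image data itself stays resident on the GPU between calls.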

    idx = 50  # fixed validation-batch index used to pick the images that get saved
    pred_batch = 5
    predict_target = theano.function([index], output, allow_input_downcast=True,
                                     givens={x: small_valid_input[index * pred_batch: (index + 1) * pred_batch]})

    ###################
    # Train the model #
    ###################

    print('... Training')

    best_validation_loss = np.inf
    best_iter = 0
    epoch = 0

    # Valid images chosen when a better model is found
    batch_verification = 0
    num_images = range(idx * pred_batch, (idx + 1) * pred_batch)

    start_time = timeit.default_timer()

    while (epoch < n_epochs):
        epoch = epoch + 1
        n_train_batches = 0
        for i in range(nb_train_batch):
            if i == (nb_train_batch - 1):
                # Shape = (2600, 3, 64, 64) & Shape = (2600, 3, 32, 32)
                input, target = get_image(data_path, train_input_path, train_target_path, str(i))
                small_train_input.set_value(input)
                small_train_target.set_value(target)
                for j in range(min_train_size):
                    cost = train_small_model(j)
                    n_train_batches += 1
            else:
                # Shape = (10000, 3, 64, 64) & Shape = (10000, 3, 32, 32)
                input, target = get_image(data_path, train_input_path, train_target_path, str(i))
                big_train_input.set_value(input[0: batch * max_size])
                big_train_target.set_value(target[0: batch * max_size])
                for j in range(max_size):
                    cost = train_big_model(j)
                    n_train_batches += 1
                big_train_input.set_value(input[batch * max_size:])
                big_train_target.set_value(target[batch * max_size:])
                for j in range(max_size):
                    cost = train_big_model(j)
                    n_train_batches += 1

        validation_losses = []
        for i in range(nb_valid_batch):
            if i == (nb_valid_batch - 1):
                # Shape = (400, 3, 64, 64) & Shape = (400, 3, 32, 32)
                input, target = get_image(data_path, valid_input_path, valid_target_path, str(i))
                small_valid_input.set_value(input)
                small_valid_target.set_value(target)
                for j in range(min_valid_size):
                    validation_losses.append(small_valid_loss(j))
            else:
                # Shape = (10000, 3, 64, 64) & Shape = (10000, 3, 32, 32)
                input, target = get_image(data_path, valid_input_path, valid_target_path, str(i))
                big_valid_input.set_value(input[0: batch * max_size])
                big_valid_target.set_value(target[0: batch * max_size])
                for j in range(max_size):
                    validation_losses.append(big_valid_loss(j))
                big_valid_input.set_value(input[batch * max_size:])
                big_valid_target.set_value(target[batch * max_size:])
                for j in range(max_size):
                    validation_losses.append(big_valid_loss(j))

        this_validation_loss = np.mean(validation_losses)

        print('epoch %i, %i training minibatches, validation loss %f' %
              (epoch, n_train_batches, this_validation_loss))

        # if we got the best validation score until now
        if this_validation_loss < best_validation_loss:
            # save best validation score and iteration number
            best_validation_loss = this_validation_loss
            best_iter = epoch

            # save the model and a bunch of valid pictures
            print('... saving model and valid images')

            np.savez('best_cnn_model.npz', *layers.get_all_param_values(model))
            # Shape = (10000, 3, 64, 64) & Shape = (10000, 3, 32, 32)
            input, target = get_image(data_path, valid_input_path, valid_target_path, str(batch_verification))
            small_valid_input.set_value(input[0: batch * min_valid_size])
            input = input[num_images]
            target = target[num_images]
            output = predict_target(idx)
            save_images(input=input, target=target, output=output, nbr_images=len(num_images), iteration=epoch)

    end_time = timeit.default_timer()

    print('Optimization complete.')
    print('Best validation loss of %f obtained at epoch %i' %
          (best_validation_loss, best_iter))
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))
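Since the best parameters are stored with np.savez, they can later be restored into a freshly built network. A brief sketch of the usual Lasagne reload pattern, assuming the same `build_model2` builder and checkpoint file name used above:

# Hypothetical reload of the checkpoint saved by train_model; parameter ordering
# follows lasagne.layers.get_all_param_values / set_all_param_values.
import numpy as np
import theano.tensor as T
import lasagne.layers as layers

x = T.tensor4('x')
model = build_model2(input_var=x)
with np.load('best_cnn_model.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
layers.set_all_param_values(model, param_values)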
Ejemplo n.º 57
0
def run_task(vv, log_dir=None, exp_name=None):
    global policy
    global baseline

    trpo_stepsize = 0.01
    trpo_subsample_factor = 0.2

    # Check if variant is available
    if vv['model_type'] not in ['BrushTireModel', 'LinearTireModel']:
        raise ValueError('Unrecognized model type for simulating robot')
    if vv['robot_type'] not in ['MRZR', 'RCCar']:
        raise ValueError('Unrecognized robot type')

    # Load environment
    if not vv['use_ros']:
        env = StraightEnv(target_velocity=vv['target_velocity'],
                          dt=vv['dt'],
                          model_type=vv['model_type'],
                          robot_type=vv['robot_type'],
                          mu_s=vv['mu_s'],
                          mu_k=vv['mu_k'])
    else:
        from aa_simulation.envs.straight.straight_env_ros import StraightEnvROS
        env = StraightEnvROS(target_velocity=vv['target_velocity'],
                             dt=vv['dt'],
                             model_type=vv['model_type'],
                             robot_type=vv['robot_type'])

    # Save variant information for comparison plots
    variant_file = logger.get_snapshot_dir() + '/variant.json'
    logger.log_variant(variant_file, vv)

    # Set variance for each action component separately for exploration
    # Note: We set the variance manually because we are not scaling our
    #       action space during training.
    init_std_speed = vv['target_velocity'] / 4
    init_std_steer = np.pi / 6
    init_std = [init_std_speed, init_std_steer]

    # Build policy and baseline networks
    # Note: Mean of policy network set to analytically computed values for
    #       faster training (rough estimates for RL to fine-tune).
    if policy is None or baseline is None:
        target_velocity = vv['target_velocity']
        target_steering = 0
        output_mean = np.array([target_velocity, target_steering])
        hidden_sizes = (32, 32)

        # In mean network, allow output b values to dominate final output
        # value by constraining the magnitude of the output W matrix. This is
        # to allow faster learning. These numbers are arbitrarily chosen.
        W_gain = min(vv['target_velocity'] / 5, np.pi / 15)

        mean_network = MLP(input_shape=(env.spec.observation_space.flat_dim, ),
                           output_dim=env.spec.action_space.flat_dim,
                           hidden_sizes=hidden_sizes,
                           hidden_nonlinearity=LN.tanh,
                           output_nonlinearity=None,
                           output_W_init=LI.GlorotUniform(gain=W_gain),
                           output_b_init=output_mean)
        policy = GaussianMLPPolicy(env_spec=env.spec,
                                   hidden_sizes=(32, 32),
                                   init_std=init_std,
                                   mean_network=mean_network)
        baseline = LinearFeatureBaseline(env_spec=env.spec,
                                         target_key='returns')

    # Reset variance to re-enable exploration when using pre-trained networks
    else:
        policy._l_log_std = ParamLayer(
            policy._mean_network.input_layer,
            num_units=env.spec.action_space.flat_dim,
            param=LI.Constant(np.log(init_std)),
            name='output_log_std',
            trainable=True)
        obs_var = policy._mean_network.input_layer.input_var
        mean_var, log_std_var = L.get_output(
            [policy._l_mean, policy._l_log_std])
        policy._log_std_var = log_std_var
        LasagnePowered.__init__(policy, [policy._l_mean, policy._l_log_std])
        policy._f_dist = ext.compile_function(inputs=[obs_var],
                                              outputs=[mean_var, log_std_var])

    safety_baseline = LinearFeatureBaseline(env_spec=env.spec,
                                            target_key='safety_returns')

    safety_constraint = StraightSafetyConstraint(max_value=1.0,
                                                 baseline=safety_baseline)

    if vv['algo'] == 'TRPO':
        algo = TRPO(
            env=env,
            policy=policy,
            baseline=baseline,
            batch_size=600,
            max_path_length=env.horizon,
            n_itr=600,
            discount=0.99,
            step_size=trpo_stepsize,
            plot=False,
        )
    else:
        algo = CPO(env=env,
                   policy=policy,
                   baseline=baseline,
                   safety_constraint=safety_constraint,
                   batch_size=600,
                   max_path_length=env.horizon,
                   n_itr=600,
                   discount=0.99,
                   step_size=trpo_stepsize,
                   gae_lambda=0.95,
                   safety_gae_lambda=1,
                   optimizer_args={'subsample_factor': trpo_subsample_factor},
                   plot=False)
    algo.train()
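A hedged sketch of the variant dictionary this entry point expects; the keys mirror the vv[...] reads above, while the values here are placeholders rather than the original experiment settings:

# Hypothetical variant; values are illustrative only.
vv = {
    'model_type': 'BrushTireModel',   # or 'LinearTireModel'
    'robot_type': 'RCCar',            # or 'MRZR'
    'use_ros': False,
    'target_velocity': 1.0,
    'dt': 0.1,
    'mu_s': 1.0,
    'mu_k': 0.8,
    'algo': 'TRPO',                   # anything else selects CPO
}
run_task(vv)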
Ejemplo n.º 58
0
    def __init__(self, stories, QAs, batch_size, story_v, learning_rate,
                 word_vector_size, sent_vector_size, dim, mode, answer_module,
                 input_mask_mode, memory_hops, l2, story_source,
                 normalize_attention, batch_norm, dropout, dropout_in,
                 **kwargs):

        #print "==> not used params in DMN class:", kwargs.keys()
        self.learning_rate = learning_rate
        self.rng = np.random
        self.rng.seed(1234)
        mqa = MovieQA.DataLoader()
        ### Load Word2Vec model
        w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin')
        self.w2v = w2v_model
        self.d_w2v = len(w2v_model.get_vector(w2v_model.vocab[1]))
        self.word_thresh = 1
        print "Loaded word2vec model: dim = %d | vocab-size = %d" % (
            self.d_w2v, len(w2v_model.vocab))
        ### Create vocabulary-to-index and index-to-vocabulary
        v2i = {'': 0, 'UNK': 1}  # vocabulary to index
        QA_words, v2i = self.create_vocabulary(
            QAs,
            stories,
            v2i,
            w2v_vocab=w2v_model.vocab.tolist(),
            word_thresh=self.word_thresh)
        i2v = {v: k for k, v in v2i.iteritems()}
        self.vocab = v2i
        self.ivocab = i2v
        self.story_v = story_v
        self.word2vec = w2v_model
        self.word_vector_size = word_vector_size
        self.sent_vector_size = sent_vector_size
        self.dim = dim
        self.batch_size = batch_size
        self.mode = mode
        self.answer_module = answer_module
        self.input_mask_mode = input_mask_mode
        self.memory_hops = memory_hops
        self.l2 = l2
        self.normalize_attention = normalize_attention
        self.batch_norm = batch_norm
        self.dropout = dropout
        self.dropout_in = dropout_in

        #self.max_inp_sent_len = 0
        #self.max_q_len = 0

        ### Convert QAs and stories into numpy matrices (like in the bAbI data set)
        # storyM - Dictionary - indexed by imdb_key. Values are [num-sentence X max-num-words]
        # questionM - NP array - [num-question X max-num-words]
        # answerM - NP array - [num-question X num-answer-options X max-num-words]
        storyM, questionM, answerM = self.data_in_matrix_form(
            stories, QA_words, v2i)
        qinfo = self.associate_additional_QA_info(QAs)

        ### Split everything into train, val, and test data
        train_storyM = {
            k: v
            for k, v in storyM.iteritems() if k in mqa.data_split['train']
        }
        val_storyM = {
            k: v
            for k, v in storyM.iteritems() if k in mqa.data_split['val']
        }
        test_storyM = {
            k: v
            for k, v in storyM.iteritems() if k in mqa.data_split['test']
        }

        def split_train_test(long_list, QAs, trnkey='train', tstkey='val'):
            # Create train/val/test splits based on key
            train_split = [
                item for k, item in enumerate(long_list)
                if QAs[k].qid.startswith('train')
            ]
            val_split = [
                item for k, item in enumerate(long_list)
                if QAs[k].qid.startswith('val')
            ]
            test_split = [
                item for k, item in enumerate(long_list)
                if QAs[k].qid.startswith('test')
            ]
            if isinstance(long_list, np.ndarray):
                return np.array(train_split), np.array(val_split), np.array(
                    test_split)
            else:
                return train_split, val_split, test_split

        train_questionM, val_questionM, test_questionM = split_train_test(
            questionM, QAs)
        train_answerM, val_answerM, test_answerM = split_train_test(
            answerM, QAs)
        train_qinfo, val_qinfo, test_qinfo = split_train_test(qinfo, QAs)

        QA_train = [qa for qa in QAs if qa.qid.startswith('train:')]
        QA_val = [qa for qa in QAs if qa.qid.startswith('val:')]
        QA_test = [qa for qa in QAs if qa.qid.startswith('test:')]

        #train_data = {'s':train_storyM, 'q':train_questionM, 'a':train_answerM, 'qinfo':train_qinfo}
        #val_data =   {'s':val_storyM,   'q':val_questionM,   'a':val_answerM,   'qinfo':val_qinfo}
        #test_data  = {'s':test_storyM,  'q':test_questionM,  'a':test_answerM,  'qinfo':test_qinfo}

        with open('train_split.json') as fid:
            trdev = json.load(fid)

        s_key = self.story_v.keys()
        self.train_range = [
            k for k, qi in enumerate(qinfo)
            if (qi['movie'] in trdev['train'] and qi['qid'] in s_key)
        ]
        self.train_val_range = [
            k for k, qi in enumerate(qinfo)
            if (qi['movie'] in trdev['dev'] and qi['qid'] in s_key)
        ]
        self.val_range = [
            k for k, qi in enumerate(val_qinfo) if qi['qid'] in s_key
        ]

        self.max_sent_len = max(
            [sty.shape[0] for sty in self.story_v.values()])
        self.train_input = self.story_v
        self.train_val_input = self.story_v
        self.test_input = self.story_v
        self.train_q = train_questionM
        self.train_answer = train_answerM
        self.train_qinfo = train_qinfo
        self.train_val_q = train_questionM
        self.train_val_answer = train_answerM
        self.train_val_qinfo = train_qinfo
        self.test_q = val_questionM
        self.test_answer = val_answerM
        self.test_qinfo = val_qinfo
        """Setup some configuration parts of the model.
        """
        self.v2i = v2i
        self.vs = len(v2i)
        self.d_lproj = 300

        # define Look-Up-Table mask
        np_mask = np.vstack(
            (np.zeros(self.d_w2v), np.ones((self.vs - 1, self.d_w2v))))
        T_mask = theano.shared(np_mask.astype(theano.config.floatX),
                               name='LUT_mask')

        # setup Look-Up-Table to be Word2Vec
        self.pca_mat = None
        print "Initialize LUTs as word2vec and use linear projection layer"

        self.LUT = np.zeros((self.vs, self.d_w2v), dtype='float32')
        found_words = 0
        for w, v in self.v2i.iteritems():
            if w in self.w2v.vocab:  # all valid words are already in vocab or 'UNK'
                self.LUT[v] = self.w2v.get_vector(w)
                found_words += 1
            else:
                # LUT[v] = np.zeros((self.d_w2v))
                self.LUT[v] = self.rng.randn(self.d_w2v)
                self.LUT[v] = self.LUT[v] / (np.linalg.norm(self.LUT[v]) +
                                             1e-6)

        print "Found %d / %d words" % (found_words, len(self.v2i))

        # word 0 is blanked out, word 1 is 'UNK'
        self.LUT[0] = np.zeros((self.d_w2v))

        # if linear projection layer is not the same shape as LUT, then initialize with PCA
        if self.d_lproj != self.LUT.shape[1]:
            pca = PCA(n_components=self.d_lproj, whiten=True)
            self.pca_mat = pca.fit_transform(self.LUT.T)  # 300 x 100?

        # setup LUT!
        self.T_w2v = theano.shared(self.LUT.astype(theano.config.floatX))

        self.train_input_mask = np_mask
        self.test_input_mask = np_mask
        #self.train_input, self.train_q, self.train_answer, self.train_input_mask = self._process_input(babi_train_raw)
        #self.test_input, self.test_q, self.test_answer, self.test_input_mask = self._process_input(babi_test_raw)
        self.vocab_size = len(self.vocab)

        self.input_var = T.tensor3('input_var')
        self.q_var = T.matrix('question_var')
        self.answer_var = T.tensor3('answer_var')
        self.input_mask_var = T.imatrix('input_mask_var')
        self.target = T.ivector('target')
        self.attentions = []

        #self.pe_matrix_in = self.pe_matrix(self.max_inp_sent_len)
        #self.pe_matrix_q = self.pe_matrix(self.max_q_len)

        print "==> building input module"

        #positional encoder weights
        self.W_pe = nn_utils.normal_param(std=0.1,
                                          shape=(self.vocab_size, self.dim))

        #biGRU input fusion weights
        self.W_inp_res_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_res_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_res_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_upd_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_upd_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_upd_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_hid_in_fwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_hid_hid_fwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_hid_fwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_res_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_res_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_res_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_upd_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_upd_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_upd_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        self.W_inp_hid_in_bwd = nn_utils.normal_param(
            std=0.1, shape=(self.dim, self.sent_vector_size))
        self.W_inp_hid_hid_bwd = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
        self.b_inp_hid_bwd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

        #self.V_f = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #self.V_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))

        self.inp_sent_reps = self.input_var

        #self.inp_sent_reps_stacked = T.stacklists(self.inp_sent_reps)
        #self.inp_c = self.input_module_full(self.inp_sent_reps_stacked)
        self.ans_reps = self.answer_var
        self.inp_c = self.input_module_full(self.inp_sent_reps)

        self.q_q = self.q_var

        print "==> creating parameters for memory module"
        self.W_mem_res_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.memory_hops,
                                                         self.dim, self.dim))
        self.W_mem_res_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.memory_hops,
                                                          self.dim, self.dim))
        self.b_mem_res = nn_utils.constant_param(value=0.0,
                                                 shape=(
                                                     self.memory_hops,
                                                     self.dim,
                                                 ))

        self.W_mem_upd_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.memory_hops,
                                                         self.dim, self.dim))
        self.W_mem_upd_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.memory_hops,
                                                          self.dim, self.dim))
        self.b_mem_upd = nn_utils.constant_param(value=0.0,
                                                 shape=(
                                                     self.memory_hops,
                                                     self.dim,
                                                 ))

        self.W_mem_hid_in = nn_utils.normal_param(std=0.1,
                                                  shape=(self.memory_hops,
                                                         self.dim, self.dim))
        self.W_mem_hid_hid = nn_utils.normal_param(std=0.1,
                                                   shape=(self.memory_hops,
                                                          self.dim, self.dim))
        self.b_mem_hid = nn_utils.constant_param(value=0.0,
                                                 shape=(
                                                     self.memory_hops,
                                                     self.dim,
                                                 ))

        #self.W_b = nn_utils.normal_param(std=0.1, shape=(self.dim, self.dim))
        #self.W_1 = nn_utils.normal_param(std=0.1, shape=(self.dim, 7 * self.dim + 0))
        self.W_1 = nn_utils.normal_param(std=0.1,
                                         shape=(self.memory_hops, self.dim,
                                                4 * self.dim + 0))
        self.W_2 = nn_utils.normal_param(std=0.1,
                                         shape=(self.memory_hops, 1, self.dim))
        self.b_1 = nn_utils.constant_param(value=0.0,
                                           shape=(
                                               self.memory_hops,
                                               self.dim,
                                           ))
        self.b_2 = nn_utils.constant_param(value=0.0,
                                           shape=(
                                               self.memory_hops,
                                               1,
                                           ))

        print "==> building episodic memory module (fixed number of steps: %d)" % self.memory_hops
        memory = [self.q_q.copy()]
        for iter in range(1, self.memory_hops + 1):
            self.mem_weight_num = int(iter - 1)
            current_episode = self.new_episode(memory[iter - 1])
            memory.append(
                self.GRU_update(memory[iter - 1], current_episode,
                                self.W_mem_res_in[self.mem_weight_num],
                                self.W_mem_res_hid[self.mem_weight_num],
                                self.b_mem_res[self.mem_weight_num],
                                self.W_mem_upd_in[self.mem_weight_num],
                                self.W_mem_upd_hid[self.mem_weight_num],
                                self.b_mem_upd[self.mem_weight_num],
                                self.W_mem_hid_in[self.mem_weight_num],
                                self.W_mem_hid_hid[self.mem_weight_num],
                                self.b_mem_hid[self.mem_weight_num]))

        last_mem_raw = memory[-1]

        net = layers.InputLayer(shape=(self.batch_size, self.dim),
                                input_var=last_mem_raw)
        if self.dropout > 0 and self.mode == 'train':
            net = layers.DropoutLayer(net, p=self.dropout)
        last_mem = layers.get_output(net)[0]

        print "==> building answer module"
        self.W_a = nn_utils.normal_param(std=0.1, shape=(300, self.dim))

        if self.answer_module == 'feedforward':
            self.temp = T.dot(self.ans_reps, self.W_a)
            self.prediction = nn_utils.softmax(T.dot(self.temp, last_mem))

        elif self.answer_module == 'recurrent':
            self.W_ans_res_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_res_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_res = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_upd_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_upd_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_upd = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            self.W_ans_hid_in = nn_utils.normal_param(
                std=0.1, shape=(self.dim, self.dim + self.vocab_size))
            self.W_ans_hid_hid = nn_utils.normal_param(std=0.1,
                                                       shape=(self.dim,
                                                              self.dim))
            self.b_ans_hid = nn_utils.constant_param(value=0.0,
                                                     shape=(self.dim, ))

            def answer_step(prev_a, prev_y):
                a = self.GRU_update(prev_a, T.concatenate([prev_y, self.q_q]),
                                    self.W_ans_res_in, self.W_ans_res_hid,
                                    self.b_ans_res, self.W_ans_upd_in,
                                    self.W_ans_upd_hid, self.b_ans_upd,
                                    self.W_ans_hid_in, self.W_ans_hid_hid,
                                    self.b_ans_hid)

                y = nn_utils.softmax(T.dot(self.W_a, a))
                return [a, y]

            # add conditional ending?
            dummy = theano.shared(np.zeros((self.vocab_size, ), dtype=floatX))

            results, updates = theano.scan(
                fn=answer_step,
                outputs_info=[last_mem, T.zeros_like(dummy)],
                n_steps=1)
            self.prediction = results[1][-1]

        else:
            raise Exception("invalid answer_module")

        print "==> collecting all parameters"
        self.params = [
            self.W_pe,
            self.W_inp_res_in_fwd,
            self.W_inp_res_hid_fwd,
            self.b_inp_res_fwd,
            self.W_inp_upd_in_fwd,
            self.W_inp_upd_hid_fwd,
            self.b_inp_upd_fwd,
            self.W_inp_hid_in_fwd,
            self.W_inp_hid_hid_fwd,
            self.b_inp_hid_fwd,
            self.W_inp_res_in_bwd,
            self.W_inp_res_hid_bwd,
            self.b_inp_res_bwd,
            self.W_inp_upd_in_bwd,
            self.W_inp_upd_hid_bwd,
            self.b_inp_upd_bwd,
            self.W_inp_hid_in_bwd,
            self.W_inp_hid_hid_bwd,
            self.b_inp_hid_bwd,
            self.W_mem_res_in,
            self.W_mem_res_hid,
            self.b_mem_res,
            self.W_mem_upd_in,
            self.W_mem_upd_hid,
            self.b_mem_upd,
            self.W_mem_hid_in,
            self.W_mem_hid_hid,
            self.b_mem_hid,  #self.W_b
            self.W_1,
            self.W_2,
            self.b_1,
            self.b_2,
            self.W_a
        ]

        if self.answer_module == 'recurrent':
            self.params = self.params + [
                self.W_ans_res_in, self.W_ans_res_hid, self.b_ans_res,
                self.W_ans_upd_in, self.W_ans_upd_hid, self.b_ans_upd,
                self.W_ans_hid_in, self.W_ans_hid_hid, self.b_ans_hid
            ]

        print "==> building loss layer and computing updates"
        self.loss_ce = T.nnet.categorical_crossentropy(self.prediction,
                                                       self.target)

        if self.l2 > 0:
            self.loss_l2 = self.l2 * nn_utils.l2_reg(self.params)
        else:
            self.loss_l2 = 0

        self.loss = T.mean(self.loss_ce) + self.loss_l2

        #updates = lasagne.updates.adadelta(self.loss, self.params)
        #updates = lasagne.updates.adam(self.loss, self.params)
        updates = lasagne.updates.adam(self.loss,
                                       self.params,
                                       learning_rate=self.learning_rate,
                                       beta1=0.5)  #from DCGAN paper
        #updates = lasagne.updates.adadelta(self.loss, self.params, learning_rate=0.0005)
        #updates = lasagne.updates.momentum(self.loss, self.params, learning_rate=0.0003)

        self.attentions = T.stack(self.attentions)
        if self.mode == 'train':
            print "==> compiling train_fn"
            self.train_fn = theano.function(
                inputs=[
                    self.input_var, self.q_var, self.answer_var, self.target
                ],
                outputs=[self.prediction, self.loss, self.attentions],
                updates=updates,
                on_unused_input='warn',
                allow_input_downcast=True)

        print "==> compiling test_fn"
        self.test_fn = theano.function(
            inputs=[self.input_var, self.q_var, self.answer_var, self.target],
            outputs=[self.prediction, self.loss, self.attentions],
            on_unused_input='warn',
            allow_input_downcast=True)
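The arguments accepted by the compiled functions follow from the symbolic declarations above; a rough, hypothetical call could look like the following, where only the tensor ranks are dictated by the code and every size shown is a placeholder:

# Hypothetical invocation; `dmn` is an instance of this class built with
# mode='train', and all sizes below are illustrative guesses tied to the
# weight shapes declared above.
import numpy as np

n_sent, n_options = 20, 5
story = np.zeros((dmn.batch_size, n_sent, dmn.sent_vector_size), dtype='float32')  # input_var (tensor3)
question = np.zeros((dmn.batch_size, dmn.dim), dtype='float32')                     # q_var (matrix)
answers = np.zeros((dmn.batch_size, n_options, 300), dtype='float32')               # answer_var (tensor3), 300 matches W_a
targets = np.zeros((dmn.batch_size,), dtype='int32')                                # target (ivector)

prediction, loss, attentions = dmn.train_fn(story, question, answers, targets)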
Ejemplo n.º 59
0
                                   nonlinearity=nn.relu),
                  g=None))  # 4 -> 8
gen_layers.append(
    nn.batch_norm(nn.Deconv2DLayer(gen_layers[-1],
                                   (args.batch_size, 128, 16, 16), (5, 5),
                                   W=Normal(0.05),
                                   nonlinearity=nn.relu),
                  g=None))  # 8 -> 16
gen_layers.append(
    nn.weight_norm(nn.Deconv2DLayer(gen_layers[-1],
                                    (args.batch_size, 3, 32, 32), (5, 5),
                                    W=Normal(0.05),
                                    nonlinearity=T.tanh),
                   train_g=True,
                   init_stdv=0.1))  # 16 -> 32
gen_dat = ll.get_output(gen_layers[-1])

# specify discriminative model
disc_layers = [ll.InputLayer(shape=(None, 3, 32, 32))]
disc_layers.append(ll.DropoutLayer(disc_layers[-1], p=0.2))
disc_layers.append(
    nn.weight_norm(
        dnn.Conv2DDNNLayer(disc_layers[-1],
                           96, (3, 3),
                           pad=1,
                           W=Normal(0.05),
                           nonlinearity=nn.lrelu)))
disc_layers.append(
    nn.weight_norm(
        dnn.Conv2DDNNLayer(disc_layers[-1],
                           96, (3, 3),
Ejemplo n.º 60
0
    def __init__(
            self,
            name,
            input_shape,
            output_dim,
            hidden_sizes,
            conv_filters, conv_filter_sizes, conv_strides, conv_pads,
            hidden_nonlinearity=NL.rectify,
            mean_network=None,

            optimizer=None,
            use_trust_region=True,
            step_size=0.01,
            subsample_factor=1.0,
            batchsize=None,

            learn_std=True,
            init_std=1.0,
            adaptive_std=False,
            std_share_network=False,
            std_conv_filters=[], std_conv_filter_sizes=[], std_conv_strides=[], std_conv_pads=[],
            std_hidden_sizes=(32, 32),
            std_nonlinearity=None,
            normalize_inputs=True,
            normalize_outputs=True,
    ):
        """
        :param input_shape: usually for images of the form (width,height,channel)
        :param output_dim: Dimension of output.
        :param hidden_sizes: Number of hidden units of each layer of the mean network.
        :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
        :param optimizer: Optimizer for minimizing the negative log-likelihood.
        :param use_trust_region: Whether to use trust region constraint.
        :param step_size: KL divergence constraint for each iteration
        :param learn_std: Whether to learn the standard deviations. Only effective if adaptive_std is False. If
        adaptive_std is True, this parameter is ignored, and the weights for the std network are always learned.
        :param adaptive_std: Whether to make the std a function of the states.
        :param std_share_network: Whether to use the same network as the mean.
        :param std_hidden_sizes: Number of hidden units of each layer of the std network. Only used if
        `std_share_network` is False. It defaults to the same architecture as the mean.
        :param std_nonlinearity: Non-linearity used for each layer of the std network. Only used if `std_share_network`
        is False. It defaults to the same non-linearity as the mean.
        """
        Serializable.quick_init(self, locals())


        if optimizer is None:
            if use_trust_region:
                optimizer = PenaltyLbfgsOptimizer("optimizer")
            else:
                optimizer = LbfgsOptimizer("optimizer")

        self._optimizer = optimizer

        self.input_shape = input_shape
        if mean_network is None:
            mean_network = ConvNetwork(
                name="mean_network",
                input_shape=input_shape,
                output_dim=output_dim,
                conv_filters=conv_filters,
                conv_filter_sizes=conv_filter_sizes,
                conv_strides=conv_strides,
                conv_pads=conv_pads,
                hidden_sizes=hidden_sizes,
                hidden_nonlinearity=hidden_nonlinearity,
                output_nonlinearity=None,
            )

        l_mean = mean_network.output_layer

        if adaptive_std:
            l_log_std = ConvNetwork(
                name="log_std_network",
                input_shape=input_shape,
                input_var=mean_network.input_layer.input_var,
                output_dim=output_dim,
                conv_filters=std_conv_filters,
                conv_filter_sizes=std_conv_filter_sizes,
                conv_strides=std_conv_strides,
                conv_pads=std_conv_pads,
                hidden_sizes=std_hidden_sizes,
                hidden_nonlinearity=std_nonlinearity,
                output_nonlinearity=None,
            ).output_layer
        else:
            l_log_std = ParamLayer(
                mean_network.input_layer,
                num_units=output_dim,
                param=lasagne.init.Constant(np.log(init_std)),
                name="output_log_std",
                trainable=learn_std,
            )

        LasagnePowered.__init__(self, [l_mean, l_log_std])

        xs_var = mean_network.input_layer.input_var
        ys_var = TT.matrix("ys")
        old_means_var = TT.matrix("old_means")
        old_log_stds_var = TT.matrix("old_log_stds")

        x_mean_var = theano.shared(
            np.zeros((1,np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_mean",
            broadcastable=(True,False),
        )
        x_std_var = theano.shared(
            np.ones((1,np.prod(input_shape)), dtype=theano.config.floatX),
            name="x_std",
            broadcastable=(True,False),
        )
        y_mean_var = theano.shared(
            np.zeros((1, output_dim), dtype=theano.config.floatX),
            name="y_mean",
            broadcastable=(True, False)
        )
        y_std_var = theano.shared(
            np.ones((1, output_dim), dtype=theano.config.floatX),
            name="y_std",
            broadcastable=(True, False)
        )

        normalized_xs_var = (xs_var - x_mean_var) / x_std_var
        normalized_ys_var = (ys_var - y_mean_var) / y_std_var

        normalized_means_var = L.get_output(
            l_mean, {mean_network.input_layer: normalized_xs_var})
        normalized_log_stds_var = L.get_output(
            l_log_std, {mean_network.input_layer: normalized_xs_var})

        means_var = normalized_means_var * y_std_var + y_mean_var
        log_stds_var = normalized_log_stds_var + TT.log(y_std_var)

        normalized_old_means_var = (old_means_var - y_mean_var) / y_std_var
        normalized_old_log_stds_var = old_log_stds_var - TT.log(y_std_var)

        dist = self._dist = DiagonalGaussian(output_dim)

        normalized_dist_info_vars = dict(
            mean=normalized_means_var, log_std=normalized_log_stds_var)

        mean_kl = TT.mean(dist.kl_sym(
            dict(mean=normalized_old_means_var,
                 log_std=normalized_old_log_stds_var),
            normalized_dist_info_vars,
        ))

        loss = -TT.mean(dist.log_likelihood_sym(
            normalized_ys_var, normalized_dist_info_vars))

        self._f_predict = compile_function([xs_var], means_var)
        self._f_pdists = compile_function([xs_var], [means_var, log_stds_var])
        self._l_mean = l_mean
        self._l_log_std = l_log_std

        optimizer_args = dict(
            loss=loss,
            target=self,
            network_outputs=[normalized_means_var, normalized_log_stds_var],
        )

        if use_trust_region:
            optimizer_args["leq_constraint"] = (mean_kl, step_size)
            optimizer_args["inputs"] = [
                xs_var, ys_var, old_means_var, old_log_stds_var]
        else:
            optimizer_args["inputs"] = [xs_var, ys_var]

        self._optimizer.update_opt(**optimizer_args)

        self._use_trust_region = use_trust_region
        self._name = name

        self._normalize_inputs = normalize_inputs
        self._normalize_outputs = normalize_outputs
        self._mean_network = mean_network
        self._x_mean_var = x_mean_var
        self._x_std_var = x_std_var
        self._y_mean_var = y_mean_var
        self._y_std_var = y_std_var
        self._subsample_factor = subsample_factor
        self._batchsize = batchsize
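A brief, hypothetical sketch of calling the two compiled helpers, assuming `reg` is an instance built as above and that the underlying ConvNetwork consumes flattened observations (an assumption, not stated in this snippet):

# Hypothetical usage; `reg` and the flattened input layout are assumptions.
import numpy as np
import theano

xs = np.zeros((8, int(np.prod(reg.input_shape))), dtype=theano.config.floatX)
means = reg._f_predict(xs)                  # de-normalized predicted means
means_again, log_stds = reg._f_pdists(xs)   # means together with per-dimension log-stds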