Example #1
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1., high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(controller, num_shifts=num_shifts, memory_shape=memory_shape,
            W_hid_to_sign=W_hid_to_sign, b_hid_to_sign=b_hid_to_sign, nonlinearity_sign=nonlinearity_sign,
            W_hid_to_key=W_hid_to_key, b_hid_to_key=b_hid_to_key, nonlinearity_key=nonlinearity_key,
            W_hid_to_beta=W_hid_to_beta, b_hid_to_beta=b_hid_to_beta, nonlinearity_beta=nonlinearity_beta,
            W_hid_to_gate=W_hid_to_gate, b_hid_to_gate=b_hid_to_gate, nonlinearity_gate=nonlinearity_gate,
            W_hid_to_shift=W_hid_to_shift, b_hid_to_shift=b_hid_to_shift, nonlinearity_shift=nonlinearity_shift,
            W_hid_to_gamma=W_hid_to_gamma, b_hid_to_gamma=b_hid_to_gamma, nonlinearity_gamma=nonlinearity_gamma,
            weights_init=weights_init, learn_init=learn_init, **kwargs)
    
        self.erase = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_erase, b=b_hid_to_erase, nonlinearity=nonlinearity_erase,
            name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_add, b=b_hid_to_add, nonlinearity=nonlinearity_add,
            name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign_add, b=b_hid_to_sign_add, nonlinearity=nonlinearity_sign_add,
                name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None
Example #2
    def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):
        
        tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
        rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

        hidden_state_features = T.specify_shape(T.concatenate([tf_states, rf_states], axis = 1), (seq_length, mb_size * 2, num_features))

        gru_params_1 = init_tparams(param_init_gru(None, {}, prefix = "gru1", dim = num_hidden, nin = num_features))
        #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features))
        #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix = 'gru1')[0]
        #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0]
        #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0]

        final_out_recc = T.specify_shape(T.mean(gru_1_out, axis = 0), (mb_size * 2, num_hidden))

        h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units = 1, nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out_recc)
        h_out_4_value = h_out_4.get_output_for(h_out_1_value)

        raw_y = h_out_4_value
        #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
        classification = T.nnet.sigmoid(raw_y)

        # Teacher-forced (tf) states come before running-free (rf) states along the batch axis.
        p_real =  classification[:mb_size]
        p_gen  = classification[mb_size:]

        #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

        # `bce` is assumed to be a binary cross-entropy helper defined elsewhere in this
        # file; the targets are smoothed (0.9 for real, 0.1 for generated samples).
        self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
        self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
        self.d_cost = self.d_cost_real + self.d_cost_gen
        self.g_cost = self.g_cost_d


        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1,trainable=True)

        self.params += gru_params_1.values()
        #self.params += gru_params_2.values()
        #self.params += gru_params_3.values()

        self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten())) + T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten()))
Example #3
def discriminator(x, z, params, mb_size, num_hidden, num_latent):

    x_z = T.concatenate([x,z], axis = 1)


    h_out_1 = DenseLayer((mb_size, num_hidden + num_latent), num_units = num_hidden, nonlinearity=None, W = params['W_disc_1'])

    h_out_2 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_2'])

    h_out_3 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_3'])

    h_out_4 = DenseLayer((mb_size, 1), num_units = 1, nonlinearity=None, W = params['W_disc_4'], b = params['b_disc_4'])

    h_out_1_value = h_out_1.get_output_for(x_z)

    h_out_1_value = T.maximum(0.0, (h_out_1_value - T.mean(h_out_1_value, axis = 0)) / (1.0 + T.std(h_out_1_value, axis = 0)) + params['b_disc_1'])

    h_out_2_value = h_out_2.get_output_for(h_out_1_value)

    h_out_2_value = T.maximum(0.0, (h_out_2_value - T.mean(h_out_2_value, axis = 0)) / (1.0 + T.std(h_out_2_value, axis = 0)) + params['b_disc_2'])

    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    h_out_3_value = T.maximum(0.0, (h_out_3_value - T.mean(h_out_3_value, axis = 0)) / (1.0 + T.std(h_out_3_value, axis = 0)) + params['b_disc_3'])

    h_out_4_value = h_out_4.get_output_for(h_out_3_value)

    raw_y = h_out_4_value

    classification = T.nnet.sigmoid(raw_y)

    results = {'c' : classification}

    return results
Example #4
    def __init__(self, n_inputs, n_outputs, regression, multiclass=False, depth=5, n_estimators=20, n_hidden=128, learning_rate=0.01, num_epochs=500, pi_iters=20, sgd_iters=10, batch_size=1000, momentum=0.0, dropout=0.0, loss=None, update=adagrad):
        """
        Parameters
        ----------
        n_inputs : number of input features
        n_outputs : number of classes to predict (1 for regression)
            for 2 class classification n_outputs should be 2, not 1
        regression : True for regression, False for classification
        multiclass : not used
        depth : depth of each tree in the ensemble
        n_estimators : number of trees in the ensemble
        n_hidden : number of neurons in the hidden layer
        pi_iters : number of iterations for the iterative algorithm that updates pi
        sgd_iters : number of full iterations of sgd between two consecutive updates of pi
        loss : theano loss function. If None, squared error will be used for regression and
            cross entropy will be used for classification
        update : theano update function
        """
        self._depth = depth
        self._n_estimators = n_estimators
        self._n_hidden = n_hidden
        self._n_outputs = n_outputs
        self._loss = loss
        self._regression = regression
        self._multiclass = multiclass
        self._learning_rate = learning_rate
        self._num_epochs = num_epochs
        self._pi_iters = pi_iters
        self._sgd_iters = sgd_iters
        self._batch_size = batch_size
        self._momentum = momentum
        self._update = update

        self.t_input = T.matrix('input')
        self.t_label = T.matrix('output')

        self._cached_trainable_params = None
        self._cached_params = None

        # One network output per internal (decision) node of every tree in the ensemble.
        self._n_net_out = n_estimators * ((1 << depth) - 1)

        self.l_input = InputLayer((None, n_inputs))
        self.l_dense1 = DenseLayer(self.l_input, self._n_hidden, nonlinearity=rectify)
        if dropout != 0:
            self.l_dense1 = DropoutLayer(self.l_dense1, p=dropout)
        if not __DEBUG_NO_FOREST__:
            self.l_dense2 = DenseLayer(self.l_dense1, self._n_net_out, nonlinearity=sigmoid)
            self.l_forest = NeuralForestLayer(self.l_dense2, self._depth, self._n_estimators, self._n_outputs, self._pi_iters)
        else:
            self.l_forest = DenseLayer(self.l_dense1, self._n_outputs, nonlinearity=softmax)
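
A quick sanity check of the output size computed in this constructor (`n_estimators * ((1 << depth) - 1)`): a binary tree with `depth` levels of splits has 2**depth - 1 internal nodes, and the dense layer presumably emits one sigmoid routing value per node of every tree. Illustration only, using the constructor defaults:

# Illustration of the output-size arithmetic above (not part of the original class).
depth = 5
n_estimators = 20
nodes_per_tree = (1 << depth) - 1        # 31 internal (decision) nodes per tree
n_net_out = n_estimators * nodes_per_tree
print(n_net_out)                         # 620 sigmoid outputs feeding NeuralForestLayer
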
Example #5
def decoder(z, params, config):

    mb_size = config['mb_size']
    num_latent = config['num_latent']
    num_hidden = config['num_hidden']

    h_out_1 = HiddenLayer(num_in = num_latent, num_out = num_hidden, W = params['W_dec_1'], b = params['b_dec_1'], activation = 'relu', batch_norm = True)

    h_out_2 = HiddenLayer(num_in = num_hidden, num_out = num_hidden, W = params['W_dec_2'], b = params['b_dec_2'], activation = 'relu', batch_norm = True)

    h_out_3 = DenseLayer((mb_size, num_hidden), num_units = 4000, nonlinearity=None, W = params['W_dec_3'], b = params['b_dec_3'])

    h_out_1_value = h_out_1.output(z)
    h_out_2_value = h_out_2.output(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    return {'h' : h_out_3_value}
Example #6
def define_network(x, params, config):

    num_hidden = config['num_hidden']
    mb_size = config['mb_size']
    num_latent = config['num_latent']

    enc = encoder(x, params, config)

    mean_layer = DenseLayer((mb_size, num_hidden), num_units = num_latent, nonlinearity=None, W = params['z_mean_W'], b = params['z_mean_b'])
    #std_layer = DenseLayer((mb_size, num_hidden), num_units = num_latent, nonlinearity=None, W = params['z_std_W'], b = params['z_std_b'])

    mean = mean_layer.get_output_for(enc['h'])
    #std = T.exp(std_layer.get_output_for(enc['h']))

    import random as rng
    srng = theano.tensor.shared_randomstreams.RandomStreams(420)

    z_sampled = srng.normal(size = mean.shape, avg = 0.0, std = 1.0)
    z_extra = 0.0 * srng.normal(size = mean.shape, avg = 0.0, std = 1.0)

    z_reconstruction = mean

    #z_var = std**2
    z_loss = 0.0 * T.sum(mean)#0.001 * 0.5 * T.sum(mean**2 + z_var - T.log(z_var) - 1.0)

    dec_reconstruction = decoder(z_reconstruction, z_extra, params, config)
    dec_sampled = decoder(z_sampled, z_extra, params, config)

    interp_lst = []

    for j in range(0,128):
        interp_lst.append(z_reconstruction[0] * (j/128.0) + z_reconstruction[-1] * (1 - j / 128.0))

    z_interp = T.concatenate([interp_lst], axis = 1)

    dec_interp = decoder(z_interp, z_extra, params, config)

    results_map = {'reconstruction' : dec_reconstruction['h'], 'z_loss' : z_loss, 'sample' : dec_sampled['h'], 'interp' : dec_interp['h'], 'z' : z_reconstruction}

    return results_map
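
The `interp_lst` loop above takes 128 convex combinations of the first and last latent codes in the minibatch and decodes them as an interpolation. A standalone NumPy sketch of the same idea, with made-up shapes (an illustration, not the original Theano graph):

import numpy as np

z_first = np.random.randn(32)            # hypothetical latent code of the first example
z_last = np.random.randn(32)             # hypothetical latent code of the last example

alphas = np.linspace(0.0, 1.0, 128)      # 128 evenly spaced blend factors
z_interp = np.stack([(1 - a) * z_first + a * z_last for a in alphas])
print(z_interp.shape)                    # (128, 32)
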
Example #7
    def test_get_all_params(self):
        from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
        l1 = InputLayer((10, 20))
        l2 = DenseLayer(l1, 30)
        l3 = DenseLayer(l2, 40)

        assert get_all_params(l3) == l2.get_params() + l3.get_params()
        assert (get_all_params(l3, regularizable=False) ==
                (l2.get_params(regularizable=False) +
                 l3.get_params(regularizable=False)))

        assert (get_all_params(l3, regularizable=True) ==
                (l2.get_params(regularizable=True) +
                 l3.get_params(regularizable=True)))
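
As the test shows, `get_all_params` collects the parameters of every layer reachable below the given layer. A typical follow-up, shown here only as a sketch (not part of the test), is to pass the trainable subset to an update rule:

# Sketch: collect trainable parameters and build SGD updates for them.
import lasagne
import theano.tensor as T
from lasagne.layers import InputLayer, DenseLayer, get_output, get_all_params

x, y = T.matrix('x'), T.ivector('y')
l_in = InputLayer((None, 20), input_var=x)
l_out = DenseLayer(DenseLayer(l_in, 30), 10,
                   nonlinearity=lasagne.nonlinearities.softmax)

loss = lasagne.objectives.categorical_crossentropy(get_output(l_out), y).mean()
params = get_all_params(l_out, trainable=True)
updates = lasagne.updates.sgd(loss, params, learning_rate=0.1)
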
Example #8
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign,
                name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key,
            name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b
        
        self.beta = DenseLayer(controller, num_units=1,
            W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta,
            name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller, num_units=1,
            W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate,
            name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller, num_units=num_shifts,
            W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift,
            name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller, num_units=1,
            W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma,
            name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(
            weights_init, (1, self.memory_shape[0]),
            name='weights_init', trainable=learn_init, regularizable=False)
Example #9
class WriteHead(Head):
    r"""
    Write head. In addition to the weight vector, the write head
    also outputs an add vector :math:`a_{t}` and an erase vector
    :math:`e_{t}` defined by

    .. math ::
        \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\
        a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a})\\
        e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e})

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    W_hid_to_erase: callable, Numpy array or Theano shared variable
    b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_erase: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`e_{t}`
    W_hid_to_add: callable, Numpy array or Theano shared variable
    b_hid_to_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`a_{t}`
    W_hid_to_sign_add: callable, Numpy array, Theano shared variable, or ``None``
    b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\delta_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1., high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(controller, num_shifts=num_shifts, memory_shape=memory_shape,
            W_hid_to_sign=W_hid_to_sign, b_hid_to_sign=b_hid_to_sign, nonlinearity_sign=nonlinearity_sign,
            W_hid_to_key=W_hid_to_key, b_hid_to_key=b_hid_to_key, nonlinearity_key=nonlinearity_key,
            W_hid_to_beta=W_hid_to_beta, b_hid_to_beta=b_hid_to_beta, nonlinearity_beta=nonlinearity_beta,
            W_hid_to_gate=W_hid_to_gate, b_hid_to_gate=b_hid_to_gate, nonlinearity_gate=nonlinearity_gate,
            W_hid_to_shift=W_hid_to_shift, b_hid_to_shift=b_hid_to_shift, nonlinearity_shift=nonlinearity_shift,
            W_hid_to_gamma=W_hid_to_gamma, b_hid_to_gamma=b_hid_to_gamma, nonlinearity_gamma=nonlinearity_gamma,
            weights_init=weights_init, learn_init=learn_init, **kwargs)
    
        self.erase = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_erase, b=b_hid_to_erase, nonlinearity=nonlinearity_erase,
            name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_add, b=b_hid_to_add, nonlinearity=nonlinearity_add,
            name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign_add, b=b_hid_to_sign_add, nonlinearity=nonlinearity_sign_add,
                name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None

    def get_params(self, **tags):
        params = super(WriteHead, self).get_params(**tags)
        params += self.erase.get_params(**tags)
        params += self.add.get_params(**tags)
        if self.sign_add is not None:
            params += self.sign_add.get_params(**tags)

        return params
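
The erase and add vectors produced by this head feed the standard NTM write update, M_t = M_{t-1} * (1 - w_t e_t^T) + w_t a_t^T (Graves et al., 2014). A plain-NumPy sketch of that update, for illustration only; the actual update is applied inside the NTM layer, not in the head:

import numpy as np

memory_shape = (128, 20)
M = np.random.randn(*memory_shape)                    # memory M_{t-1}
w = np.random.dirichlet(np.ones(memory_shape[0]))     # write weights w_t (sum to 1)
e = np.random.uniform(0., 1., memory_shape[1])        # erase vector e_t in [0, 1]
a = np.random.uniform(0., 1., memory_shape[1])        # add vector a_t

M_erased = M * (1. - np.outer(w, e))                  # erase first ...
M_new = M_erased + np.outer(w, a)                     # ... then add
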
Example #10
    def __init__(self,
                 feature_shape,
                 latent_size,
                 hidden_structure,
                 reconstruction_distribution=None,
                 number_of_reconstruction_classes=None,
                 use_count_sum=False):

        self.use_count_sum = use_count_sum and \
            (reconstruction_distribution != "bernoulli")

        print("Setting up model.")
        print("    feature size: {}".format(feature_shape))
        print("    latent size: {}".format(latent_size))
        print("    hidden structure: {}".format(", ".join(
            map(str, hidden_structure))))
        if type(reconstruction_distribution) == str:
            print("    reconstruction distribution: " +
                  reconstruction_distribution)
        else:
            print("    reconstruction distribution: custom")
        if number_of_reconstruction_classes > 0:
            print(
                "    reconstruction classes: {}".format(
                    number_of_reconstruction_classes), " (including 0s)")
        if self.use_count_sum:
            print("    using count sums")
        print("")

        # Setup

        super(VariationalAutoEncoderForCounts, self).__init__()

        self.feature_shape = feature_shape
        self.latent_size = latent_size
        self.hidden_structure = hidden_structure

        symbolic_x = T.matrix('x')  # counts
        symbolic_z = T.matrix('z')  # latent variable

        self.number_of_epochs_trained = 0
        symbolic_learning_rate = T.scalar("epsilon")
        self.learning_curves = {
            "training": {
                "LB": [],
                "ENRE": [],
                "KL": []
            },
            "validation": {
                "LB": [],
                "ENRE": [],
                "KL": []
            }
        }

        if reconstruction_distribution:

            if type(reconstruction_distribution) == str:
                if number_of_reconstruction_classes > 0:
                    reconstruction_distribution = "softmax_" + \
                        reconstruction_distribution
                    self.k_max = number_of_reconstruction_classes - 1
                    reconstruction_distribution = \
                        reconstruction_distributions[reconstruction_distribution]
                    reconstruction_distribution = \
                        reconstruction_distribution(self.k_max)
                else:
                    reconstruction_distribution = \
                        reconstruction_distributions[reconstruction_distribution]

            self.x_parameters = reconstruction_distribution["parameters"]
            self.reconstruction_activation_functions = \
                reconstruction_distribution["activation functions"]

            self.expectedNegativeReconstructionError = \
                reconstruction_distribution["function"]
            self.meanOfReconstructionDistribution = reconstruction_distribution[
                "mean"]
            self.preprocess = reconstruction_distribution["preprocess"]
        else:
            reconstruction_distribution = "Gaussian (default)"

            # Use a Gaussian distribution as standard
            self.x_parameters = ["mu", "sigma"]
            self.reconstruction_activation_functions = {
                "mu": identity,
                "sigma": identity
            }
            self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \
                log_normal(x, x_theta["mu"], x_theta["sigma"], eps)
            self.meanOfReconstructionDistribution = lambda x_theta: x_theta[
                "mu"]
            self.preprocess = lambda x: x

        # if number_of_reconstruction_classes > 0:
        #
        #     self.x_parameters += ["p_k"]
        #     self.reconstruction_activation_functions["p_k"] = softmax
        #     log_distribution = self.expectedNegativeReconstructionError
        #     self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \
        #         log_cross_entropy_extended(x, x_theta,
        #             log_distribution, k_max = number_of_reconstruction_classes - 1,
        #             eps = 0.0)
        #     mean_of_distribution = self.meanOfReconstructionDistribution
        #     self.meanOfReconstructionDistribution = lambda x_theta: \
        #         meanOfCrossEntropyExtendedDistibution(x_theta,
        #             mean_of_distribution, k_max = number_of_reconstruction_classes - 1)
        #     self.k_max = number_of_reconstruction_classes - 1

        if self.use_count_sum:
            symbolic_n = T.matrix('n')  # sum of counts

        # Models

        ## Recognition model q(z|x)

        l_enc_in = InputLayer(shape=(None, feature_shape), name="ENC_INPUT")
        l_enc = l_enc_in

        for i, hidden_size in enumerate(hidden_structure):
            l_enc = DenseLayer(l_enc,
                               num_units=hidden_size,
                               nonlinearity=rectify,
                               name='ENC_DENSE{:d}'.format(i + 1))

        l_z_mu = DenseLayer(l_enc,
                            num_units=latent_size,
                            nonlinearity=None,
                            name='ENC_Z_MU')
        l_z_log_var = DenseLayer(l_enc,
                                 num_units=latent_size,
                                 nonlinearity=lambda x: T.clip(x, -10, 10),
                                 name='ENC_Z_LOG_VAR')

        # Sample a latent representation z \sim q(z|x) = N(mu(x), logvar(x))
        l_z = SimpleSampleLayer(mean=l_z_mu,
                                log_var=l_z_log_var,
                                name="ENC_SAMPLE")

        self.encoder = l_z

        ## Generative model p(x|z)

        l_dec_z_in = InputLayer(shape=(None, latent_size), name="DEC_INPUT")

        if self.use_count_sum:
            l_dec_n_in = InputLayer(shape=(None, 1), name="DEC_N_INPUT")
            l_dec = ConcatLayer([l_dec_z_in, l_dec_n_in],
                                axis=1,
                                name="DEC_MERGE_INPUT")
        else:
            l_dec = l_dec_z_in

        for i, hidden_size in enumerate(reversed(hidden_structure)):
            l_dec = DenseLayer(
                l_dec,
                num_units=hidden_size,
                nonlinearity=rectify,
                name='DEC_DENSE{:d}'.format(len(hidden_structure) - i))

        l_x_theta = {}

        for p in self.x_parameters:
            p_name = 'DEC_X_' + p.upper()
            if self.reconstruction_activation_functions[p] == softmax:
                l_dense = DenseLayer(l_dec,
                                     num_units=feature_shape *
                                     (self.k_max + 1),
                                     nonlinearity=identity,
                                     name=p_name + "_DENSE")
                l_reshape = ReshapeLayer(l_dense, (-1, (self.k_max + 1)))
                l_softmax = DenseLayer(l_reshape,
                                       num_units=(self.k_max + 1),
                                       nonlinearity=softmax,
                                       name=p_name + "_SOFTMAX")
                l_x_theta[p] = ReshapeLayer(l_softmax, (-1, feature_shape,
                                                        (self.k_max + 1)))
            else:
                l_x_theta[p] = DenseLayer(
                    l_dec,
                    num_units=feature_shape,
                    nonlinearity=self.reconstruction_activation_functions[p],
                    name=p_name)

        self.decoder = {p: l_x_theta[p] for p in self.x_parameters}

        ## Get outputs from models

        ## Training outputs
        z_train, z_mu_train, z_log_var_train = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x},
            deterministic=False)
        inputs = {l_dec_z_in: z_train}
        if self.use_count_sum:
            inputs[l_dec_n_in] = symbolic_n
        x_theta_train = get_output([l_x_theta[p] for p in self.x_parameters],
                                   inputs,
                                   deterministic=False)
        x_theta_train = {
            p: o
            for p, o in zip(self.x_parameters, x_theta_train)
        }

        ## Evaluation outputs
        z_eval, z_mu_eval, z_log_var_eval = get_output(
            [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x},
            deterministic=True)
        inputs = {l_dec_z_in: z_eval}
        if self.use_count_sum:
            inputs[l_dec_n_in] = symbolic_n
        x_theta_eval = get_output([l_x_theta[p] for p in self.x_parameters],
                                  inputs,
                                  deterministic=True)
        x_theta_eval = {p: o for p, o in zip(self.x_parameters, x_theta_eval)}

        ## Sample outputs

        inputs = {l_dec_z_in: symbolic_z}
        if self.use_count_sum:
            inputs[l_dec_n_in] = symbolic_n
        x_theta_sample = get_output([l_x_theta[p] for p in self.x_parameters],
                                    inputs,
                                    deterministic=True)
        x_theta_sample = {
            p: o
            for p, o in zip(self.x_parameters, x_theta_sample)
        }

        # Likelihood

        lower_bound_train, log_p_x_train, KL__train = \
            self.lowerBound(symbolic_x, x_theta_train, z_mu_train, z_log_var_train)
        lower_bound_eval, log_p_x_eval, KL__eval = \
            self.lowerBound(symbolic_x, x_theta_eval, z_mu_eval, z_log_var_eval)

        all_parameters = get_all_params(
            [l_z] + [l_x_theta[p] for p in self.x_parameters], trainable=True)

        print("Parameters to train:")
        for parameter in all_parameters:
            print("    {}: {}".format(parameter, parameter.get_value().shape))

        # Let Theano do its magic and get all the gradients we need for training
        all_gradients = T.grad(-lower_bound_train, all_parameters)

        # Set the update function for parameters. The Adam optimizer works really well with VAEs.
        update_expressions = updates.adam(all_gradients,
                                          all_parameters,
                                          learning_rate=symbolic_learning_rate)

        inputs = [symbolic_x]
        if self.use_count_sum:
            inputs.append(symbolic_n)
        inputs.append(symbolic_learning_rate)

        self.f_train = theano.function(
            inputs=inputs,
            outputs=[lower_bound_train, log_p_x_train, KL__train],
            updates=update_expressions)

        inputs = [symbolic_x]
        if self.use_count_sum:
            inputs.append(symbolic_n)

        self.f_eval = theano.function(
            inputs=inputs, outputs=[lower_bound_eval, log_p_x_eval, KL__eval])

        self.f_z = theano.function(inputs=[symbolic_x], outputs=[z_eval])

        inputs = [symbolic_z]
        if self.use_count_sum:
            inputs.append(symbolic_n)

        self.f_sample = theano.function(
            inputs=inputs,
            outputs=[x_theta_sample[p] for p in self.x_parameters])

        inputs = [symbolic_x]
        if self.use_count_sum:
            inputs.append(symbolic_n)

        self.f_recon = theano.function(
            inputs=inputs,
            outputs=[x_theta_eval[p] for p in self.x_parameters])
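
`SimpleSampleLayer` above presumably draws z with the usual reparameterization trick for a diagonal Gaussian, z = mu + exp(0.5 * log_var) * eps with eps ~ N(0, I). A NumPy sketch of that sampling step (an assumption about the layer, shown only for clarity):

import numpy as np

def sample_z(mu, log_var, rng=np.random):
    # Reparameterization trick: a differentiable sample from N(mu, exp(log_var)).
    eps = rng.standard_normal(mu.shape)
    return mu + np.exp(0.5 * log_var) * eps

mu = np.zeros((4, 10))          # batch of 4, latent size 10
log_var = np.zeros((4, 10))     # unit variance
z = sample_z(mu, log_var)
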
Example #11
    return net


# Load model weights and metadata
d = pickle.load(open('../input/pretrained/vgg19.pkl'))

# Build the network and fill with pretrained weights
net = build_model()

# Define loss function and metrics, and get an updates dictionary
X_sym = T.tensor4()
y_sym = T.ivector()

# We'll connect our output classifier to the last fully connected layer of the network
net['new_output'] = DenseLayer(net['drop7'],
                               num_units=8,
                               nonlinearity=softmax,
                               W=lasagne.init.Normal(0.01))

prediction = lasagne.layers.get_output(net['new_output'], X_sym)
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym)
loss = loss.mean()

acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym),
             dtype=theano.config.floatX)

learning_rate = theano.shared(np.array(0.0003, dtype=theano.config.floatX))
learning_rate_decay = np.array(0.3, dtype=theano.config.floatX)
updates = OrderedDict()

for name, layer in net.items():
    layer_params = layer.get_params(trainable=True)
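
The snippet breaks off inside the loop over `net.items()`. A common way to finish it, sketched here under that assumption rather than taken from the original code, is to build the updates dictionary layer by layer so the pretrained layers get a smaller learning rate than the new classifier:

# Sketch only: fine-tune pretrained layers gently, train the new head at full rate.
for name, layer in net.items():
    layer_params = layer.get_params(trainable=True)
    layer_lr = learning_rate if name == 'new_output' else learning_rate * 0.1
    updates.update(lasagne.updates.sgd(loss, layer_params, layer_lr))

train_fn = theano.function([X_sym, y_sym], [loss, acc], updates=updates)
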
Example #12
def classif(X, y):
    l = InputLayer(shape=(None, X.shape[1]))
    l = DenseLayer(l, num_units=len(np.unique(y)), nonlinearity=softmax)
    net = NeuralNet(l, update_learning_rate=0.01)
    net.fit(X, y)
    print(net.score(X, y))
Example #13
    def test_initialization_with_layer_instance_bad_params(self, NeuralNet):
        layer = DenseLayer(InputLayer(shape=(128, 13)), num_units=2)
        nn = NeuralNet(layers=layer, dense1_num_units=3)
        with pytest.raises(ValueError):
            nn.initialize_layers()
Example #14
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.
                 rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1.,
                                                                    high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(controller,
                                        num_shifts=num_shifts,
                                        memory_shape=memory_shape,
                                        W_hid_to_sign=W_hid_to_sign,
                                        b_hid_to_sign=b_hid_to_sign,
                                        nonlinearity_sign=nonlinearity_sign,
                                        W_hid_to_key=W_hid_to_key,
                                        b_hid_to_key=b_hid_to_key,
                                        nonlinearity_key=nonlinearity_key,
                                        W_hid_to_beta=W_hid_to_beta,
                                        b_hid_to_beta=b_hid_to_beta,
                                        nonlinearity_beta=nonlinearity_beta,
                                        W_hid_to_gate=W_hid_to_gate,
                                        b_hid_to_gate=b_hid_to_gate,
                                        nonlinearity_gate=nonlinearity_gate,
                                        W_hid_to_shift=W_hid_to_shift,
                                        b_hid_to_shift=b_hid_to_shift,
                                        nonlinearity_shift=nonlinearity_shift,
                                        W_hid_to_gamma=W_hid_to_gamma,
                                        b_hid_to_gamma=b_hid_to_gamma,
                                        nonlinearity_gamma=nonlinearity_gamma,
                                        weights_init=weights_init,
                                        learn_init=learn_init,
                                        **kwargs)

        self.erase = DenseLayer(controller,
                                num_units=self.memory_shape[1],
                                W=W_hid_to_erase,
                                b=b_hid_to_erase,
                                nonlinearity=nonlinearity_erase,
                                name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_add,
                              b=b_hid_to_add,
                              nonlinearity=nonlinearity_add,
                              name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller,
                                       num_units=self.memory_shape[1],
                                       W=W_hid_to_sign_add,
                                       b=b_hid_to_sign_add,
                                       nonlinearity=nonlinearity_sign_add,
                                       name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None
Example #15
def lasagne_separate(M,
                     P,
                     FE,
                     W1,
                     W2,
                     z1,
                     z2,
                     hh=.0001,
                     ep=5000,
                     d=0,
                     wsp=.0001,
                     plt=True):
    from paris.signal import bss_eval

    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.astype(float32))

    # Input is the learned dictionary set
    lW = hstack((W1.T, W2.T)).astype(float32)
    _lW = Th.matrix('_lW')
    fI = InputLayer(shape=lW.shape, input_var=_lW)

    # Split in two paths
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Compute source modulators
    R1 = DenseLayer(dfW1,
                    num_units=M.shape[1],
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)
    R2 = DenseLayer(dfW2,
                    num_units=M.shape[1],
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)

    # Bring to standard orientation
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
    cost = (_M*(Th.log(_M+eps) - Th.log( get_output( R)+eps)) - _M + get_output( R)).mean() \
       + wsp*(Th.mean( abs( R1.W))+Th.mean( abs( R2.W)))

    # Train it using downhill
    opt = downhill.build('rprop',
                         loss=cost,
                         inputs=[_lW],
                         params=get_all_params(R))
    train = downhill.Dataset(lW, batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, _lW, lW) + eps
    _r1 = nget(R1, _lW, lW)
    _r2 = nget(R2, _lW, lW)
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
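
The cost minimized above is the generalized KL divergence between the input matrix M (a magnitude spectrogram in this source-separation setting) and the summed reconstruction from the two dictionaries, plus an L1 sparsity penalty on the DenseLayer weights, which act as source activations here. A NumPy sketch of the divergence term alone, with `eps` guarding the logarithms as in the original:

import numpy as np

def gen_kl(M, R, eps=1e-8):
    # Generalized KL divergence, averaged elementwise, mirroring the cost above.
    return np.mean(M * (np.log(M + eps) - np.log(R + eps)) - M + R)

M = np.abs(np.random.randn(513, 100))    # hypothetical magnitude spectrogram
R = np.abs(np.random.randn(513, 100))    # hypothetical reconstruction R1 + R2
print(gen_kl(M, R))
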
Example #16
def create_network(available_actions_count):
    # Create the input variables
    s1 = tensor.tensor4("State")
    a = tensor.vector("Action", dtype="int32")
    q2 = tensor.vector("Q2")
    r = tensor.vector("Reward")
    isterminal = tensor.vector("IsTerminal", dtype="int8")

    # Create the input layer of the network.
    dqn = InputLayer(shape=[None, 1, resolution[0], resolution[1]],
                     input_var=s1)

    # Add 2 convolutional layers with ReLu activation
    dqn = Conv2DLayer(dqn,
                      num_filters=8,
                      filter_size=[6, 6],
                      nonlinearity=rectify,
                      W=HeUniform("relu"),
                      b=Constant(.1),
                      stride=3)
    dqn = Conv2DLayer(dqn,
                      num_filters=8,
                      filter_size=[3, 3],
                      nonlinearity=rectify,
                      W=HeUniform("relu"),
                      b=Constant(.1),
                      stride=2)

    # Add a single fully-connected layer.
    dqn = DenseLayer(dqn,
                     num_units=128,
                     nonlinearity=rectify,
                     W=HeUniform("relu"),
                     b=Constant(.1))

    # Add the output layer (also fully-connected).
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_count, nonlinearity=None)

    # Define the loss function
    q = get_output(dqn)
    # target differs from q only for the selected action. The following means:
    # target_Q(s,a) = r + gamma * max Q(s2,_) if not isterminal else r
    target_q = tensor.set_subtensor(
        q[tensor.arange(q.shape[0]), a],
        r + discount_factor * (1 - isterminal) * q2)
    loss = squared_error(q, target_q).mean()

    # Update the parameters according to the computed gradient using RMSProp.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compile the theano functions
    print("Compiling the network ...")
    function_learn = theano.function([s1, q2, a, r, isterminal],
                                     loss,
                                     updates=updates,
                                     name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1],
                                               tensor.argmax(q),
                                               name="test_fn")
    print("Network compiled.")

    def simple_get_best_action(state):
        return function_get_best_action(
            state.reshape([1, 1, resolution[0], resolution[1]]))

    # Returns Theano objects for the net and functions.
    return dqn, function_learn, function_get_q_values, simple_get_best_action
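
The target used in the squared-error loss follows one-step Q-learning: the bootstrap term gamma * max_a' Q(s2, a') is kept only for non-terminal transitions. A small NumPy illustration of that target, where `q2` plays the role of max_a' Q(s2, a'):

import numpy as np

discount_factor = 0.99
r = np.array([1.0, 0.0, -1.0])                    # rewards
q2 = np.array([5.0, 2.0, 7.0])                    # max_a' Q(s2, a') per transition
isterminal = np.array([0, 0, 1], dtype=np.int8)   # last transition ends the episode

target = r + discount_factor * (1 - isterminal) * q2
print(target)                                     # [ 5.95  1.98 -1.  ]
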
Example #17
    def _get_l_out(self, input_vars):
        check_options(self.options)
        id_tag = (self.id + '/') if self.id else ''

        prev_output_var, mask_var = input_vars[-2:]
        color_input_vars = input_vars[:-2]

        context_len = self.context_len if hasattr(self, 'context_len') else 1
        l_color_repr, color_inputs = self.color_vec.get_input_layer(
            color_input_vars,
            recurrent_length=self.seq_vec.max_len - 1,
            cell_size=self.options.speaker_cell_size,
            context_len=context_len,
            id=self.id
        )
        l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
        for i in range(1, self.options.speaker_hidden_color_layers + 1):
            l_hidden_color = NINLayer(
                l_hidden_color, num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_color%d' % i)
        l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

        l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                                input_var=prev_output_var,
                                name=id_tag + 'prev_input')
        l_prev_embed = EmbeddingLayer(l_prev_out, input_size=len(self.seq_vec.tokens),
                                      output_size=self.options.speaker_cell_size,
                                      name=id_tag + 'prev_embed')
        l_in = ConcatLayer([l_hidden_color, l_prev_embed], axis=2, name=id_tag + 'color_prev')
        l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                               input_var=mask_var, name=id_tag + 'mask_input')
        l_rec_drop = l_in

        cell = CELLS[self.options.speaker_cell]
        cell_kwargs = {
            'mask_input': (None if self.options.speaker_no_mask else l_mask_in),
            'grad_clipping': self.options.speaker_grad_clipping,
            'num_units': self.options.speaker_cell_size,
        }
        if self.options.speaker_cell == 'LSTM':
            cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.speaker_forget_bias))
        if self.options.speaker_cell != 'GRU':
            cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.speaker_nonlinearity]

        for i in range(1, self.options.speaker_recurrent_layers):
            l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
            if self.options.speaker_dropout > 0.0:
                l_rec_drop = DropoutLayer(l_rec, p=self.options.speaker_dropout,
                                          name=id_tag + 'rec%d_drop' % i)
            else:
                l_rec_drop = l_rec
        l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % self.options.speaker_recurrent_layers,
                     **cell_kwargs)
        l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size),
                               name=id_tag + 'reshape')
        l_hidden_out = l_shape
        for i in range(1, self.options.speaker_hidden_out_layers + 1):
            l_hidden_out = DenseLayer(
                l_hidden_out, num_units=self.options.speaker_cell_size,
                nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
                name=id_tag + 'hidden_out%d' % i)
        l_softmax = DenseLayer(l_hidden_out, num_units=len(self.seq_vec.tokens),
                               nonlinearity=softmax, name=id_tag + 'softmax')
        l_out = ReshapeLayer(l_softmax, (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
                             name=id_tag + 'out')

        return l_out, color_inputs + [l_prev_out, l_mask_in]
Example #18
    def _build_disc(self):
        inputs = OrderedDict()
        inputs['x'] = InputLayer((None, 4, 64, 64))
        inputs['c'] = InputLayer((None, 843))
        inputs['v'] = InputLayer((None, 4))
        inputs['t'] = InputLayer((None, 8))

        layer_c = inputs['c']
        layer_c = DenseLayer(layer_c, 512, nonlinearity=leaky_rectify)
        layer_c.params[layer_c.W].add('dense')
        layer_c = (DenseLayer(layer_c, 512, nonlinearity=leaky_rectify))
        layer_c.params[layer_c.W].add('dense')

        layer_v = inputs['v']
        layer_v = DenseLayer(layer_v, 512, nonlinearity=leaky_rectify)
        layer_v.params[layer_v.W].add('dense')
        layer_v = (DenseLayer(layer_v, 512, nonlinearity=leaky_rectify))
        layer_v.params[layer_v.W].add('dense')

        layer_t = inputs['t']
        layer_t = DenseLayer(layer_t, 512, nonlinearity=leaky_rectify)
        layer_t.params[layer_t.W].add('dense')
        layer_t = (DenseLayer(layer_t, 512, nonlinearity=leaky_rectify))
        layer_t.params[layer_t.W].add('dense')

        layer_i = ConcatLayer([layer_c, layer_v, layer_t])
        layer_i = DenseLayer(layer_i, 1024, nonlinearity=leaky_rectify)
        layer_i.params[layer_i.W].add('dense')
        layer_i = DenseLayer(layer_i, 1024, nonlinearity=None)
        layer_i.params[layer_i.W].add('dense')

        layer_x = inputs['x']
        layer_x_n = layer_x
        layer_x = weight_norm(Conv2DLayer(layer_x_n, 64, 5, 2, 'same', nonlinearity=None, b=None))
        if self.reg: layer_x = dropout(layer_x)
        layer_x = NonlinearityLayer(layer_x, leaky_rectify)
        layer_x = weight_norm(Conv2DLayer(layer_x, 64, 5, 2, 'same', nonlinearity=None, b=None))
        if self.reg: layer_x = dropout(layer_x)
        layer_x = NonlinearityLayer(layer_x, leaky_rectify)
        layer_x = weight_norm(Conv2DLayer(layer_x, 128, 5, 2, 'same', nonlinearity=None, b=None))
        if self.reg: layer_x = dropout(layer_x)
        layer_x = NonlinearityLayer(layer_x, leaky_rectify)
        layer_x = weight_norm(Conv2DLayer(layer_x, 256, 5, 2, 'same', nonlinearity=None, b=None))
        layer_x = NonlinearityLayer(layer_x, leaky_rectify)

        layer_x = FlattenLayer(layer_x)
        layer_x = DenseLayer(layer_x, 1024, nonlinearity=leaky_rectify)
        layer_x.params[layer_x.W].add('dense')
        layer_x = DenseLayer(layer_x, 1024, nonlinearity=None)
        layer_x.params[layer_x.W].add('dense')

        layer = ElemwiseMergeLayer([layer_i, layer_x], T.mul)
        layer = ConcatLayer([layer, layer_x, layer_i])
        layer = DenseLayer(layer, 1024, nonlinearity=leaky_rectify)
        layer.params[layer.W].add('dense')

        layer_s = layer
        layer_s = DenseLayer(layer_s, 1, nonlinearity=None)
        layer_s.params[layer_s.W].add('dense')
        layer_s_0 = NonlinearityLayer(layer_s, nonlinearity=sigmoid)
        # log(sigmoid(x)) = x - log(1 + exp(x)) and log(1 - sigmoid(x)) = -log(1 + exp(x)).
        layer_s_1 = NonlinearityLayer(layer_s, nonlinearity=lambda x: x - T.log(1 + T.exp(x)))
        layer_s_2 = NonlinearityLayer(layer_s, nonlinearity=lambda x: -T.log(1 + T.exp(x)))

        outputs = OrderedDict()
        outputs['s'] = layer_s_0
        outputs['log(s)'] = layer_s_1
        outputs['log(1-s)'] = layer_s_2

        self.disc_inputs = inputs
        self.disc_outputs = outputs

Example #19
    def _build_gen(self):
        size = 64
        s, s2, s4, s8, s16 = size, size // 2, size // 4, size // 8, size // 16
        inputs = OrderedDict()
        inputs['c'] = InputLayer((None, 843))
        inputs['v'] = InputLayer((None, 4))
        inputs['t'] = InputLayer((None, 8))

        layer_c = inputs['c']
        layer_c = DenseLayer(layer_c, 512, nonlinearity=rectify)
        layer_c.params[layer_c.W].add('dense')
        layer_c = DenseLayer(layer_c, 512, nonlinearity=rectify)
        layer_c.params[layer_c.W].add('dense')

        layer_v = inputs['v']
        layer_v = DenseLayer(layer_v, 512, nonlinearity=rectify)
        layer_v.params[layer_v.W].add('dense')
        layer_v = DenseLayer(layer_v, 512, nonlinearity=rectify)
        layer_v.params[layer_v.W].add('dense')

        layer_t = inputs['t']
        layer_t = DenseLayer(layer_t, 512, nonlinearity=rectify)
        layer_t.params[layer_t.W].add('dense')
        layer_t = DenseLayer(layer_t, 512, nonlinearity=rectify)
        layer_t.params[layer_t.W].add('dense')

        layer = ConcatLayer([layer_c, layer_v, layer_t])
        layer = DenseLayer(layer, 1024, nonlinearity=rectify)
        layer.params[layer.W].add('dense')
        layer = DenseLayer(layer, 1024, nonlinearity=rectify)
        layer.params[layer.W].add('dense')

        layer = DenseLayer(layer, 768 * s16 * s16, nonlinearity=rectify)
        layer.params[layer.W].add('dense')
        layer = ReshapeLayer(layer, (-1, 768, s16, s16))

        layer = InstanceNormalization(layer, True)
        layer = weight_norm(
            TransposedConv2DLayer(layer, 384, 5, 2, 'same', output_size=(s8, s8), nonlinearity=None, b=None),
            transposed=True)
        if self.reg: layer = dropout(layer)
        layer = NonlinearityLayer(layer, rectify)
        layer = weight_norm(
            TransposedConv2DLayer(layer, 256, 5, 2, 'same', output_size=(s4, s4), nonlinearity=None, b=None),
            transposed=True)
        if self.reg: layer = dropout(layer)
        layer = NonlinearityLayer(layer, rectify)
        layer = weight_norm(
            TransposedConv2DLayer(layer, 192, 5, 2, 'same', output_size=(s2, s2), nonlinearity=None, b=None),
            transposed=True)
        if self.reg: layer = dropout(layer)
        layer = NonlinearityLayer(layer, rectify)

        layer_img = TransposedConv2DLayer(layer, 3, 5, 2, 'same', output_size=(s, s), nonlinearity=tanh)
        layer_msk = TransposedConv2DLayer(layer, 1, 5, 2, 'same', output_size=(s, s), nonlinearity=sigmoid)

        layer = ConcatLayer([layer_img, layer_msk])
        outputs = OrderedDict()
        outputs['x'] = layer
        self.gen_inputs = inputs
        self.gen_outputs = outputs
Example #20
def vgg16(input_var=None, image_size=256):
    from lasagne.layers import InputLayer
    from lasagne.layers import DenseLayer
    from lasagne.layers import NonlinearityLayer
    from lasagne.layers import DropoutLayer
    from lasagne.layers import Pool2DLayer as PoolLayer
    from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
    from lasagne.nonlinearities import softmax

    net = {}
    net['input'] = InputLayer((None, 4, image_size, image_size),
                              input_var=input_var)
    net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1, flip_filters=False)
    net['conv1_2'] = ConvLayer(net['conv1_1'],
                               64,
                               3,
                               pad=1,
                               flip_filters=False)
    net['pool1'] = PoolLayer(net['conv1_2'], 2)
    net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1, flip_filters=False)
    net['conv2_2'] = ConvLayer(net['conv2_1'],
                               128,
                               3,
                               pad=1,
                               flip_filters=False)
    net['pool2'] = PoolLayer(net['conv2_2'], 2)
    net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1, flip_filters=False)
    net['conv3_2'] = ConvLayer(net['conv3_1'],
                               256,
                               3,
                               pad=1,
                               flip_filters=False)
    net['conv3_3'] = ConvLayer(net['conv3_2'],
                               256,
                               3,
                               pad=1,
                               flip_filters=False)
    net['pool3'] = PoolLayer(net['conv3_3'], 2)
    net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1, flip_filters=False)
    net['conv4_2'] = ConvLayer(net['conv4_1'],
                               512,
                               3,
                               pad=1,
                               flip_filters=False)
    net['conv4_3'] = ConvLayer(net['conv4_2'],
                               512,
                               3,
                               pad=1,
                               flip_filters=False)
    net['pool4'] = PoolLayer(net['conv4_3'], 2)
    net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1, flip_filters=False)
    net['conv5_2'] = ConvLayer(net['conv5_1'],
                               512,
                               3,
                               pad=1,
                               flip_filters=False)
    net['conv5_3'] = ConvLayer(net['conv5_2'],
                               512,
                               3,
                               pad=1,
                               flip_filters=False)
    net['pool5'] = PoolLayer(net['conv5_3'], 2)
    net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
    net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
    net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
    net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
    net['fc8'] = DenseLayer(net['fc7_dropout'],
                            num_units=1000,
                            nonlinearity=lasagne.nonlinearities.sigmoid)

    return net['fc8']
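
A minimal usage sketch, not part of the original example: it assumes a cuDNN-enabled Theano setup (the network uses Conv2DDNNLayer) and simply compiles the final fc8 output for a batch of 4-channel images; all names here are placeholders.

import numpy as np
import theano
import theano.tensor as T
import lasagne

input_var = T.tensor4('inputs')
output_layer = vgg16(input_var=input_var, image_size=256)
output = lasagne.layers.get_output(output_layer, deterministic=True)
predict_fn = theano.function([input_var], output)

batch = np.zeros((2, 4, 256, 256), dtype=np.float32)
print(predict_fn(batch).shape)  # expected: (2, 1000)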
Example #21
    def build_vgg_model():
        net = {}
        net['input'] = InputLayer((None, 3, 224, 224))
        net['conv1_1'] = ConvLayer(net['input'],
                                   64,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv1_2'] = ConvLayer(net['conv1_1'],
                                   64,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['pool1'] = PoolLayer(net['conv1_2'], 2)
        net['conv2_1'] = ConvLayer(net['pool1'],
                                   128,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv2_2'] = ConvLayer(net['conv2_1'],
                                   128,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['pool2'] = PoolLayer(net['conv2_2'], 2)
        net['conv3_1'] = ConvLayer(net['pool2'],
                                   256,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv3_2'] = ConvLayer(net['conv3_1'],
                                   256,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv3_3'] = ConvLayer(net['conv3_2'],
                                   256,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv3_4'] = ConvLayer(net['conv3_3'],
                                   256,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['pool3'] = PoolLayer(net['conv3_4'], 2)
        net['conv4_1'] = ConvLayer(net['pool3'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv4_2'] = ConvLayer(net['conv4_1'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv4_3'] = ConvLayer(net['conv4_2'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv4_4'] = ConvLayer(net['conv4_3'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['pool4'] = PoolLayer(net['conv4_4'], 2)
        net['conv5_1'] = ConvLayer(net['pool4'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv5_2'] = ConvLayer(net['conv5_1'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv5_3'] = ConvLayer(net['conv5_2'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['conv5_4'] = ConvLayer(net['conv5_3'],
                                   512,
                                   3,
                                   pad=1,
                                   flip_filters=False)
        net['pool5'] = PoolLayer(net['conv5_4'], 2)
        net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
        net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
        net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
        net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
        net['fc8'] = DenseLayer(net['fc7_dropout'],
                                num_units=1000,
                                nonlinearity=None)
        net['prob'] = NonlinearityLayer(net['fc8'], softmax)

        # Freeze the pretrained weights: remove the 'trainable' tag from every
        # layer that has W and b parameters (conv and dense layers; input,
        # pooling, dropout and nonlinearity layers have none)
        for key, val in net.items():
            if getattr(val, 'W', None) is not None and getattr(val, 'b', None) is not None:
                val.params[val.W].remove("trainable")
                val.params[val.b].remove("trainable")

        return net
Example #22
def build_convpool_mix(input_vars,
                       nb_classes,
                       grad_clip=110,
                       imsize=32,
                       n_colors=3,
                       n_timewin=3):
    """
    Builds the complete network with LSTM and 1D-conv layers combined

    :param input_vars: list of EEG images (one image per time window)
    :param nb_classes: number of classes
    :param grad_clip:  the gradient messages are clipped to the given value during
                        the backward pass.
    :param imsize: size of the input image (assumes a square input)
    :param n_colors: number of color channels in the image
    :param n_timewin: number of time windows in the snippet
    :return: a pointer to the output of last layer
    """
    convnets = []
    w_init = None
    # Build n_timewin parallel CNNs with shared weights
    for i in range(n_timewin):
        if i == 0:
            convnet, w_init = build_cnn(input_vars[i],
                                        imsize=imsize,
                                        n_colors=n_colors)
        else:
            convnet, _ = build_cnn(input_vars[i],
                                   w_init=w_init,
                                   imsize=imsize,
                                   n_colors=n_colors)
        convnets.append(FlattenLayer(convnet))
    # at this point convnets shape is [numTimeWin][n_samples, features]
    # reshape to [n_samples, numTimeWin, features] for the LSTM branch; the
    # 1D-conv branch below dimshuffles this to [n_samples, features, numTimeWin]
    convpool = ConcatLayer(convnets)
    convpool = ReshapeLayer(convpool,
                            ([0], n_timewin, get_output_shape(convnets[0])[1]))

    #print('1.convpool:', convpool.shape) #[0], 3, 2048

    reformConvpool = DimshuffleLayer(convpool, (0, 2, 1))
    #print('1.5. convpool reshape:', reformConvpool.output_shape) #None 2048, 3
    # input to 1D convlayer should be in (batch_size, num_input_channels, input_length)
    conv_out = Conv1DLayer(reformConvpool, 64, 3)
    #print('2. conv_out shape:', conv_out.output_shape) #None, 64, 1
    conv_out = FlattenLayer(conv_out)
    #print('2.5. conv_out shape:', conv_out.output_shape) #None, 64
    # Input to LSTM should have the shape as (batch size, SEQ_LENGTH, num_features)
    lstm = LSTMLayer(convpool,
                     num_units=128,
                     grad_clipping=grad_clip,
                     nonlinearity=lasagne.nonlinearities.tanh)
    #print('3 lstm:', lstm.output_shape) #None, 3, 128
    lstm_out = SliceLayer(lstm, -1, 1)
    #print('3.5 lstmout:', lstm_out.output_shape) #None, 128
    # Merge 1D-Conv and LSTM outputs
    dense_input = ConcatLayer([conv_out, lstm_out])  #None, 192
    #print('4 dense:', dense_input.output_shape)
    # A fully-connected layer of 512 units with 50% dropout on its inputs:
    convpool = DenseLayer(lasagne.layers.dropout(dense_input, p=.5),
                          num_units=512,
                          nonlinearity=lasagne.nonlinearities.rectify)
    # And, finally, the nb_classes-unit softmax output layer:
    convpool = DenseLayer(convpool,
                          num_units=nb_classes,
                          nonlinearity=lasagne.nonlinearities.softmax)
    return convpool
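
A minimal training sketch, not part of the original example: it assumes `build_cnn` (referenced above) is available from the same module and shows how the returned output layer could be wired into a cross-entropy training function; the variable names and hyper-parameters are placeholders.

import theano
import theano.tensor as T
import lasagne

n_timewin, nb_classes = 3, 4
input_vars = [T.tensor4('X_t%d' % i) for i in range(n_timewin)]
targets = T.ivector('y')

network = build_convpool_mix(input_vars, nb_classes, imsize=32,
                             n_colors=3, n_timewin=n_timewin)
prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, targets).mean()
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function(input_vars + [targets], loss, updates=updates)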
Example #23
def create_network(available_actions_num):
    # Creates the input variables
    s1 = tensor.tensor4("States")
    a = tensor.vector("Actions", dtype="int32")
    q2 = tensor.vector("Next State best Q-Value")
    r = tensor.vector("Rewards")
    nonterminal = tensor.vector("Nonterminal", dtype="int8")

    # Creates the input layer of the network.
    dqn = InputLayer(shape=[None, 1, downsampled_y, downsampled_x],
                     input_var=s1)

    # Adds 3 convolutional layers, each followed by a max pooling layer.
    dqn = Conv2DLayer(dqn,
                      num_filters=32,
                      filter_size=[8, 8],
                      nonlinearity=rectify,
                      W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn,
                      num_filters=64,
                      filter_size=[4, 4],
                      nonlinearity=rectify,
                      W=GlorotUniform("relu"),
                      b=Constant(.1))

    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn,
                      num_filters=64,
                      filter_size=[3, 3],
                      nonlinearity=rectify,
                      W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    # Adds a single fully connected layer.
    dqn = DenseLayer(dqn,
                     num_units=512,
                     nonlinearity=rectify,
                     W=GlorotUniform("relu"),
                     b=Constant(.1))

    # Adds a single fully connected layer which is the output layer.
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_num, nonlinearity=None)

    # Theano stuff
    q = get_output(dqn)
    # Only q for the chosen actions is updated more or less according to following formula:
    # target Q(s,a,t) = r + gamma * max Q(s2,_,t+1)
    target_q = tensor.set_subtensor(q[tensor.arange(q.shape[0]), a],
                                    r + discount_factor * nonterminal * q2)
    loss = squared_error(q, target_q).mean()

    # Updates the parameters according to the computed gradient using rmsprop.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compiles theano functions
    print "Compiling the network ..."
    function_learn = theano.function([s1, q2, a, r, nonterminal],
                                     loss,
                                     updates=updates,
                                     name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1],
                                               tensor.argmax(q),
                                               name="test_fn")
    print "Network compiled."

    # Returns Theano objects for the net and functions.
    # We wouldn't need the net anymore but it is nice to save your model.
    return dqn, function_learn, function_get_q_values, function_get_best_action
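
A hedged usage sketch, not part of the original example: one Q-learning update step on dummy arrays, assuming the module-level globals referenced above (downsampled_x, downsampled_y, discount_factor, learning_rate) are defined.

import numpy as np

dqn, learn, get_q_values, get_best_action = create_network(available_actions_num=8)

s1 = np.zeros((32, 1, downsampled_y, downsampled_x), dtype=np.float32)  # current states
s2 = np.zeros_like(s1)                                                  # next states
a = np.zeros(32, dtype=np.int32)                                        # actions taken
r = np.zeros(32, dtype=np.float32)                                      # rewards
nonterminal = np.ones(32, dtype=np.int8)                                # 0 where the episode ended

# max_a' Q(s2, a'), fed back in as the bootstrap target
q2 = np.max(get_q_values(s2), axis=1).astype(np.float32)
loss = learn(s1, q2, a, r, nonterminal)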
Example #24
def lasagne_separate2(M,
                      P,
                      FE,
                      W1,
                      W2,
                      z1,
                      z2,
                      hh=.0001,
                      ep=5000,
                      d=0,
                      wsp=.0001,
                      plt=True):
    from paris.signal import bss_eval

    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.T.astype(float32))
    dum = Th.vector('dum')

    # We have weights to discover
    H = theano.shared(random.rand(M.T.shape[0], K[0] + K[1]).astype(float32))
    fI = InputLayer(shape=(M.T.shape[0], K[0] + K[1]), input_var=H)

    # Split in two pathways
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Compute source modulators using previously learned dictionaries
    R1 = DenseLayer(dfW1,
                    num_units=M.shape[0],
                    W=W1.astype(float32),
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)
    R2 = DenseLayer(dfW2,
                    num_units=M.shape[0],
                    W=W2.astype(float32),
                    nonlinearity=lambda x: psoftplus(x, 3.),
                    b=None)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
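    # generalized KL divergence between the mixture magnitude _M and the summed
    # reconstruction, plus a sparsity penalty on the activations H; the
    # zero-weighted term only ties the dummy input `dum` into the graph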
    cost = (_M * (Th.log(_M + eps) - Th.log(get_output(R) + eps)) - _M + get_output(R)).mean() \
        + wsp * Th.mean(H) + 0 * Th.mean(dum)

    # Train it using downhill
    opt = downhill.build('rprop', loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([0]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, dum, array([0]).astype(float32)) + eps
    _r1 = nget(R1, dum, array([0]).astype(float32))
    _r2 = nget(R2, dum, array([0]).astype(float32))
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
Example #25
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller,
                                   num_units=self.memory_shape[1],
                                   W=W_hid_to_sign,
                                   b=b_hid_to_sign,
                                   nonlinearity=nonlinearity_sign,
                                   name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_key,
                              b=b_hid_to_key,
                              nonlinearity=nonlinearity_key,
                              name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b

        self.beta = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_beta,
                               b=b_hid_to_beta,
                               nonlinearity=nonlinearity_beta,
                               name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_gate,
                               b=b_hid_to_gate,
                               nonlinearity=nonlinearity_gate,
                               name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller,
                                num_units=num_shifts,
                                W=W_hid_to_shift,
                                b=b_hid_to_shift,
                                nonlinearity=nonlinearity_shift,
                                name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller,
                                num_units=1,
                                W=W_hid_to_gamma,
                                b=b_hid_to_gamma,
                                nonlinearity=nonlinearity_gamma,
                                name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(weights_init,
                                           (1, self.memory_shape[0]),
                                           name='weights_init',
                                           trainable=learn_init,
                                           regularizable=False)
Example #26
def build_cnn(input_var=None, n=5):

    # create a residual learning building block with two stacked 3x3 convlayers as in paper
    def residual_block(l, increase_dim=False, projection=False):
        input_num_filters = l.output_shape[1]
        if increase_dim:
            first_stride = (2, 2)
            out_num_filters = input_num_filters * 2
        else:
            first_stride = (1, 1)
            out_num_filters = input_num_filters

        #print(l.output_shape)
        l_l = DenseLayer(l,
                         num_units=l.output_shape[3],
                         num_leading_axes=-1,
                         nonlinearity=None)
        #print(l.output_shape[3])
        #print("l_1.output_shape", l_l.output_shape)
        #stride=first_stride
        stack_left_1 = batch_norm(
            ConvLayer(l_l,
                      num_filters=out_num_filters,
                      filter_size=(3, 3),
                      stride=first_stride,
                      nonlinearity=rectify,
                      pad='same',
                      W=lasagne.init.HeNormal(gain='relu'),
                      flip_filters=False))
        stack_left_2 = batch_norm(
            ConvLayer(stack_left_1,
                      num_filters=out_num_filters,
                      filter_size=(3, 3),
                      stride=(1, 1),
                      nonlinearity=None,
                      pad='same',
                      W=lasagne.init.HeNormal(gain='relu'),
                      flip_filters=False))

        #stack_right_1 = batch_norm(ConvLayer(ElemwiseSumLayer([l, NegativeLayer(l_l)]), num_filters=out_num_filters, filter_size=(2,2), stride=first_stride, nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
        #stack_right_2 = batch_norm(ConvLayer(stack_right_1, num_filters=out_num_filters, filter_size=(2,2), stride=(1,1), nonlinearity=None, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
        print("first stack: ", stack_left_2.output_shape)

        # add shortcut connections
        if increase_dim:
            if projection:
                # projection shortcut, as option B in paper
                projection = batch_norm(
                    ConvLayer(l,
                              num_filters=out_num_filters,
                              filter_size=(1, 1),
                              stride=(2, 2),
                              nonlinearity=None,
                              pad='same',
                              b=None,
                              flip_filters=False))
                print("projection shape: ", projection.output_shape)
                ##block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, projection]),nonlinearity=rectify)
                block = NonlinearityLayer(ElemwiseSumLayer(
                    [stack_left_2, projection]),
                                          nonlinearity=rectify)
            else:
                # identity shortcut, as option A in paper
                #print(l.output_shape[2])
                if (l.output_shape[2] % 2 == 0 and l.output_shape[3] % 2 == 0):
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2, s[3] // 2))
                elif (l.output_shape[2] % 2 == 0
                      and l.output_shape[3] % 2 == 1):
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2, s[3] // 2 + 1))
                elif (l.output_shape[2] % 2 == 1
                      and l.output_shape[3] % 2 == 0):
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2 + 1, s[3] // 2))
                else:
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2 + 1, s[3] // 2 + 1))
                padding = PadLayer(identity,
                                   [out_num_filters // 4, 0, 0],
                                   batch_ndim=1)
                print('------------------')
                print(stack_left_2.output_shape)
                #print(stack_right_2.output_shape)
                print(identity.output_shape)
                print(padding.output_shape)
                #block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, padding]),nonlinearity=rectify)
                block = NonlinearityLayer(ElemwiseSumLayer(
                    [stack_left_2, padding]),
                                          nonlinearity=rectify)
        else:
            #block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, l]),nonlinearity=rectify)
            print("l output shape: ", l.output_shape)
            block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, l]),
                                      nonlinearity=rectify)

        return block

    # Building the network
    l_in = InputLayer(shape=(None, 16, 512, 660), input_var=input_var)

    # first conv layer: 16 filters, stride 4 (output is 16 x 128 x 165 for the 512 x 660 input)
    l = batch_norm(
        ConvLayer(l_in,
                  num_filters=16,
                  filter_size=(3, 3),
                  stride=(4, 4),
                  nonlinearity=rectify,
                  pad='same',
                  W=lasagne.init.HeNormal(gain='relu'),
                  flip_filters=False))
    print(l.output_shape)
    # first stack of residual blocks (spatial size unchanged)
    for _ in range(n):
        l = residual_block(l)
        #l = DropoutLayer(l, p = 0.7)
        #print(l.output_shape)
        #print(l.output_shape)
    l = residual_block(l, increase_dim=True)
    #l = DropoutLayer(l, p = 0.5)
    for _ in range(n):
        l = residual_block(l)
        #l = DropoutLayer(l, p = 0.5)
    print(l.output_shape)

    l = batch_norm(
        ConvLayer(l,
                  num_filters=32,
                  filter_size=(3, 3),
                  stride=(2, 2),
                  nonlinearity=rectify,
                  pad='same',
                  W=lasagne.init.HeNormal(gain='relu'),
                  flip_filters=False))
    #l = residual_block(l, increase_dim=True)
    #for _ in range(n):
    #	l = residual_block(l)
    #print(l.output_shape)
    #second stack of residual blocks, output is 32 x 16 x 16
    #l = batch_norm(ConvLayer(l, num_filters = 64, filter_size=(3,3), stride=(2,2), nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
    #l = residual_block(l, increase_dim=True)
    #for _ in range(n):
    #    l = residual_block(l)
    #print(l.output_shape)
    """
	# third stack of residual blocks, output is 64 x 8 x 8
	l = residual_block(l, increase_dim=True)
	for _ in range(1,n):
		l = residual_block(l)
	"""
    # average pooling
    l = GlobalPoolLayer(l)
    print("before dense: ", l.output_shape)
    # fully connected layer
    network = DenseLayer(l,
                         num_units=1,
                         W=lasagne.init.HeNormal(),
                         nonlinearity=sigmoid)

    return network
Example #27
    helper.set_all_param_values(l_arc, params[:2])

    return embedding_fn


worker = OmniglotOS(image_size=32, batch_size=1024)

X_test, y_test = worker.fetch_batch('test')

for glimpses in range(1, 9):
    embedding_fn = create_embedder_fn(glimpses)

    X = T.matrix("embedding")
    y = T.imatrix("target")
    l_in = InputLayer(shape=(None, 512), input_var=X)
    l_y = DenseLayer(l_in, 1, nonlinearity=sigmoid)
    prediction = get_output(l_y)
    loss = T.mean(binary_crossentropy(prediction, y))
    accuracy = T.mean(binary_accuracy(prediction, y))
    params = get_all_params(l_y)
    updates = adam(loss, params, learning_rate=1e-3)
    train_fn = theano.function([X, y], outputs=loss, updates=updates)
    val_fn = theano.function([X, y], outputs=[loss, accuracy])

    for i in range(250):
        X_train, y_train = worker.fetch_batch('train')
        train_fn(embedding_fn(X_train), y_train)

    X_train, y_train = worker.fetch_batch('train')
    train_loss = train_fn(embedding_fn(X_train), y_train)
    val_loss, val_acc = val_fn(embedding_fn(X_test), y_test)
Example #28
    def residual_block(l, increase_dim=False, projection=False):
        input_num_filters = l.output_shape[1]
        if increase_dim:
            first_stride = (2, 2)
            out_num_filters = input_num_filters * 2
        else:
            first_stride = (1, 1)
            out_num_filters = input_num_filters

        #print(l.output_shape)
        l_l = DenseLayer(l,
                         num_units=l.output_shape[3],
                         num_leading_axes=-1,
                         nonlinearity=None)
        #print(l.output_shape[3])
        #print("l_1.output_shape", l_l.output_shape)
        #stride=first_stride
        stack_left_1 = batch_norm(
            ConvLayer(l_l,
                      num_filters=out_num_filters,
                      filter_size=(3, 3),
                      stride=first_stride,
                      nonlinearity=rectify,
                      pad='same',
                      W=lasagne.init.HeNormal(gain='relu'),
                      flip_filters=False))
        stack_left_2 = batch_norm(
            ConvLayer(stack_left_1,
                      num_filters=out_num_filters,
                      filter_size=(3, 3),
                      stride=(1, 1),
                      nonlinearity=None,
                      pad='same',
                      W=lasagne.init.HeNormal(gain='relu'),
                      flip_filters=False))

        #stack_right_1 = batch_norm(ConvLayer(ElemwiseSumLayer([l, NegativeLayer(l_l)]), num_filters=out_num_filters, filter_size=(2,2), stride=first_stride, nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
        #stack_right_2 = batch_norm(ConvLayer(stack_right_1, num_filters=out_num_filters, filter_size=(2,2), stride=(1,1), nonlinearity=None, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False))
        print("first stack: ", stack_left_2.output_shape)

        # add shortcut connections
        if increase_dim:
            if projection:
                # projection shortcut, as option B in paper
                projection = batch_norm(
                    ConvLayer(l,
                              num_filters=out_num_filters,
                              filter_size=(1, 1),
                              stride=(2, 2),
                              nonlinearity=None,
                              pad='same',
                              b=None,
                              flip_filters=False))
                print("projection shape: ", projection.output_shape)
                ##block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, projection]),nonlinearity=rectify)
                block = NonlinearityLayer(ElemwiseSumLayer(
                    [stack_left_2, projection]),
                                          nonlinearity=rectify)
            else:
                # identity shortcut, as option A in paper
                #print(l.output_shape[2])
                if (l.output_shape[2] % 2 == 0 and l.output_shape[3] % 2 == 0):
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2, s[3] // 2))
                elif (l.output_shape[2] % 2 == 0
                      and l.output_shape[3] % 2 == 1):
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2, s[3] // 2 + 1))
                elif (l.output_shape[2] % 2 == 1
                      and l.output_shape[3] % 2 == 0):
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2 + 1, s[3] // 2))
                else:
                    identity = ExpressionLayer(
                        l, lambda X: X[:, :, ::2, ::2], lambda s:
                        (s[0], s[1], s[2] // 2 + 1, s[3] // 2 + 1))
                padding = PadLayer(identity,
                                   [out_num_filters // 4, 0, 0],
                                   batch_ndim=1)
                print('------------------')
                print(stack_left_2.output_shape)
                #print(stack_right_2.output_shape)
                print(identity.output_shape)
                print(padding.output_shape)
                #block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, padding]),nonlinearity=rectify)
                block = NonlinearityLayer(ElemwiseSumLayer(
                    [stack_left_2, padding]),
                                          nonlinearity=rectify)
        else:
            #block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, l]),nonlinearity=rectify)
            print("l output shape: ", l.output_shape)
            block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, l]),
                                      nonlinearity=rectify)

        return block
Example #29
def regr(X, y):
    l = InputLayer(shape=(None, X.shape[1]))
    l = DenseLayer(l, num_units=y.shape[1], nonlinearity=None)
    net = NeuralNet(l, regression=True, update_learning_rate=0.01)
    net.fit(X, y)
    print(net.score(X, y))
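
A hedged usage sketch, not part of the original: calling `regr` on random data, assuming nolearn's NeuralNet as imported in the original source.

import numpy as np

X = np.random.rand(200, 5).astype(np.float32)
y = np.random.rand(200, 2).astype(np.float32)
regr(X, y)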
Example #30
def build_network():
    net = {}

    net['input'] = InputLayer((None, 3, 299, 299))
    net['conv'] = bn_conv(net['input'],
                          num_filters=32, filter_size=3, stride=2)
    net['conv_1'] = bn_conv(net['conv'], num_filters=32, filter_size=3)
    net['conv_2'] = bn_conv(net['conv_1'],
                            num_filters=64, filter_size=3, pad=1)
    net['pool'] = Pool2DLayer(net['conv_2'], pool_size=3, stride=2, mode='max')

    net['conv_3'] = bn_conv(net['pool'], num_filters=80, filter_size=1)

    net['conv_4'] = bn_conv(net['conv_3'], num_filters=192, filter_size=3)

    net['pool_1'] = Pool2DLayer(net['conv_4'],
                                pool_size=3, stride=2, mode='max')
    net['mixed/join'] = inceptionA(
        net['pool_1'], nfilt=((64,), (48, 64), (64, 96, 96), (32,)))
    net['mixed_1/join'] = inceptionA(
        net['mixed/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,)))

    net['mixed_2/join'] = inceptionA(
        net['mixed_1/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,)))

    net['mixed_3/join'] = inceptionB(
        net['mixed_2/join'], nfilt=((384,), (64, 96, 96)))

    net['mixed_4/join'] = inceptionC(
        net['mixed_3/join'],
        nfilt=((192,), (128, 128, 192), (128, 128, 128, 128, 192), (192,)))

    net['mixed_5/join'] = inceptionC(
        net['mixed_4/join'],
        nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,)))

    net['mixed_6/join'] = inceptionC(
        net['mixed_5/join'],
        nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,)))

    net['mixed_7/join'] = inceptionC(
        net['mixed_6/join'],
        nfilt=((192,), (192, 192, 192), (192, 192, 192, 192, 192), (192,)))

    net['mixed_8/join'] = inceptionD(
        net['mixed_7/join'],
        nfilt=((192, 320), (192, 192, 192, 192)))

    net['mixed_9/join'] = inceptionE(
        net['mixed_8/join'],
        nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)),
        pool_mode='average_exc_pad')

    net['mixed_10/join'] = inceptionE(
        net['mixed_9/join'],
        nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)),
        pool_mode='max')

    net['pool3'] = GlobalPoolLayer(net['mixed_10/join'])

    net['softmax'] = DenseLayer(
        net['pool3'], num_units=1008, nonlinearity=softmax)

    return net
Example #31
class ShallowNeuralForest:
    def __init__(self, n_inputs, n_outputs, regression, multiclass=False,
                 depth=5, n_estimators=20, n_hidden=128, learning_rate=0.01,
                 num_epochs=500, pi_iters=20, sgd_iters=10, batch_size=1000,
                 momentum=0.0, dropout=0.0, loss=None, update=adagrad):
        """
        Parameters
        ----------
        n_inputs : number of input features
        n_outputs : number of classes to predict (1 for regression)
            for 2 class classification n_outputs should be 2, not 1
        regression : True for regression, False for classification
        multiclass : not used
        depth : depth of each tree in the ensemble
        n_estimators : number of trees in the ensemble
        n_hidden : number of neurons in the hidden layer
        pi_iters : number of iterations for the iterative algorithm that updates pi
        sgd_iters : number of full iterations of sgd between two consecutive updates of pi
        loss : theano loss function. If None, squared error will be used for regression and
            cross entropy will be used for classification
        update : theano update function
        """
        self._depth = depth
        self._n_estimators = n_estimators
        self._n_hidden = n_hidden
        self._n_outputs = n_outputs
        self._loss = loss
        self._regression = regression
        self._multiclass = multiclass
        self._learning_rate = learning_rate
        self._num_epochs = num_epochs
        self._pi_iters = pi_iters
        self._sgd_iters = sgd_iters
        self._batch_size = batch_size
        self._momentum = momentum
        self._update = update

        self.t_input = T.matrix('input')
        self.t_label = T.matrix('output')

        self._cached_trainable_params = None
        self._cached_params = None

        self._n_net_out = n_estimators * ((1 << depth) - 1)
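        # each tree of depth `depth` has (2 ** depth - 1) internal decision
        # nodes, so the underlying network emits one routing activation per
        # node and per tree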

        self.l_input = InputLayer((None, n_inputs))
        self.l_dense1 = DenseLayer(self.l_input, self._n_hidden, nonlinearity=rectify)
        if dropout != 0:
            self.l_dense1 = DropoutLayer(self.l_dense1, p=dropout)
        if not __DEBUG_NO_FOREST__:
            self.l_dense2 = DenseLayer(self.l_dense1, self._n_net_out, nonlinearity=sigmoid)
            self.l_forest = NeuralForestLayer(self.l_dense2, self._depth, self._n_estimators, self._n_outputs, self._pi_iters)
        else:
            self.l_forest = DenseLayer(self.l_dense1, self._n_outputs, nonlinearity=softmax)

    def _create_functions(self):
        self._update_func = self._update(self._get_loss_function(), self._get_all_trainable_params(), self._learning_rate)
        if self._momentum != 0:
            self._update_func = apply_nesterov_momentum(self._update_func, self._get_all_trainable_params(), self._momentum)
        self._loss_func = self._get_loss_function()
        self._train_function = theano.function([self.t_input, self.t_label], self._get_loss_function(), updates=self._update_func)

    def fit(self, X, y, X_val = None, y_val = None, on_epoch = None, verbose = False):
        """ Train the model

        Parameters
        ----------
        X : input vector for the training set
        y : output vector for the training set. Onehot is required for classification
        X_val : if not None, input vector for the validation set
        y_val : if not None, output vector for the validation set
        on_epoch : a callback that is called after each epoch
            if X_val is None, the signature is (epoch, training_error)
            if X_val is not None, the signature is (epoch, training_error, validation_error, accuracy)
            on iterations that update pi the training error is reported for the previous iteration
        verbose : if True, prints the current step on each epoch
        """
        self._create_functions()

        X = X.astype(np.float32)
        y = y.astype(np.float32)
        self._x_mean = np.mean(X, axis=0)
        self._x_std = np.std(X, axis=0)
        self._x_std[self._x_std == 0] = 1
        X = (X - self._x_mean) / self._x_std
        if y_val is not None:
            assert X_val is not None
            X_val = X_val.astype(np.float32)
            y_val = y_val.astype(np.float32)
            X_val = (X_val - self._x_mean) / self._x_std

        if X_val is not None:
            assert y_val is not None

            predictions = self._predict_internal(self._get_output())
            accuracy = T.mean(T.eq(predictions, self._predict_internal(self.t_label)))

            test_function = theano.function([self.t_input, self.t_label], [self._get_loss_function(), accuracy])

        iterator = BatchIterator(self._batch_size)

        loss = 0
        for epoch in range(self._num_epochs):

            # update the values of pi
            if not __DEBUG_NO_FOREST__ and epoch % self._sgd_iters == 0:
                if verbose: print "updating pi"
                self.l_forest.update_pi(X, y)
                if verbose: print "recreating update funcs"
                self._create_functions()

            else:
                if verbose: print "updating theta"
                loss = 0
                deno = 0
                # update the network parameters
                for Xb, yb in iterator(X, y):
                    loss += self._train_function(Xb, yb)
                    deno += 1

                loss /= deno

            if X_val is not None:
                tloss = 0
                accur = 0
                deno = 0
                iterator = BatchIterator(self._batch_size)
                for Xb, yb in iterator(X_val, y_val):
                    tl, ac = test_function(Xb, yb)
                    tloss += tl
                    accur += ac
                    deno += 1
                tloss /= deno
                accur /= deno

            if on_epoch is not None:
                if X_val is None:
                    on_epoch(epoch, loss)
                else:
                    on_epoch(epoch, loss, tloss, accur)

        return self

    def _predict_internal(self, y):
        if not self._regression and not self._multiclass:
            return y.argmax(axis=1)
        else:
            return y >= 0.5

    def predict(self, X):
        ret = self.predict_proba(X)
        return self._predict_internal(ret)

    def predict_proba(self, X):
        X = X.astype(np.float32)
        X = (X - self._x_mean) / self._x_std
        predict_function = theano.function([self.t_input], self._get_output())
        return predict_function(X)

    def _get_loss_function(self):
        if self._loss is None:
            if self._regression:
                self._loss = squared_error
            else:
                self._loss = categorical_crossentropy
        return aggregate(self._loss(self._get_output(), self.t_label), mode='mean')

    def _get_output(self):
        return get_output(self.l_forest, self.t_input)

    def _get_all_trainable_params(self):
        if self._cached_trainable_params is None:
            self._cached_trainable_params = get_all_params(self.l_forest, trainable=True)
        return self._cached_trainable_params
    
    def _get_all_params(self):
        if self._cached_params is None:
            self._cached_params = get_all_params(self.l_forest)
        return self._cached_params
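
A hedged usage sketch, not part of the original class: it assumes the module-level helpers referenced above (NeuralForestLayer, BatchIterator and the __DEBUG_NO_FOREST__ flag) are defined, and trains the forest on random one-hot classification data.

import numpy as np

X = np.random.rand(2000, 10).astype(np.float32)
labels = np.random.randint(0, 3, size=2000)
y = np.eye(3, dtype=np.float32)[labels]          # one-hot targets, as required by fit()

forest = ShallowNeuralForest(n_inputs=10, n_outputs=3, regression=False,
                             depth=4, n_estimators=10, num_epochs=20)
forest.fit(X, y, on_epoch=lambda epoch, loss: None)
predicted_classes = forest.predict(X)            # argmax over the forest output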
Example #32
def build_model():
    net = {}
    net['input'] = InputLayer((None, 3, 224, 224))
    sub_net, parent_layer_name = build_simple_block(
        net['input'], ['conv1', 'bn_conv1', 'conv1_relu'],
        64,
        7,
        3,
        2,
        use_bias=True)
    net.update(sub_net)
    net['pool1'] = PoolLayer(net[parent_layer_name],
                             pool_size=3,
                             stride=2,
                             pad=0,
                             mode='max',
                             ignore_border=False)
    block_size = list('abc')
    parent_layer_name = 'pool1'
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1, 1, True, 4, ix='2%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='2%s' % c)
        net.update(sub_net)

    block_size = list('abcd')
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name],
                1.0 / 2,
                1.0 / 2,
                True,
                4,
                ix='3%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='3%s' % c)
        net.update(sub_net)

    block_size = list('abcdef')
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name],
                1.0 / 2,
                1.0 / 2,
                True,
                4,
                ix='4%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='4%s' % c)
        net.update(sub_net)

    block_size = list('abc')
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name],
                1.0 / 2,
                1.0 / 2,
                True,
                4,
                ix='5%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='5%s' % c)
        net.update(sub_net)
    net['pool5'] = PoolLayer(net[parent_layer_name],
                             pool_size=7,
                             stride=1,
                             pad=0,
                             mode='average_exc_pad',
                             ignore_border=False)
    net['fc1000'] = DenseLayer(net['pool5'], num_units=1000, nonlinearity=None)
    net['prob'] = NonlinearityLayer(net['fc1000'], nonlinearity=softmax)

    return net
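
A hedged usage sketch, not part of the original example: build_simple_block and build_residual_block are assumed to be defined elsewhere in the source; the sketch only compiles a deterministic forward pass through the assembled graph.

import numpy as np
import theano
import theano.tensor as T
import lasagne

net = build_model()
X = T.tensor4('X')
probs = lasagne.layers.get_output(net['prob'], X, deterministic=True)
predict_fn = theano.function([X], probs)

batch = np.zeros((1, 3, 224, 224), dtype=np.float32)
print(predict_fn(batch).shape)  # expected: (1, 1000)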
Example #33
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} \propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If
        ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign,
                name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key,
            name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b
        
        self.beta = DenseLayer(controller, num_units=1,
            W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta,
            name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller, num_units=1,
            W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate,
            name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller, num_units=num_shifts,
            W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift,
            name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller, num_units=1,
            W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma,
            name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(
            weights_init, (1, self.memory_shape[0]),
            name='weights_init', trainable=learn_init, regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(w_g_padded, conv_filter,
            input_shape=(self.input_shape[0], 1, 1, self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        w /= T.sum(w)

        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)

        return params
Beispiel #34
0
def build_convpool_lstm(input_vars,
                        nb_classes,
                        grad_clip=110,
                        imsize=32,
                        n_colors=3,
                        n_timewin=3):
    """
    Builds the complete network with an LSTM layer to integrate time from sequences of EEG images.

    :param input_vars: list of EEG images (one image per time window)
    :param nb_classes: number of classes
    :param grad_clip:  the gradient messages are clipped to the given value during
                        the backward pass.
    :param imsize: size of the input image (assumes a square input)
    :param n_colors: number of color channels in the image
    :param n_timewin: number of time windows in the snippet
    :return: a pointer to the output of the last layer
    """
    convnets = []
    w_init = None
    # Build n_timewin parallel CNNs with shared weights
    for i in range(n_timewin):
        if i == 0:
            convnet, w_init = build_cnn(input_vars[i],
                                        imsize=imsize,
                                        n_colors=n_colors)
        else:
            convnet, _ = build_cnn(input_vars[i],
                                   w_init=w_init,
                                   imsize=imsize,
                                   n_colors=n_colors)
        convnets.append(FlattenLayer(convnet))

    # convnet output shape: (None, 128, 4, 4); flattened: (None, 2048) = 128*4*4
    # at this point convnets is a list of [n_timewin] layers of shape (n_samples, features);
    # for the LSTM we reshape their concatenation to (n_samples, n_timewin, features)
    convpool = ConcatLayer(convnets)
    # after ConcatLayer: (None, 6144)
    convpool = ReshapeLayer(convpool,
                            ([0], n_timewin, get_output_shape(convnets[0])[1]))
    # Input to the LSTM should have shape (batch_size, seq_length, num_features)
    # after ReshapeLayer: (None, 3, 2048)
    convpool = LSTMLayer(convpool,
                         num_units=128,
                         grad_clipping=grad_clip,
                         nonlinearity=lasagne.nonlinearities.tanh)
    # We only need the final LSTM output, so we isolate it and feed it
    # to the next layer.
    # after LSTMLayer: (None, 3, 128)
    convpool = SliceLayer(convpool, -1, 1)  # Selecting the last prediction
    # A fully-connected layer of 256 units with 50% dropout on its inputs:
    # after SliceLayer: (None, 128)
    convpool = DenseLayer(lasagne.layers.dropout(convpool, p=.5),
                          num_units=256,
                          nonlinearity=lasagne.nonlinearities.rectify)
    # And, finally, the output layer with 50% dropout on its inputs:
    convpool = DenseLayer(lasagne.layers.dropout(convpool, p=.5),
                          num_units=nb_classes,
                          nonlinearity=lasagne.nonlinearities.softmax)
    return convpool
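
# Hedged usage sketch for build_convpool_lstm above. The LSTM variant expects a
# 5-D symbolic input of shape (n_timewin, n_samples, n_colors, imsize, imsize),
# which the function indexes per time window. nb_classes=4 is an illustrative
# assumption, and build_cnn is assumed to be defined elsewhere in this collection.
import theano.tensor as T
import lasagne


def _convpool_lstm_usage_sketch(nb_classes=4):
    # symbolic 5-D floatX tensor: one stack of EEG images per time window
    input_var = T.TensorType('floatX', ((False,) * 5))()
    network = build_convpool_lstm(input_var, nb_classes,
                                  grad_clip=110, imsize=32,
                                  n_colors=3, n_timewin=3)
    # deterministic forward pass (dropout disabled)
    prediction = lasagne.layers.get_output(network, deterministic=True)
    return input_var, prediction
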
def build_network():
    """

    Returns
    -------

    """

    input_var = t.tensor4('inputs')
    target = t.matrix('targets')
    lr = t.scalar('lr', dtype=theano.config.floatX)
    poolmode = 'average_exc_pad'
    new_pool_size2 = 3

    net = {'input': InputLayer((None, 3, 299, 299), input_var=input_var)}

    net['conv'] = bn_conv(net['input'],
                          num_filters=32, filter_size=3, stride=2)
    net['conv_1'] = bn_conv(net['conv'], num_filters=32, filter_size=3)
    net['conv_2'] = bn_conv(net['conv_1'],
                            num_filters=64, filter_size=3, pad=1)
    net['pool'] = Pool2DLayer(net['conv_2'], pool_size=new_pool_size2, stride=2,
                              mode=poolmode)

    net['conv_3'] = bn_conv(net['pool'], num_filters=80, filter_size=1)

    net['conv_4'] = bn_conv(net['conv_3'], num_filters=192, filter_size=3)

    net['pool_1'] = Pool2DLayer(net['conv_4'],
                                pool_size=new_pool_size2, stride=2, mode=poolmode)
    net['mixed/join'] = inception_a(
        net['pool_1'], nfilt=((64,), (48, 64), (64, 96, 96), (32,)))
    net['mixed_1/join'] = inception_a(
        net['mixed/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,)))

    net['mixed_2/join'] = inception_a(
        net['mixed_1/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,)))

    net['mixed_3/join'] = inception_b(
        net['mixed_2/join'], nfilt=((384,), (64, 96, 96)))

    net['mixed_4/join'] = inception_c(
        net['mixed_3/join'],
        nfilt=((192,), (128, 128, 192), (128, 128, 128, 128, 192), (192,)))

    net['mixed_5/join'] = inception_c(
        net['mixed_4/join'],
        nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,)))

    net['mixed_6/join'] = inception_c(
        net['mixed_5/join'],
        nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,)))

    net['mixed_7/join'] = inception_c(
        net['mixed_6/join'],
        nfilt=((192,), (192, 192, 192), (192, 192, 192, 192, 192), (192,)))

    net['mixed_8/join'] = inception_d(
        net['mixed_7/join'],
        nfilt=((192, 320), (192, 192, 192, 192)))

    net['mixed_9/join'] = inception_e(
        net['mixed_8/join'],
        nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)),
        pool_mode='average_exc_pad')

    net['mixed_10/join'] = inception_e(
        net['mixed_9/join'],
        nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)),
        pool_mode=poolmode)

    net['pool3'] = GlobalPoolLayer(net['mixed_10/join'])

    net['softmax'] = DenseLayer(
        net['pool3'], num_units=1008, nonlinearity=softmax)

    train_output = lasagne.layers.get_output(net['softmax'],
                                             deterministic=False)
    train_loss = lasagne.objectives.categorical_crossentropy(train_output,
                                                             target)
    train_loss = lasagne.objectives.aggregate(train_loss)
    train_err = t.mean(t.neq(t.argmax(train_output, axis=1),
                             t.argmax(target, axis=1)),
                       dtype=theano.config.floatX)
    params = lasagne.layers.get_all_params(net['softmax'], trainable=True)
    updates = lasagne.updates.sgd(loss_or_grads=train_loss, params=params,
                                  learning_rate=lr)

    test_output = lasagne.layers.get_output(net['softmax'], deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_output, target)
    test_loss = lasagne.objectives.aggregate(test_loss)
    test_err = t.mean(t.neq(t.argmax(test_output, axis=1),
                            t.argmax(target, axis=1)),
                      dtype=theano.config.floatX)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target, lr], [train_loss, train_err],
                               updates=updates)

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target], [test_loss, test_err])

    return {'model': net['softmax'], 'train_fn': train_fn, 'val_fn': val_fn}
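
# Hedged smoke-test sketch for build_network() above: push one random mini-batch
# through the compiled training and validation functions. The batch size and
# learning rate are illustrative assumptions; the targets are one-hot over the
# 1008 output units of the softmax layer.
import numpy as np
import theano


def _build_network_smoke_test(batch_size=8, lr=0.01):
    model = build_network()
    x = np.random.rand(batch_size, 3, 299, 299).astype(theano.config.floatX)
    y = np.zeros((batch_size, 1008), dtype=theano.config.floatX)
    y[np.arange(batch_size), np.random.randint(0, 1008, batch_size)] = 1.
    lr_value = np.asarray(lr, dtype=theano.config.floatX)
    train_loss, train_err = model['train_fn'](x, y, lr_value)
    val_loss, val_err = model['val_fn'](x, y)
    return train_loss, val_loss
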
Beispiel #36
0
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} \propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If
        ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.
                 rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller,
                                   num_units=self.memory_shape[1],
                                   W=W_hid_to_sign,
                                   b=b_hid_to_sign,
                                   nonlinearity=nonlinearity_sign,
                                   name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_key,
                              b=b_hid_to_key,
                              nonlinearity=nonlinearity_key,
                              name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b

        self.beta = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_beta,
                               b=b_hid_to_beta,
                               nonlinearity=nonlinearity_beta,
                               name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_gate,
                               b=b_hid_to_gate,
                               nonlinearity=nonlinearity_gate,
                               name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller,
                                num_units=num_shifts,
                                W=W_hid_to_shift,
                                b=b_hid_to_shift,
                                nonlinearity=nonlinearity_shift,
                                name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller,
                                num_units=1,
                                W=W_hid_to_gamma,
                                b=b_hid_to_gamma,
                                nonlinearity=nonlinearity_gamma,
                                name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(weights_init,
                                           (1, self.memory_shape[0]),
                                           name='weights_init',
                                           trainable=learn_init,
                                           regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(
            w_g_padded,
            conv_filter,
            input_shape=(self.input_shape[0], 1, 1,
                         self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        # normalise each sample's weighting over the memory locations
        w /= T.sum(w, axis=1, keepdims=True)

        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)

        return params
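
# A plain-NumPy sketch (single example, no batching) of the addressing pipeline
# that Head.get_output_for implements above: content addressing, interpolation,
# convolutional shift and sharpening. One deliberate simplification: the paper's
# shift is circular (np.roll here), whereas the Theano code above zero-pads the
# weighting and applies a 'valid' convolution.
import numpy as np


def ntm_addressing_sketch(k, beta, g, s, gamma, w_prev, M, eps=1e-6):
    # Content addressing: cosine similarity of the key against every memory row
    sim = M.dot(k) / (np.linalg.norm(M, axis=1) * np.linalg.norm(k) + eps)
    w_c = np.exp(beta * sim)
    w_c /= w_c.sum()
    # Interpolation with the previous weighting
    w_g = g * w_c + (1. - g) * w_prev
    # Convolutional shift; s is centred on the zero shift
    offsets = np.arange(len(s)) - len(s) // 2
    w_tilde = np.zeros_like(w_g)
    for offset, weight in zip(offsets, s):
        w_tilde += weight * np.roll(w_g, offset)
    # Sharpening and renormalisation
    w = (w_tilde + eps) ** gamma
    return w / w.sum()
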
Beispiel #37
0
def train(images,
          labels,
          fold,
          model_type,
          mark=0,
          batch_size=32,
          num_epochs=5):
    """
    A sample training function which loops over the training set and evaluates the network
    on the validation set after each epoch. Evaluates the network on the training set
    whenever the
    :param images: input images
    :param labels: target labels
    :param fold: tuple of (train, test) index numbers
    :param model_type: model type ('cnn', '1dconv', 'maxpool', 'lstm', 'mix')
    :param batch_size: batch size for training
    :param num_epochs: number of epochs of dataset to go over for training
    :return: none
    """
    # example call: train(images, np.squeeze(feats[:, -1]) - 1, fold_pairs[2], 'cnn')
    num_classes = len(np.unique(labels))
    (X_train,
     y_train), (X_val, y_val), (X_test,
                                y_test) = reformatInput(images, labels, fold)
    X_train = X_train.astype("float32", casting='unsafe')
    X_val = X_val.astype("float32", casting='unsafe')
    X_test = X_test.astype("float32", casting='unsafe')
    # Prepare Theano variables for inputs and targets
    input_var = T.TensorType('floatX', ((False, ) * 5))()
    target_var = T.ivector('targets')

    print(X_train.shape)

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    # Building the appropriate model
    if model_type == '1dconv':
        network = build_convpool_conv1d(input_var, num_classes)
    elif model_type == 'maxpool':
        network = build_convpool_max(input_var, num_classes)
    elif model_type == 'lstm':
        network = build_convpool_lstm(input_var, num_classes, 100)
    elif model_type == 'mix':
        network = build_convpool_mix(input_var, num_classes, 100)
    elif model_type == 'cnn':
        input_var = T.tensor4('inputs')
        network, _ = build_cnn(input_var)
        network = DenseLayer(lasagne.layers.dropout(network, p=.5),
                             num_units=256,
                             nonlinearity=lasagne.nonlinearities.rectify)
        network = DenseLayer(lasagne.layers.dropout(network, p=.5),
                             num_units=num_classes,
                             nonlinearity=lasagne.nonlinearities.softmax)
    else:
        raise ValueError(
            "Model not supported ['1dconv', 'maxpool', 'lstm', 'mix', 'cnn']")
    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    # print("network test")
    # print(network.shape)

    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=0.001)
    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var)
    test_loss = test_loss.mean()
    # as a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    # compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    # compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])
    # Finally, launch the training loop.
    print("Starting training...")
    best_validation_accu = 0
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train,
                                         y_train,
                                         batch_size,
                                         shuffle=False):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1
        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val,
                                         y_val,
                                         batch_size,
                                         shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1
        av_train_err = train_err / train_batches
        av_val_err = val_err / val_batches
        av_val_acc = val_acc / val_batches
        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(av_train_err))
        print("  validation loss:\t\t{:.6f}".format(av_val_err))
        print("  validation accuracy:\t\t{:.2f} %".format(av_val_acc * 100))
        if av_val_acc > best_validation_accu:
            best_validation_accu = av_val_acc
            # After training, we compute and print the test error:
            test_err = 0
            test_acc = 0
            test_batches = 0
            for batch in iterate_minibatches(X_test,
                                             y_test,
                                             batch_size,
                                             shuffle=False):
                inputs, targets = batch
                err, acc = val_fn(inputs, targets)
                test_err += err
                test_acc += acc
                test_batches += 1
            av_test_err = test_err / test_batches
            av_test_acc = test_acc / test_batches
            print("Final results:")
            print("  test loss:\t\t\t{:.6f}".format(av_test_err))
            print("  test accuracy:\t\t{:.2f} %".format(av_test_acc * 100))
            # Dump the network weights to a file like this:
            np.savez('weights_lasg_{}_{}'.format(model_type, mark),
                     *lasagne.layers.get_all_param_values(network))
    print('-' * 50)
    print("Best validation accuracy:\t\t{:.2f} %".format(best_validation_accu *
                                                         100))
    print("Best test accuracy:\t\t{:.2f} %".format(av_test_acc * 100))
Beispiel #38
0
class WriteHead(Head):
    r"""
    Write head. In addition to the weight vector, the write head
    also outputs an add vector :math:`a_{t}` and an erase vector
    :math:`e_{t}` defined by

    .. math ::
        \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\
        a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a})\\
        e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e})

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`
    W_hid_to_erase: callable, Numpy array or Theano shared variable
    b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_erase: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`e_{t}`
    W_hid_to_add: callable, Numpy array or Theano shared variable
    b_hid_to_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`a_{t}`
    W_hid_to_sign_add: callable, Numpy array, Theano shared variable, or ``None``
    b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\delta_{t}`
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.
                 rectify(x),
                 W_hid_to_erase=lasagne.init.GlorotUniform(),
                 b_hid_to_erase=lasagne.init.Constant(0.),
                 nonlinearity_erase=nonlinearities.hard_sigmoid,
                 W_hid_to_add=lasagne.init.GlorotUniform(),
                 b_hid_to_add=lasagne.init.Constant(0.),
                 nonlinearity_add=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_sign_add=None,
                 b_hid_to_sign_add=lasagne.init.Constant(0.),
                 nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1.,
                                                                    high=1.),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(WriteHead, self).__init__(controller,
                                        num_shifts=num_shifts,
                                        memory_shape=memory_shape,
                                        W_hid_to_sign=W_hid_to_sign,
                                        b_hid_to_sign=b_hid_to_sign,
                                        nonlinearity_sign=nonlinearity_sign,
                                        W_hid_to_key=W_hid_to_key,
                                        b_hid_to_key=b_hid_to_key,
                                        nonlinearity_key=nonlinearity_key,
                                        W_hid_to_beta=W_hid_to_beta,
                                        b_hid_to_beta=b_hid_to_beta,
                                        nonlinearity_beta=nonlinearity_beta,
                                        W_hid_to_gate=W_hid_to_gate,
                                        b_hid_to_gate=b_hid_to_gate,
                                        nonlinearity_gate=nonlinearity_gate,
                                        W_hid_to_shift=W_hid_to_shift,
                                        b_hid_to_shift=b_hid_to_shift,
                                        nonlinearity_shift=nonlinearity_shift,
                                        W_hid_to_gamma=W_hid_to_gamma,
                                        b_hid_to_gamma=b_hid_to_gamma,
                                        nonlinearity_gamma=nonlinearity_gamma,
                                        weights_init=weights_init,
                                        learn_init=learn_init,
                                        **kwargs)

        self.erase = DenseLayer(controller,
                                num_units=self.memory_shape[1],
                                W=W_hid_to_erase,
                                b=b_hid_to_erase,
                                nonlinearity=nonlinearity_erase,
                                name=self.basename + '.erase')
        self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b

        self.add = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_add,
                              b=b_hid_to_add,
                              nonlinearity=nonlinearity_add,
                              name=self.basename + '.add')
        self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b

        if W_hid_to_sign_add is not None:
            self.sign_add = DenseLayer(controller,
                                       num_units=self.memory_shape[1],
                                       W=W_hid_to_sign_add,
                                       b=b_hid_to_sign_add,
                                       nonlinearity=nonlinearity_sign_add,
                                       name=self.basename + '.sign_add')
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b
        else:
            self.sign_add = None
            self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None

    def get_params(self, **tags):
        params = super(WriteHead, self).get_params(**tags)
        params += self.erase.get_params(**tags)
        params += self.add.get_params(**tags)
        if self.sign_add is not None:
            params += self.sign_add.get_params(**tags)

        return params
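
# WriteHead only emits the weighting together with the erase and add vectors;
# the memory update itself happens elsewhere in the NTM layer. A NumPy sketch of
# the paper's erase-then-add rule for a single head:
#   M_t(i) = M_{t-1}(i) * (1 - w_t(i) * e_t) + w_t(i) * a_t
import numpy as np


def ntm_write_sketch(M, w, e, a):
    # M: (memory_rows, memory_width), w: (memory_rows,), e and a: (memory_width,)
    M_erased = M * (1. - np.outer(w, e))
    return M_erased + np.outer(w, a)
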
Beispiel #39
0
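
# The module-level fragment below relies on constants and imports defined earlier
# in its source file. The values here are illustrative assumptions (not taken
# from the original) so that the snippet reads as self-contained.
import lasagne
import lasagne.layers as L
import theano.tensor as T
from lasagne.layers import (InputLayer, GaussianNoiseLayer, LSTMLayer,
                            ReshapeLayer, DenseLayer, NonlinearityLayer)

nClasses = 39          # e.g. size of the label inventory (assumption)
num_batch = 32         # mini-batch size (assumption)
input_seq_len = 100    # time steps per sequence (assumption)
num_feat = 40          # features per time step (assumption)
L1_UNITS = 100         # first LSTM layer size (assumption)
L2_UNITS = 100         # second LSTM layer size (assumption)
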
num_classes = nClasses + 1

soft = lasagne.nonlinearities.softmax
tanh = lasagne.nonlinearities.tanh
identity = lasagne.nonlinearities.identity

l_in = InputLayer(shape=(num_batch, input_seq_len, num_feat))
batchsize, seqlen, _ = l_in.input_var.shape

l_noise = GaussianNoiseLayer(l_in, sigma=0.6)
# l_mask  = InputLayer(shape=(batchsize, seqlen))
# l_rnn_1 = LSTMLayer(l_noise, num_units=L1_UNITS, mask_input=l_mask)
l_rnn_1 = LSTMLayer(l_noise, num_units=L1_UNITS)
l_rnn_2 = LSTMLayer(l_rnn_1, num_units=L2_UNITS)
l_shp = ReshapeLayer(l_rnn_2, (-1, L2_UNITS))
l_out = DenseLayer(l_shp, num_units=num_classes, nonlinearity=identity)
l_out_shp = ReshapeLayer(l_out, (batchsize, seqlen, num_classes))

l_out_softmax = NonlinearityLayer(l_out, nonlinearity=soft)
l_out_softmax_shp = ReshapeLayer(l_out_softmax, (batchsize, seqlen, num_classes))

output_lin_ctc = L.get_output(l_out_shp)
network_output = L.get_output(l_out_softmax_shp)
all_params = L.get_all_params(l_rnn_2, trainable=True)

# ## Costs, Gradients & Training Functions

# Cost functions
target_values = T.imatrix('target_output')
input_values = T.imatrix()
def build_network_resnet50(input, nbClasses):
    net = {}
    net['input'] = InputLayer(shape=(None, 1, 120, 120), input_var=input)
    sub_net, parent_layer_name = build_simple_block(
        net['input'], ['conv1', 'bn_conv1', 'conv1_relu'],
        64,
        7,
        3,
        2,
        use_bias=True)
    net.update(sub_net)
    net['pool1'] = PoolLayer(net[parent_layer_name],
                             pool_size=3,
                             stride=2,
                             pad=0,
                             mode='max',
                             ignore_border=False)
    block_size = list('abc')
    parent_layer_name = 'pool1'
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1, 1, True, 4, ix='2%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='2%s' % c)
        net.update(sub_net)

    block_size = list('abcd')
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name],
                1.0 / 2,
                1.0 / 2,
                True,
                4,
                ix='3%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='3%s' % c)
        net.update(sub_net)

    block_size = list('abcdef')
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name],
                1.0 / 2,
                1.0 / 2,
                True,
                4,
                ix='4%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='4%s' % c)
        net.update(sub_net)

    block_size = list('abc')
    for c in block_size:
        if c == 'a':
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name],
                1.0 / 2,
                1.0 / 2,
                True,
                4,
                ix='5%s' % c)
        else:
            sub_net, parent_layer_name = build_residual_block(
                net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='5%s' % c)
        net.update(sub_net)
    net['pool5'] = PoolLayer(net[parent_layer_name],
                             pool_size=7,
                             stride=1,
                             pad=0,
                             mode='average_exc_pad',
                             ignore_border=False)
    net['fc1000'] = DenseLayer(
        net['pool5'], num_units=nbClasses,
        nonlinearity=None)  # number of output units = nbClasses
    net['prob'] = NonlinearityLayer(net['fc1000'], nonlinearity=softmax)

    return net, net['prob']
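
# Hedged usage sketch for build_network_resnet50 above: build the graph for a
# hypothetical 10-class problem and compile a deterministic prediction function
# over batches of 1x120x120 inputs. build_simple_block and build_residual_block
# are assumed to be defined elsewhere in this collection.
import theano
import theano.tensor as T
import lasagne


def _compile_resnet50_predict(nb_classes=10):
    input_var = T.tensor4('inputs')
    net, output_layer = build_network_resnet50(input_var, nb_classes)
    probs = lasagne.layers.get_output(output_layer, deterministic=True)
    return theano.function([input_var], probs)
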
Beispiel #41
0
def test_space_invaders(
    game_title='SpaceInvaders-v0',
    n_parallel_games=3,
    replay_seq_len=2,
):
    """
    :param game_title: name of atari game in Gym
    :param n_parallel_games: how many games we run in parallel
    :param replay_seq_len: how long is one replay session from a batch
    """

    atari = gym.make(game_title)
    atari.reset()

    # Game Parameters
    n_actions = atari.action_space.n
    observation_shape = (None, ) + atari.observation_space.shape
    action_names = atari.get_action_meanings()
    del atari
    # ##### Agent observations

    # image observation at current tick goes here
    observation_layer = InputLayer(observation_shape, name="images input")

    # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly
    observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2))

    # Agent memory states
    window_size = 3

    # prev state input
    prev_window = InputLayer(
        (None, window_size) + tuple(observation_reshape.output_shape[1:]),
        name="previous window state")

    # our window
    window = WindowAugmentation(observation_reshape,
                                prev_window,
                                name="new window state")

    memory_dict = {window: prev_window}

    # ##### Neural network body
    # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc

    # pixel-wise maximum over the temporal window (to avoid flickering)
    window_max = ExpressionLayer(window,
                                 lambda a: a.max(axis=1),
                                 output_shape=(None, ) +
                                 window.output_shape[2:])

    # a simple lasagne network (try replacing with any other lasagne network and see what works best)
    nn = DenseLayer(window_max, num_units=50, name='dense0')

    # Agent policy and action picking
    q_eval = DenseLayer(nn,
                        num_units=n_actions,
                        nonlinearity=lasagne.nonlinearities.linear,
                        name="QEvaluator")

    #fakes for a2c
    policy_eval = DenseLayer(nn,
                             num_units=n_actions,
                             nonlinearity=lasagne.nonlinearities.softmax,
                             name="a2c action probas")
    state_value_eval = DenseLayer(nn,
                                  num_units=1,
                                  nonlinearity=None,
                                  name="a2c state values")
    # resolver
    resolver = ProbabilisticResolver(policy_eval, name="resolver")

    # agent
    agent = Agent(observation_layer, memory_dict,
                  (q_eval, policy_eval, state_value_eval), resolver)

    # Since it's a single lasagne network, one can get its weights, outputs, etc.
    weights = lasagne.layers.get_all_params(resolver, trainable=True)

    # Agent step function
    # # Create and manage a pool of atari sessions to play with

    pool = EnvPool(agent, game_title, n_parallel_games)

    observation_log, action_log, reward_log, _, _, _ = pool.interact(50)

    print(np.array(action_names)[np.array(action_log)[:3, :5]])

    # # experience replay pool
    # Create an environment with all default parameters
    env = SessionPoolEnvironment(observations=observation_layer,
                                 actions=resolver,
                                 agent_memories=agent.agent_states)

    def update_pool(env, pool, n_steps=100):
        """ a function that creates new sessions and ads them into the pool
        throwing the old ones away entirely for simplicity"""

        preceding_memory_states = list(pool.prev_memory_states)

        # get interaction sessions
        observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact(
            n_steps=n_steps)

        # load them into experience replay environment
        env.load_sessions(observation_tensor, action_tensor, reward_tensor,
                          is_alive_tensor, preceding_memory_states)

    # load the first sessions
    update_pool(env, pool, replay_seq_len)

    # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them.
    # ### Training via experience replay

    # get agent's Q-values, policy, etc obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # Evaluating loss function

    scaled_reward_seq = env.rewards
    # For SpaceInvaders, leaving the rewards unscaled works well enough

    elwise_mse_loss = 0.

    #1-step algos
    for algo in qlearning, sarsa:
        elwise_mse_loss += algo.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
        )
    #qlearning_n_step
    for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1,
              None):
        elwise_mse_loss += qlearning_n_step.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
            n_steps=n)

    #a2c n_step

    elwise_mse_loss += a2c_n_step.get_elementwise_objective(
        policy_sequence,
        value_sequence[:, :, 0],
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
        n_steps=3)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions

    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward],
                                updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop

    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i,loss %.5f, rewards: %.5f " %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
    def __init__(self, number_words, num_hidden, seq_length, mb_size):
        self.mb_size = mb_size

        x = T.imatrix()

        #sequence x minibatch x index
        one_hot_input = T.ftensor3()

        use_one_hot_input_flag = T.scalar()

        self.indices = x
        self.use_one_hot_input_flag = use_one_hot_input_flag
        self.one_hot_input = one_hot_input

        '''
        Flag for the input: one-hot or index.
        If index, compute the one-hot encoding and use that.
        If one-hot, just use the one-hot input directly.
        '''

        #Time seq x examples x words

        target = T.ivector()

        #word_embeddings = theano.shared(np.random.normal(size = ((number_words, 1, num_hidden))).astype('float32'))

        word_embeddings = theano.shared(np.random.normal(size = ((number_words, num_hidden))).astype('float32'))

        feature_lst = []

        for i in range(0, seq_length):
            #feature = word_embeddings[x[:,i]]
            #instead of this, multiply by one-hot matrix

            one_hot = T.extra_ops.to_one_hot(x[:,i], number_words)

            # word_embeddings: (number_words, num_hidden), e.g. 30k x 400
            # one_hot_use:     (mb_size, number_words),    e.g. 128 x 30k
            # dot product  ->  (mb_size, num_hidden),      e.g. 128 x 400

            one_hot_use = ifelse(use_one_hot_input_flag,
                                 one_hot_input[i],
                                 T.extra_ops.to_one_hot(x[:, i], number_words))

            feature = T.reshape(T.dot(one_hot_use, word_embeddings),
                                (1, mb_size, num_hidden)).transpose(1, 0, 2)

            feature_lst.append(feature)

        features = T.concatenate(feature_lst, 1)

        #example x sequence_position x feature
        l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden), num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0)
        l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0,
                             backwards=True)
        l_lstm_3 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0)

        lstm_1_out = l_lstm_1.get_output_for([features])
        lstm_2_out = l_lstm_2.get_output_for([T.concatenate([lstm_1_out, features], axis = 2)])
        lstm_3_out = l_lstm_3.get_output_for([T.concatenate([lstm_2_out, features], axis = 2)])

        final_out = T.mean(lstm_3_out, axis = 1)

        #final_out = T.mean(features, axis = 1)
        h_out_1 = DenseLayer((mb_size, num_hidden), num_units = 2048, nonlinearity=lasagne.nonlinearities.rectify)

        h_out_2 = DenseLayer((mb_size, 2048), num_units = 2048, nonlinearity=lasagne.nonlinearities.rectify)

        h_out_3 = DenseLayer((mb_size, 2048), num_units = 1, nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out)
        h_out_2_value = h_out_2.get_output_for(h_out_1_value)
        h_out_3_value = h_out_3.get_output_for(h_out_2_value)
        classification = T.nnet.sigmoid(h_out_3_value)
        self.loss = T.mean(T.nnet.binary_crossentropy(
            output=classification.flatten(), target=target))
        self.params = (lasagne.layers.get_all_params(h_out_1, trainable=True)
                       + lasagne.layers.get_all_params(h_out_3, trainable=True)
                       + [word_embeddings]
                       + lasagne.layers.get_all_params(l_lstm_1, trainable=True)
                       + lasagne.layers.get_all_params(l_lstm_2, trainable=True))

        self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
        self.params += lasagne.layers.get_all_params(l_lstm_3, trainable=True)

        all_grads = T.grad(self.loss, self.params)

        for j in range(0, len(all_grads)):
            all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])

        scaled_grads = lasagne.updates.total_norm_constraint(all_grads, 5.0)

        updates = lasagne.updates.adam(scaled_grads, self.params)
        self.train_func = theano.function(
            inputs=[x, target, use_one_hot_input_flag, one_hot_input],
            outputs={'l': self.loss,
                     'c': classification,
                     'g_w': T.sum(T.sqr(T.grad(self.loss, word_embeddings)))},
            updates=updates)
        self.evaluate_func = theano.function(
            inputs=[x, use_one_hot_input_flag, one_hot_input],
            outputs={'c': classification})
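
# Hedged usage sketch for the model whose __init__ is shown above (the class name
# and this helper are hypothetical; only the constructor appears in the snippet).
# It illustrates the use_one_hot_input_flag convention: with the flag at 0 the
# index matrix is used and the one-hot tensor is just a dummy of the right shape.
import numpy as np


def _train_step_sketch(model, x_idx, targets, number_words, seq_length, mb_size):
    # x_idx: (mb_size, seq_length) int32 word indices; targets: (mb_size,) int32 labels
    dummy_one_hot = np.zeros((seq_length, mb_size, number_words), dtype='float32')
    out = model.train_func(x_idx.astype('int32'),
                           targets.astype('int32'),
                           np.float32(0.),  # flag = 0 -> use the index input
                           dummy_one_hot)
    return out['l'], out['c']
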