def discriminator(x, z, params, mb_size, num_hidden, num_latent):

    x_z = T.concatenate([x,z], axis = 1)


    h_out_1 = DenseLayer((mb_size, num_hidden + num_latent), num_units = num_hidden, nonlinearity=None, W = params['W_disc_1'])

    h_out_2 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_2'])

    h_out_3 = DenseLayer((mb_size, num_hidden), num_units = num_hidden, nonlinearity=None, W = params['W_disc_3'])

    h_out_4 = DenseLayer((mb_size, 1), num_units = 1, nonlinearity=None, W = params['W_disc_4'], b = params['b_disc_4'])

    h_out_1_value = h_out_1.get_output_for(x_z)

    h_out_1_value = T.maximum(0.0, (h_out_1_value - T.mean(h_out_1_value, axis = 0)) / (1.0 + T.std(h_out_1_value, axis = 0)) + params['b_disc_1'])

    h_out_2_value = h_out_2.get_output_for(h_out_1_value)

    h_out_2_value = T.maximum(0.0, (h_out_2_value - T.mean(h_out_2_value, axis = 0)) / (1.0 + T.std(h_out_2_value, axis = 0)) + params['b_disc_2'])

    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    h_out_3_value = T.maximum(0.0, (h_out_3_value - T.mean(h_out_3_value, axis = 0)) / (1.0 + T.std(h_out_3_value, axis = 0)) + params['b_disc_3'])

    h_out_4_value = h_out_4.get_output_for(h_out_3_value)

    raw_y = h_out_4_value

    classification = T.nnet.sigmoid(raw_y)

    results = {'c' : classification}

    return results
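The three hidden layers above repeat the same inline standardize-then-ReLU pattern: subtract the minibatch mean, divide by one plus the minibatch standard deviation, add the bias, then rectify. A minimal sketch (not from the original source) that factors this into a helper:

import theano.tensor as T

def standardized_relu(pre_activation, bias):
    # Per-unit statistics over the minibatch (axis 0); the +1.0 in the
    # denominator is the stabilizer used by the layers above.
    mu = T.mean(pre_activation, axis=0)
    sigma = T.std(pre_activation, axis=0)
    return T.maximum(0.0, (pre_activation - mu) / (1.0 + sigma) + bias)

With this helper, each block collapses to h_out_1_value = standardized_relu(h_out_1.get_output_for(x_z), params['b_disc_1']), and likewise for the other layers.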
Example #2
    def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):
        
        tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
        rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

        hidden_state_features = T.specify_shape(T.concatenate([tf_states, rf_states], axis = 1), (seq_length, mb_size * 2, num_features))

        gru_params_1 = init_tparams(param_init_gru(None, {}, prefix = "gru1", dim = num_hidden, nin = num_features))
        #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features))
        #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix = 'gru1')[0]
        #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0]
        #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0]

        final_out_recc = T.specify_shape(T.mean(gru_1_out, axis = 0), (mb_size * 2, num_hidden))

        h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units = 1, nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out_recc)
        h_out_4_value = h_out_4.get_output_for(h_out_1_value)

        raw_y = h_out_4_value
        #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
        classification = T.nnet.sigmoid(raw_y)

        #tf comes before rf.  
        p_real =  classification[:mb_size]
        p_gen  = classification[mb_size:]

        #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

        self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
        self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
        self.d_cost = self.d_cost_real + self.d_cost_gen
        self.g_cost = self.g_cost_d


        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1,trainable=True)

        self.params += gru_params_1.values()
        #self.params += gru_params_2.values()
        #self.params += gru_params_3.values()

        self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten())) + T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten()))
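bce is not defined in this snippet; a standard choice consistent with its use on sigmoid outputs is elementwise binary cross-entropy on probabilities. A hedged sketch of that assumption (the commented-out lambda above is the same loss expressed on raw logits via softplus):

import theano.tensor as T

def bce(prediction, target):
    # Elementwise binary cross-entropy on probabilities in (0, 1);
    # an assumption about the helper used by the costs above.
    return T.nnet.binary_crossentropy(prediction, target)

Smoothing the targets to 0.9 and 0.1 instead of 1 and 0 (as the d_cost terms above do) keeps the discriminator's sigmoid out of its saturated region, which tends to stabilize GAN training.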
Example #3
def define_network(x, params, config):

    num_hidden = config['num_hidden']
    mb_size = config['mb_size']
    num_latent = config['num_latent']

    enc = encoder(x, params, config)

    mean_layer = DenseLayer((mb_size, num_hidden), num_units = num_latent, nonlinearity=None, W = params['z_mean_W'], b = params['z_mean_b'])
    std_layer = DenseLayer((mb_size, num_hidden), num_units = num_latent, nonlinearity=None, W = params['z_std_W'], b = params['z_std_b'])

    mean = mean_layer.get_output_for(enc['h'])
    std = T.exp(std_layer.get_output_for(enc['h']))

    srng = theano.tensor.shared_randomstreams.RandomStreams(420)

    z_sampled = srng.normal(size = mean.shape, avg = 0.0, std = 1.0)
    z_extra = 0.0 * srng.normal(size = mean.shape, avg = 0.0, std = 1.0)

    z_reconstruction = mean + (0.0 + std * 0.0) * srng.normal(size = mean.shape, avg = 0.0, std = 1.0)

    #z_var = std**2
    z_loss = 0.0 * 0.5 * T.sum(T.clip(mean**2, 4.0, 999999.9) + std**2 - T.log(std**2) - 1.0)

    dec_reconstruction = decoder(z_reconstruction, z_extra, params, config)
    dec_sampled = decoder(z_sampled, z_extra, params, config)

    interp_lst = []

    for j in range(0,128):
        interp_lst.append(z_reconstruction[0] * (j/128.0) + z_reconstruction[-1] * (1 - j / 128.0))

    z_interp = T.stack(interp_lst, axis = 0)  # (128, num_latent): one interpolated latent per row

    dec_interp = decoder(z_interp, z_extra, params, config)

    results_map = {'reconstruction' : dec_reconstruction['h'], 'z_loss' : z_loss, 'sample' : dec_sampled['h'], 'interp' : dec_interp['h'], 'z' : z_reconstruction}

    return results_map
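Note that z_loss above is multiplied by 0.0, so the KL penalty is disabled (and the noise term in z_reconstruction is also zeroed), leaving a plain autoencoder. For reference, a sketch of the standard closed-form KL divergence between the diagonal Gaussian posterior N(mean, std^2) and the unit prior that the clipped expression is based on; this is the textbook term, not the configuration used above:

import theano.tensor as T

def gaussian_kl(mean, std):
    # KL( N(mean, std^2) || N(0, I) ), summed over latent dimensions and batch.
    return 0.5 * T.sum(mean ** 2 + std ** 2 - T.log(std ** 2) - 1.0)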
Example #4
    def __init__(self, number_words, num_hidden, seq_length, mb_size):
        self.mb_size = mb_size
        x = T.imatrix()
        target = T.ivector()
        word_embeddings = theano.shared(
            np.random.normal(size=((number_words, 1,
                                    num_hidden))).astype('float32'))
        feature_lst = []
        for i in range(0, seq_length):
            feature = word_embeddings[x[:, i]]
            feature_lst.append(feature)
        features = T.concatenate(feature_lst, 1)

        #example x sequence_position x feature
        #inp = InputLayer(shape = (seq_length, mb_size, num_hidden), input_var = features)
        l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.tanh)
        l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.tanh)

        #minibatch x sequence x feature
        final_out = T.mean(l_lstm_2.get_output_for(
            [l_lstm_1.get_output_for([features])]),
                           axis=1)

        #final_out = T.mean(features, axis = 1)
        h_out = DenseLayer((mb_size, num_hidden),
                           num_units=1,
                           nonlinearity=None)
        h_out_value = h_out.get_output_for(final_out)
        classification = T.nnet.sigmoid(h_out_value)
        self.loss = T.mean(
            T.nnet.binary_crossentropy(output=classification.flatten(),
                                       target=target))
        self.params = lasagne.layers.get_all_params(h_out, trainable=True) + [
            word_embeddings
        ] + lasagne.layers.get_all_params(
            l_lstm_1, trainable=True) + lasagne.layers.get_all_params(
                l_lstm_2, trainable=True)
        updates = lasagne.updates.adam(self.loss, self.params)
        self.train_func = theano.function(inputs=[x, target],
                                          outputs={
                                              'l': self.loss,
                                              'c': classification
                                          },
                                          updates=updates)
        self.evaluate_func = theano.function(inputs=[x],
                                             outputs={'c': classification})
Example #5

def decoder(z, params, config):

    mb_size = config['mb_size']
    num_latent = config['num_latent']
    num_hidden = config['num_hidden']

    h_out_1 = HiddenLayer(num_in = num_latent, num_out = num_hidden, W = params['W_dec_1'], b = params['b_dec_1'], activation = 'relu', batch_norm = True)

    h_out_2 = HiddenLayer(num_in = num_hidden, num_out = num_hidden, W = params['W_dec_2'], b = params['b_dec_2'], activation = 'relu', batch_norm = True)

    h_out_3 = DenseLayer((mb_size, num_hidden), num_units = 4000, nonlinearity=None, W = params['W_dec_3'], b = params['b_dec_3'])

    h_out_1_value = h_out_1.output(z)
    h_out_2_value = h_out_2.output(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    return {'h' : h_out_3_value}
Example #6
def define_network(x, params, config):

    num_hidden = config['num_hidden']
    mb_size = config['mb_size']
    num_latent = config['num_latent']

    enc = encoder(x, params, config)

    mean_layer = DenseLayer((mb_size, num_hidden), num_units = num_latent, nonlinearity=None, W = params['z_mean_W'], b = params['z_mean_b'])
    #std_layer = DenseLayer((mb_size, num_hidden), num_units = num_latent, nonlinearity=None, W = params['z_std_W'], b = params['z_std_b'])

    mean = mean_layer.get_output_for(enc['h'])
    #std = T.exp(std_layer.get_output_for(enc['h']))

    srng = theano.tensor.shared_randomstreams.RandomStreams(420)

    z_sampled = srng.normal(size = mean.shape, avg = 0.0, std = 1.0)
    z_extra = 0.0 * srng.normal(size = mean.shape, avg = 0.0, std = 1.0)

    z_reconstruction = mean

    #z_var = std**2
    z_loss = 0.0 * T.sum(mean)#0.001 * 0.5 * T.sum(mean**2 + z_var - T.log(z_var) - 1.0)

    dec_reconstruction = decoder(z_reconstruction, z_extra, params, config)
    dec_sampled = decoder(z_sampled, z_extra, params, config)

    interp_lst = []

    for j in range(0,128):
        interp_lst.append(z_reconstruction[0] * (j/128.0) + z_reconstruction[-1] * (1 - j / 128.0))

    z_interp = T.stack(interp_lst, axis = 0)  # (128, num_latent): one interpolated latent per row

    dec_interp = decoder(z_interp, z_extra, params, config)

    results_map = {'reconstruction' : dec_reconstruction['h'], 'z_loss' : z_loss, 'sample' : dec_sampled['h'], 'interp' : dec_interp['h'], 'z' : z_reconstruction}

    return results_map
Example #7
    def __init__(self, num_hidden, num_features, mb_size,
                 hidden_state_features, target):
        self.mb_size = mb_size
        #self.seq_length = seq_length

        #second argument 1.0 disables dropout here (an earlier setting was 0.8)
        hidden_state_features = dropout(hidden_state_features, 1.0)

        gru_params_1 = init_tparams(
            param_init_gru(None, {},
                           prefix="gru1",
                           dim=num_hidden,
                           nin=num_features))
        gru_params_2 = init_tparams(
            param_init_gru(None, {},
                           prefix="gru2",
                           dim=num_hidden,
                           nin=num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1,
                              hidden_state_features,
                              None,
                              prefix='gru1',
                              gradient_steps=100)[0]
        gru_2_out = gru_layer(gru_params_2,
                              T.concatenate([gru_1_out, hidden_state_features],
                                            axis=2),
                              None,
                              prefix='gru2',
                              backwards=True,
                              gradient_steps=100)[0]

        self.gru_1_out = gru_1_out

        final_out_recc = T.mean(gru_2_out, axis=0)

        h_out_1 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.rectify)
        h_out_2 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=1,
                             nonlinearity=None)

        h_out_1_value = dropout(h_out_1.get_output_for(final_out_recc), 1.0)
        h_out_2_value = dropout(h_out_2.get_output_for(h_out_1_value), 1.0)
        h_out_4_value = h_out_4.get_output_for(h_out_2_value)

        raw_y = T.clip(h_out_4_value, -10.0, 10.0)

        classification = T.nnet.sigmoid(raw_y)

        self.accuracy = T.mean(
            T.eq(target,
                 T.gt(classification, 0.5).flatten()))

        p_real = classification[0:mb_size]
        p_gen = classification[mb_size:mb_size * 2]

        self.d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()

        self.g_cost_real = bce(p_real, T.zeros(p_real.shape)).mean()
        self.g_cost_gen = bce(p_gen, T.ones(p_gen.shape)).mean()

        #self.g_cost = self.g_cost_gen
        self.g_cost = self.g_cost_real + self.g_cost_gen

        print "pulling both TF and PF togeher"

        self.d_cost = self.d_cost_real + self.d_cost_gen
        #if d_cost < 1.0, use g cost.

        self.d_cost = T.switch(
            T.gt(self.accuracy, 0.95) * T.gt(p_real.mean(), 0.99) *
            T.lt(p_gen.mean(), 0.01), 0.0, self.d_cost)
        '''
        gX = gen(Z, *gen_params)

        p_real = discrim(X, *discrim_params)
        p_gen = discrim(gX, *discrim_params)

        d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
        d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
        g_cost_d = bce(p_gen, T.ones(p_gen.shape)).mean()

        d_cost = d_cost_real + d_cost_gen
        g_cost = g_cost_d

        cost = [g_cost, d_cost, g_cost_d, d_cost_real, d_cost_gen]
        d_updates = d_updater(discrim_params, d_cost)
        g_updates = g_updater(gen_params, g_cost)

        '''

        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)

        #self.params += h_out_1.getParams() + h_out_2.getParams() + h_out_3.getParams()

        #        self.params += lasagne.layers.get_all_params(h_initial_1,trainable=True)
        #        self.params += lasagne.layers.get_all_params(h_initial_2,trainable=True)

        self.params += gru_params_1.values()
        self.params += gru_params_2.values()
        '''
        layerParams = c1.getParams()
        for paramKey in layerParams:
            self.params += [layerParams[paramKey]]
        layerParams = c2.getParams()
        for paramKey in layerParams:
            self.params += [layerParams[paramKey]]
        layerParams = c3.getParams()
        for paramKey in layerParams:
            self.params += [layerParams[paramKey]]

        '''

        #all_grads = T.grad(self.loss, self.params)
        #for j in range(0, len(all_grads)):
        #    all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])
        #self.updates = lasagne.updates.adam(all_grads, self.params, learning_rate = 0.0001, beta1 = 0.5)
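The commented-out block above sketches NaN-guarded updates, and the final example in this listing uses the same pattern live. A self-contained sketch of that idea (the learning rate and beta1 values mirror the commented line; treat them as assumptions):

import theano.tensor as T
import lasagne

def nan_safe_adam_updates(cost, params, learning_rate=0.0001, beta1=0.5):
    # Replace any NaN gradient entries with zeros before the Adam step,
    # so a single bad minibatch cannot poison the parameters.
    grads = T.grad(cost, params)
    grads = [T.switch(T.isnan(g), T.zeros_like(g), g) for g in grads]
    return lasagne.updates.adam(grads, params, learning_rate=learning_rate, beta1=beta1)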
Example #8
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} \propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory.
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If
        ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`.
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller, num_units=self.memory_shape[1],
                W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign,
                name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller, num_units=self.memory_shape[1],
            W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key,
            name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b
        
        self.beta = DenseLayer(controller, num_units=1,
            W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta,
            name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller, num_units=1,
            W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate,
            name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller, num_units=num_shifts,
            W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift,
            name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller, num_units=1,
            W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma,
            name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(
            weights_init, (1, self.memory_shape[0]),
            name='weights_init', trainable=learn_init, regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(w_g_padded, conv_filter,
            input_shape=(self.input_shape[0], 1, 1, self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        w /= T.sum(w, axis=1, keepdims=True)  # normalize each example's weights to sum to 1

        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)

        return params
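Before the next example, a small NumPy illustration (an assumption, not part of the class above) of the four addressing stages the docstring equations describe, for a single example and a circular shift as in the NTM paper:

import numpy as np

def address(k_t, beta_t, g_t, s_t, gamma_t, w_tm1, M_t):
    # Content addressing (3.3.1): cosine similarity of the key against each memory row.
    sim = M_t.dot(k_t) / (np.linalg.norm(M_t, axis=1) * np.linalg.norm(k_t) + 1e-6)
    e = np.exp(beta_t * sim)
    w_c = e / e.sum()
    # Interpolation (3.3.2) with the previous weighting.
    w_g = g_t * w_c + (1.0 - g_t) * w_tm1
    # Convolutional shift (3.3.2); offsets centered on 0, e.g. [-1, 0, 1] for 3 shifts.
    offsets = range(-(len(s_t) // 2), len(s_t) // 2 + 1)
    n = len(w_g)
    w_tilde = np.zeros(n)
    for i in range(n):
        for s, off in zip(s_t, offsets):
            w_tilde[i] += w_g[(i - off) % n] * s
    # Sharpening (3.3.2).
    w = (w_tilde + 1e-6) ** gamma_t
    return w / w.sum()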
Example #9
    def __init__(self, num_hidden, num_features, seq_length, mb_size,
                 tf_states, rf_states):

        tf_states = T.specify_shape(tf_states,
                                    (seq_length, mb_size, num_features))
        rf_states = T.specify_shape(rf_states,
                                    (seq_length, mb_size, num_features))

        hidden_state_features = T.specify_shape(
            T.concatenate([tf_states, rf_states], axis=1),
            (seq_length, mb_size * 2, num_features))

        gru_params_1 = init_tparams(
            param_init_gru(None, {},
                           prefix="gru1",
                           dim=num_hidden,
                           nin=num_features))
        #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix = "gru2", dim = num_hidden, nin = num_hidden + num_features))
        #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix = "gru3", dim = num_hidden, nin = num_hidden + num_features))

        gru_1_out = gru_layer(gru_params_1,
                              hidden_state_features,
                              None,
                              prefix='gru1')[0]
        #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis = 2), None, prefix = 'gru2', backwards = True)[0]
        #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis = 2), None, prefix = 'gru3')[0]

        final_out_recc = T.specify_shape(T.mean(gru_1_out, axis=0),
                                         (mb_size * 2, num_hidden))

        h_out_1 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=num_hidden,
                             nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units = num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
        h_out_4 = DenseLayer((mb_size * 2, num_hidden),
                             num_units=1,
                             nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out_recc)
        h_out_4_value = h_out_4.get_output_for(h_out_1_value)

        raw_y = h_out_4_value
        #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
        classification = T.nnet.sigmoid(raw_y)

        #tf comes before rf.
        p_real = classification[:mb_size]
        p_gen = classification[mb_size:]

        #bce = lambda r,t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

        self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
        self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
        self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
        self.d_cost = self.d_cost_real + self.d_cost_gen
        self.g_cost = self.g_cost_d

        self.classification = classification

        self.params = []
        self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_3,trainable=True)
        #self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)

        self.params += gru_params_1.values()
        #self.params += gru_params_2.values()
        #self.params += gru_params_3.values()

        self.accuracy = T.mean(
            T.eq(T.ones(p_real.shape).flatten(),
                 T.gt(p_real, 0.5).flatten())) + T.mean(
                     T.eq(
                         T.ones(p_gen.shape).flatten(),
                         T.lt(p_gen, 0.5).flatten()))
Example #10
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} \propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts
        in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory.
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is
        ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If
        ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`.
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
    """
    def __init__(self,
                 controller,
                 num_shifts=3,
                 memory_shape=(128, 20),
                 W_hid_to_sign=None,
                 b_hid_to_sign=lasagne.init.Constant(0.),
                 nonlinearity_sign=nonlinearities.ClippedLinear(low=-1.,
                                                                high=1.),
                 W_hid_to_key=lasagne.init.GlorotUniform(),
                 b_hid_to_key=lasagne.init.Constant(0.),
                 nonlinearity_key=nonlinearities.ClippedLinear(low=0.,
                                                               high=1.),
                 W_hid_to_beta=lasagne.init.GlorotUniform(),
                 b_hid_to_beta=lasagne.init.Constant(0.),
                 nonlinearity_beta=lasagne.nonlinearities.rectify,
                 W_hid_to_gate=lasagne.init.GlorotUniform(),
                 b_hid_to_gate=lasagne.init.Constant(0.),
                 nonlinearity_gate=nonlinearities.hard_sigmoid,
                 W_hid_to_shift=lasagne.init.GlorotUniform(),
                 b_hid_to_shift=lasagne.init.Constant(0.),
                 nonlinearity_shift=lasagne.nonlinearities.softmax,
                 W_hid_to_gamma=lasagne.init.GlorotUniform(),
                 b_hid_to_gamma=lasagne.init.Constant(0.),
                 nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.
                 rectify(x),
                 weights_init=init.OneHot(),
                 learn_init=False,
                 **kwargs):
        super(Head, self).__init__(controller, **kwargs)

        self.memory_shape = memory_shape
        self.basename = kwargs.get('name', 'head')
        self.learn_init = learn_init

        if W_hid_to_sign is not None:
            self.sign = DenseLayer(controller,
                                   num_units=self.memory_shape[1],
                                   W=W_hid_to_sign,
                                   b=b_hid_to_sign,
                                   nonlinearity=nonlinearity_sign,
                                   name=self.basename + '.sign')
            self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
        else:
            self.sign = None
            self.W_hid_to_sign, self.b_hid_to_sign = None, None

        self.key = DenseLayer(controller,
                              num_units=self.memory_shape[1],
                              W=W_hid_to_key,
                              b=b_hid_to_key,
                              nonlinearity=nonlinearity_key,
                              name=self.basename + '.key')
        self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b

        self.beta = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_beta,
                               b=b_hid_to_beta,
                               nonlinearity=nonlinearity_beta,
                               name=self.basename + '.beta')
        self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

        self.gate = DenseLayer(controller,
                               num_units=1,
                               W=W_hid_to_gate,
                               b=b_hid_to_gate,
                               nonlinearity=nonlinearity_gate,
                               name=self.basename + '.gate')
        self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

        self.num_shifts = num_shifts
        self.shift = DenseLayer(controller,
                                num_units=num_shifts,
                                W=W_hid_to_shift,
                                b=b_hid_to_shift,
                                nonlinearity=nonlinearity_shift,
                                name=self.basename + '.shift')
        self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

        self.gamma = DenseLayer(controller,
                                num_units=1,
                                W=W_hid_to_gamma,
                                b=b_hid_to_gamma,
                                nonlinearity=nonlinearity_gamma,
                                name=self.basename + '.gamma')
        self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

        self.weights_init = self.add_param(weights_init,
                                           (1, self.memory_shape[0]),
                                           name='weights_init',
                                           trainable=learn_init,
                                           regularizable=False)

    def get_output_for(self, h_t, w_tm1, M_t, **kwargs):
        if self.sign is not None:
            sign_t = self.sign.get_output_for(h_t, **kwargs)
        else:
            sign_t = 1.
        k_t = self.key.get_output_for(h_t, **kwargs)
        beta_t = self.beta.get_output_for(h_t, **kwargs)
        g_t = self.gate.get_output_for(h_t, **kwargs)
        s_t = self.shift.get_output_for(h_t, **kwargs)
        gamma_t = self.gamma.get_output_for(h_t, **kwargs)

        # Content Addressing (3.3.1)
        beta_t = T.addbroadcast(beta_t, 1)
        betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t)
        w_c = lasagne.nonlinearities.softmax(betaK)

        # Interpolation (3.3.2)
        g_t = T.addbroadcast(g_t, 1)
        w_g = g_t * w_c + (1. - g_t) * w_tm1

        # Convolutional Shift (3.3.2)
        w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1)
        conv_filter = s_t.dimshuffle(0, 'x', 'x', 1)
        pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2)
        w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3)
        convolution = T.nnet.conv2d(
            w_g_padded,
            conv_filter,
            input_shape=(self.input_shape[0], 1, 1,
                         self.memory_shape[0] + pad[0] + pad[1]),
            filter_shape=(self.input_shape[0], 1, 1, self.num_shifts),
            subsample=(1, 1),
            border_mode='valid')
        w_tilde = convolution[:, 0, 0, :]

        # Sharpening (3.3.2)
        gamma_t = T.addbroadcast(gamma_t, 1)
        w = T.pow(w_tilde + 1e-6, gamma_t)
        w /= T.sum(w, axis=1, keepdims=True)  # normalize each example's weights to sum to 1

        return w

    def get_params(self, **tags):
        params = super(Head, self).get_params(**tags)
        if self.sign is not None:
            params += self.sign.get_params(**tags)
        params += self.key.get_params(**tags)
        params += self.beta.get_params(**tags)
        params += self.gate.get_params(**tags)
        params += self.shift.get_params(**tags)
        params += self.gamma.get_params(**tags)

        return params
Example #11

    def __init__(self, number_words, num_hidden, seq_length, mb_size):
        self.mb_size = mb_size

        x = T.imatrix()

        #sequence x minibatch x index
        one_hot_input = T.ftensor3()

        use_one_hot_input_flag = T.scalar()

        self.indices = x
        self.use_one_hot_input_flag = use_one_hot_input_flag
        self.one_hot_input = one_hot_input

        '''
        Flag for input: one-hot or index.
        If index, compute the one-hot encoding and use that.
        If one-hot, use the one-hot input directly.
        '''

        #Time seq x examples x words

        target = T.ivector()

        #word_embeddings = theano.shared(np.random.normal(size = ((number_words, 1, num_hidden))).astype('float32'))

        word_embeddings = theano.shared(np.random.normal(size = ((number_words, num_hidden))).astype('float32'))

        feature_lst = []

        for i in range(0, seq_length):
            #Instead of indexing word_embeddings directly, multiply by a one-hot
            #matrix, so the same graph accepts either hard indices or soft
            #one-hot inputs supplied from outside.
            #one_hot_use: (mb_size, number_words) x word_embeddings: (number_words, num_hidden)
            #-> (mb_size, num_hidden)

            one_hot_use = ifelse(use_one_hot_input_flag, one_hot_input[i], T.extra_ops.to_one_hot(x[:,i], number_words))

            feature = T.reshape(T.dot(one_hot_use, word_embeddings), (1,mb_size,num_hidden)).transpose(1,0,2)

            feature_lst.append(feature)

        features = T.concatenate(feature_lst, 1)

        #example x sequence_position x feature
        l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden), num_units = num_hidden, nonlinearity = lasagne.nonlinearities.tanh, grad_clipping=100.0)
        l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units = num_hidden, nonlinearity = lasagne.nonlinearities.tanh, grad_clipping=100.0, backwards = True)
        l_lstm_3 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units = num_hidden, nonlinearity = lasagne.nonlinearities.tanh, grad_clipping=100.0)

        lstm_1_out = l_lstm_1.get_output_for([features])
        lstm_2_out = l_lstm_2.get_output_for([T.concatenate([lstm_1_out, features], axis = 2)])
        lstm_3_out = l_lstm_3.get_output_for([T.concatenate([lstm_2_out, features], axis = 2)])

        final_out = T.mean(lstm_3_out, axis = 1)

        #final_out = T.mean(features, axis = 1)
        h_out_1 = DenseLayer((mb_size, num_hidden), num_units = 2048, nonlinearity=lasagne.nonlinearities.rectify)

        h_out_2 = DenseLayer((mb_size, 2048), num_units = 2048, nonlinearity=lasagne.nonlinearities.rectify)

        h_out_3 = DenseLayer((mb_size, 2048), num_units = 1, nonlinearity=None)

        h_out_1_value = h_out_1.get_output_for(final_out)
        h_out_2_value = h_out_2.get_output_for(h_out_1_value)
        h_out_3_value = h_out_3.get_output_for(h_out_2_value)
        classification = T.nnet.sigmoid(h_out_3_value)
        self.loss = T.mean(T.nnet.binary_crossentropy(output = classification.flatten(), target = target))
        self.params = lasagne.layers.get_all_params(h_out_1,trainable=True) + lasagne.layers.get_all_params(h_out_3,trainable=True) + [word_embeddings] + lasagne.layers.get_all_params(l_lstm_1, trainable = True) + lasagne.layers.get_all_params(l_lstm_2, trainable = True)

        self.params += lasagne.layers.get_all_params(h_out_2,trainable=True)
        self.params += lasagne.layers.get_all_params(l_lstm_3,trainable=True)

        all_grads = T.grad(self.loss, self.params)

        for j in range(0, len(all_grads)):
            all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])

        scaled_grads = lasagne.updates.total_norm_constraint(all_grads, 5.0)

        updates = lasagne.updates.adam(scaled_grads, self.params)
        self.train_func = theano.function(inputs = [x, target, use_one_hot_input_flag, one_hot_input], outputs = {'l' : self.loss, 'c' : classification, 'g_w' : T.sum(T.sqr(T.grad(self.loss, word_embeddings)))}, updates = updates)
        self.evaluate_func = theano.function(inputs = [x, use_one_hot_input_flag, one_hot_input], outputs = {'c' : classification})
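The per-timestep loop above replaces direct embedding indexing with a one-hot matrix product so the same graph can consume either hard indices or externally supplied soft one-hot rows. A standalone NumPy sanity check of that equivalence (the sizes are assumptions, kept small for memory):

import numpy as np

W = np.random.randn(1000, 64).astype('float32')   # stand-in for word_embeddings
idx = np.random.randint(0, 1000, size=128)
one_hot = np.eye(1000, dtype='float32')[idx]      # (128, 1000)

# Multiplying the one-hot matrix by the table equals fancy indexing,
# which is why the ifelse branch can swap freely between the two inputs.
assert np.allclose(one_hot.dot(W), W[idx])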