def discriminator(x, z, params, mb_size, num_hidden, num_latent):
    x_z = T.concatenate([x, z], axis=1)

    h_out_1 = DenseLayer((mb_size, num_hidden + num_latent), num_units=num_hidden, nonlinearity=None, W=params['W_disc_1'])
    h_out_2 = DenseLayer((mb_size, num_hidden), num_units=num_hidden, nonlinearity=None, W=params['W_disc_2'])
    h_out_3 = DenseLayer((mb_size, num_hidden), num_units=num_hidden, nonlinearity=None, W=params['W_disc_3'])
    #Input shape fixed: this layer consumes the num_hidden features produced by
    #h_out_3 (the original read (mb_size, 1), which does not match W_disc_4).
    h_out_4 = DenseLayer((mb_size, num_hidden), num_units=1, nonlinearity=None, W=params['W_disc_4'], b=params['b_disc_4'])

    h_out_1_value = h_out_1.get_output_for(x_z)
    h_out_1_value = T.maximum(0.0, (h_out_1_value - T.mean(h_out_1_value, axis=0)) / (1.0 + T.std(h_out_1_value, axis=0)) + params['b_disc_1'])

    h_out_2_value = h_out_2.get_output_for(h_out_1_value)
    h_out_2_value = T.maximum(0.0, (h_out_2_value - T.mean(h_out_2_value, axis=0)) / (1.0 + T.std(h_out_2_value, axis=0)) + params['b_disc_2'])

    h_out_3_value = h_out_3.get_output_for(h_out_2_value)
    h_out_3_value = T.maximum(0.0, (h_out_3_value - T.mean(h_out_3_value, axis=0)) / (1.0 + T.std(h_out_3_value, axis=0)) + params['b_disc_3'])

    h_out_4_value = h_out_4.get_output_for(h_out_3_value)

    raw_y = h_out_4_value
    classification = T.nnet.sigmoid(raw_y)

    results = {'c': classification}

    return results
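#All three hidden layers above repeat the same standardize-then-ReLU pattern:
#normalize each feature by its minibatch statistics (with a 1.0 added to the
#denominator to damp the division), add the bias, then rectify. A minimal
#sketch factoring this out; the helper name is hypothetical, not from the
#original code.
def standardized_relu(h, b):
    h = (h - T.mean(h, axis=0)) / (1.0 + T.std(h, axis=0))
    return T.maximum(0.0, h + b)

#e.g. h_out_1_value = standardized_relu(h_out_1.get_output_for(x_z), params['b_disc_1'])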
def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):
    tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
    rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

    hidden_state_features = T.specify_shape(T.concatenate([tf_states, rf_states], axis=1), (seq_length, mb_size * 2, num_features))

    gru_params_1 = init_tparams(param_init_gru(None, {}, prefix="gru1", dim=num_hidden, nin=num_features))
    #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix="gru2", dim=num_hidden, nin=num_hidden + num_features))
    #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix="gru3", dim=num_hidden, nin=num_hidden + num_features))

    gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix='gru1')[0]
    #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis=2), None, prefix='gru2', backwards=True)[0]
    #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis=2), None, prefix='gru3')[0]

    final_out_recc = T.specify_shape(T.mean(gru_1_out, axis=0), (mb_size * 2, num_hidden))

    h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units=1, nonlinearity=None)

    h_out_1_value = h_out_1.get_output_for(final_out_recc)
    h_out_4_value = h_out_4.get_output_for(h_out_1_value)

    raw_y = h_out_4_value
    #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
    classification = T.nnet.sigmoid(raw_y)

    #tf comes before rf.
    p_real = classification[:mb_size]
    p_gen = classification[mb_size:]

    #bce = lambda r, t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))

    self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
    self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
    self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
    self.d_cost = self.d_cost_real + self.d_cost_gen
    self.g_cost = self.g_cost_d

    self.classification = classification

    self.params = []
    self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_out_3, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)

    self.params += gru_params_1.values()
    #self.params += gru_params_2.values()
    #self.params += gru_params_3.values()

    self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten())) + T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten()))
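#`bce` is defined outside this snippet. Since `classification` is already a
#sigmoid output, a stand-in consistent with the bce(output, target) usage
#above is Theano's built-in elementwise binary cross-entropy; note the
#commented-out softplus lambda instead expects raw logits. The 0.9/0.1
#targets implement one-sided label smoothing for the discriminator. This is
#an assumption about the missing definition, not the original code.
bce = T.nnet.binary_crossentropy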
def define_network(x, params, config):
    num_hidden = config['num_hidden']
    mb_size = config['mb_size']
    num_latent = config['num_latent']

    enc = encoder(x, params, config)

    mean_layer = DenseLayer((mb_size, num_hidden), num_units=num_latent, nonlinearity=None, W=params['z_mean_W'], b=params['z_mean_b'])
    std_layer = DenseLayer((mb_size, num_hidden), num_units=num_latent, nonlinearity=None, W=params['z_std_W'], b=params['z_std_b'])

    mean = mean_layer.get_output_for(enc['h'])
    std = T.exp(std_layer.get_output_for(enc['h']))

    srng = theano.tensor.shared_randomstreams.RandomStreams(420)

    z_sampled = srng.normal(size=mean.shape, avg=0.0, std=1.0)
    z_extra = 0.0 * srng.normal(size=mean.shape, avg=0.0, std=1.0)
    z_reconstruction = mean + (0.0 + std * 0.0) * srng.normal(size=mean.shape, avg=0.0, std=1.0)

    #z_var = std**2
    z_loss = 0.0 * 0.5 * T.sum(T.clip(mean**2, 4.0, 999999.9) + std**2 - T.log(std**2) - 1.0)

    dec_reconstruction = decoder(z_reconstruction, z_extra, params, config)
    dec_sampled = decoder(z_sampled, z_extra, params, config)

    interp_lst = []
    for j in range(0, 128):
        interp_lst.append(z_reconstruction[0] * (j / 128.0) + z_reconstruction[-1] * (1 - j / 128.0))

    z_interp = T.concatenate([interp_lst], axis=1)

    dec_interp = decoder(z_interp, z_extra, params, config)

    results_map = {'reconstruction': dec_reconstruction['h'], 'z_loss': z_loss, 'sample': dec_sampled['h'], 'interp': dec_interp['h'], 'z': z_reconstruction}

    return results_map
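#For reference, `z_reconstruction` and `z_loss` above are a VAE
#reparameterization step and KL penalty with their coefficients zeroed out
#(the 0.0 multipliers disable both). A sketch of the standard, non-disabled
#forms implied by the code's own commented formula:
#
#    eps = srng.normal(size=mean.shape, avg=0.0, std=1.0)
#    z_reconstruction = mean + std * eps                                # reparameterization trick
#    z_loss = 0.5 * T.sum(mean ** 2 + std ** 2 - T.log(std ** 2) - 1.0) # KL(q(z|x) || N(0, I))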
def __init__(self, number_words, num_hidden, seq_length, mb_size):
    self.mb_size = mb_size

    x = T.imatrix()
    target = T.ivector()

    word_embeddings = theano.shared(np.random.normal(size=(number_words, 1, num_hidden)).astype('float32'))

    feature_lst = []
    for i in range(0, seq_length):
        feature = word_embeddings[x[:, i]]
        feature_lst.append(feature)

    #example x sequence_position x feature
    features = T.concatenate(feature_lst, 1)

    #inp = InputLayer(shape=(seq_length, mb_size, num_hidden), input_var=features)

    l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.tanh)
    l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.tanh)

    #minibatch x sequence x feature
    final_out = T.mean(l_lstm_2.get_output_for([l_lstm_1.get_output_for([features])]), axis=1)
    #final_out = T.mean(features, axis=1)

    h_out = DenseLayer((mb_size, num_hidden), num_units=1, nonlinearity=None)

    h_out_value = h_out.get_output_for(final_out)
    classification = T.nnet.sigmoid(h_out_value)

    self.loss = T.mean(T.nnet.binary_crossentropy(output=classification.flatten(), target=target))

    self.params = lasagne.layers.get_all_params(h_out, trainable=True) + [word_embeddings] + lasagne.layers.get_all_params(l_lstm_1, trainable=True) + lasagne.layers.get_all_params(l_lstm_2, trainable=True)

    updates = lasagne.updates.adam(self.loss, self.params)

    self.train_func = theano.function(inputs=[x, target], outputs={'l': self.loss, 'c': classification}, updates=updates)
    self.evaluate_func = theano.function(inputs=[x], outputs={'c': classification})
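#Hypothetical usage of the classifier above. `SentenceClassifier` stands in
#for the enclosing class, whose name is not shown in the snippet; shapes
#follow the constructor arguments.
#
#    model = SentenceClassifier(number_words=30000, num_hidden=400, seq_length=30, mb_size=64)
#    x_batch = np.random.randint(0, 30000, size=(64, 30)).astype('int32')  # word indices
#    y_batch = np.random.randint(0, 2, size=(64,)).astype('int32')         # binary labels
#    res = model.train_func(x_batch, y_batch)   # res['l'] is the loss, res['c'] the sigmoid outputs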
def decoder(z, params, config):
    mb_size = config['mb_size']
    num_latent = config['num_latent']
    num_hidden = config['num_hidden']

    h_out_1 = HiddenLayer(num_in=num_latent, num_out=num_hidden, W=params['W_dec_1'], b=params['b_dec_1'], activation='relu', batch_norm=True)
    h_out_2 = HiddenLayer(num_in=num_hidden, num_out=num_hidden, W=params['W_dec_2'], b=params['b_dec_2'], activation='relu', batch_norm=True)
    h_out_3 = DenseLayer((mb_size, num_hidden), num_units=4000, nonlinearity=None, W=params['W_dec_3'], b=params['b_dec_3'])

    h_out_1_value = h_out_1.output(z)
    h_out_2_value = h_out_2.output(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    return {'h': h_out_3_value}
def define_network(x, params, config):
    num_hidden = config['num_hidden']
    mb_size = config['mb_size']
    num_latent = config['num_latent']

    enc = encoder(x, params, config)

    mean_layer = DenseLayer((mb_size, num_hidden), num_units=num_latent, nonlinearity=None, W=params['z_mean_W'], b=params['z_mean_b'])
    #std_layer = DenseLayer((mb_size, num_hidden), num_units=num_latent, nonlinearity=None, W=params['z_std_W'], b=params['z_std_b'])

    mean = mean_layer.get_output_for(enc['h'])
    #std = T.exp(std_layer.get_output_for(enc['h']))

    srng = theano.tensor.shared_randomstreams.RandomStreams(420)

    z_sampled = srng.normal(size=mean.shape, avg=0.0, std=1.0)
    z_extra = 0.0 * srng.normal(size=mean.shape, avg=0.0, std=1.0)
    z_reconstruction = mean

    #z_var = std**2
    z_loss = 0.0 * T.sum(mean)  #0.001 * 0.5 * T.sum(mean**2 + z_var - T.log(z_var) - 1.0)

    dec_reconstruction = decoder(z_reconstruction, z_extra, params, config)
    dec_sampled = decoder(z_sampled, z_extra, params, config)

    interp_lst = []
    for j in range(0, 128):
        interp_lst.append(z_reconstruction[0] * (j / 128.0) + z_reconstruction[-1] * (1 - j / 128.0))

    z_interp = T.concatenate([interp_lst], axis=1)

    dec_interp = decoder(z_interp, z_extra, params, config)

    results_map = {'reconstruction': dec_reconstruction['h'], 'z_loss': z_loss, 'sample': dec_sampled['h'], 'interp': dec_interp['h'], 'z': z_reconstruction}

    return results_map
def __init__(self, num_hidden, num_features, mb_size, hidden_state_features, target):
    self.mb_size = mb_size
    #self.seq_length = seq_length

    #using 0.8
    hidden_state_features = dropout(hidden_state_features, 1.0)

    gru_params_1 = init_tparams(param_init_gru(None, {}, prefix="gru1", dim=num_hidden, nin=num_features))
    gru_params_2 = init_tparams(param_init_gru(None, {}, prefix="gru2", dim=num_hidden, nin=num_hidden + num_features))

    gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix='gru1', gradient_steps=100)[0]
    gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis=2), None, prefix='gru2', backwards=True, gradient_steps=100)[0]

    self.gru_1_out = gru_1_out

    final_out_recc = T.mean(gru_2_out, axis=0)

    h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units=1, nonlinearity=None)

    h_out_1_value = dropout(h_out_1.get_output_for(final_out_recc), 1.0)
    h_out_2_value = dropout(h_out_2.get_output_for(h_out_1_value), 1.0)
    h_out_4_value = h_out_4.get_output_for(h_out_2_value)

    raw_y = T.clip(h_out_4_value, -10.0, 10.0)
    classification = T.nnet.sigmoid(raw_y)

    self.accuracy = T.mean(T.eq(target, T.gt(classification, 0.5).flatten()))

    p_real = classification[0:mb_size]
    p_gen = classification[mb_size:mb_size * 2]

    self.d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
    self.d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
    self.g_cost_real = bce(p_real, T.zeros(p_real.shape)).mean()
    self.g_cost_gen = bce(p_gen, T.ones(p_gen.shape)).mean()

    #self.g_cost = self.g_cost_gen
    self.g_cost = self.g_cost_real + self.g_cost_gen
    print "pulling both TF and PF together"

    self.d_cost = self.d_cost_real + self.d_cost_gen

    #If the discriminator is already nearly perfect (high accuracy, confident
    #on both real and generated samples), zero its cost so only the generator
    #trains and can catch up.
    self.d_cost = T.switch(T.gt(self.accuracy, 0.95) * T.gt(p_real.mean(), 0.99) * T.lt(p_gen.mean(), 0.01), 0.0, self.d_cost)

    '''
    gX = gen(Z, *gen_params)
    p_real = discrim(X, *discrim_params)
    p_gen = discrim(gX, *discrim_params)
    d_cost_real = bce(p_real, T.ones(p_real.shape)).mean()
    d_cost_gen = bce(p_gen, T.zeros(p_gen.shape)).mean()
    g_cost_d = bce(p_gen, T.ones(p_gen.shape)).mean()
    d_cost = d_cost_real + d_cost_gen
    g_cost = g_cost_d
    cost = [g_cost, d_cost, g_cost_d, d_cost_real, d_cost_gen]
    d_updates = d_updater(discrim_params, d_cost)
    g_updates = g_updater(gen_params, g_cost)
    '''

    self.classification = classification

    self.params = []
    self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    #self.params += h_out_1.getParams() + h_out_2.getParams() + h_out_3.getParams()
    #self.params += lasagne.layers.get_all_params(h_initial_1, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_initial_2, trainable=True)

    self.params += gru_params_1.values()
    self.params += gru_params_2.values()

    '''
    layerParams = c1.getParams()
    for paramKey in layerParams:
        self.params += [layerParams[paramKey]]
    layerParams = c2.getParams()
    for paramKey in layerParams:
        self.params += [layerParams[paramKey]]
    layerParams = c3.getParams()
    for paramKey in layerParams:
        self.params += [layerParams[paramKey]]
    '''

    #all_grads = T.grad(self.loss, self.params)
    #for j in range(0, len(all_grads)):
    #    all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])
    #self.updates = lasagne.updates.adam(all_grads, self.params, learning_rate=0.0001, beta1=0.5)
class Head(Layer):
    r"""
    The base class :class:`Head` represents a generic head for the
    Neural Turing Machine. The heads are responsible for the read/write
    operations on the memory. An instance of :class:`Head` outputs a
    weight vector defined by

    .. math ::
        \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\
        k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\
        \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\
        g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\
        s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\
        \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma})

    .. math ::
        w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\
        w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\
        \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\
        w_{t} &\propto \tilde{w}_{t}^{\gamma_{t}}

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, e.g. ``num_shifts=3`` represents shifts in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory.
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the weights for the parameter
        :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}`
        is ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape
        ``(controller.num_units, memory_shape[1])``.
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
        If callable, initializer of the biases for the parameter
        :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix
        with shape ``(memory_shape[1],)``.
    nonlinearity_sign: callable or ``None``
        The nonlinearity that is applied for parameter :math:`\alpha_{t}`.
        If ``None``, the nonlinearity is ``identity``.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`.
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
""" def __init__(self, controller, num_shifts=3, memory_shape=(128, 20), W_hid_to_sign=None, b_hid_to_sign=lasagne.init.Constant(0.), nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.), W_hid_to_key=lasagne.init.GlorotUniform(), b_hid_to_key=lasagne.init.Constant(0.), nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.), W_hid_to_beta=lasagne.init.GlorotUniform(), b_hid_to_beta=lasagne.init.Constant(0.), nonlinearity_beta=lasagne.nonlinearities.rectify, W_hid_to_gate=lasagne.init.GlorotUniform(), b_hid_to_gate=lasagne.init.Constant(0.), nonlinearity_gate=nonlinearities.hard_sigmoid, W_hid_to_shift=lasagne.init.GlorotUniform(), b_hid_to_shift=lasagne.init.Constant(0.), nonlinearity_shift=lasagne.nonlinearities.softmax, W_hid_to_gamma=lasagne.init.GlorotUniform(), b_hid_to_gamma=lasagne.init.Constant(0.), nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x), weights_init=init.OneHot(), learn_init=False, **kwargs): super(Head, self).__init__(controller, **kwargs) self.memory_shape = memory_shape self.basename = kwargs.get('name', 'head') self.learn_init = learn_init if W_hid_to_sign is not None: self.sign = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign, name=self.basename + '.sign') self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b else: self.sign = None self.W_hid_to_sign, self.b_hid_to_sign = None, None self.key = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key, name=self.basename + '.key') self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b self.beta = DenseLayer(controller, num_units=1, W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta, name=self.basename + '.beta') self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b self.gate = DenseLayer(controller, num_units=1, W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate, name=self.basename + '.gate') self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b self.num_shifts = num_shifts self.shift = DenseLayer(controller, num_units=num_shifts, W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift, name=self.basename + '.shift') self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b self.gamma = DenseLayer(controller, num_units=1, W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma, name=self.basename + '.gamma') self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b self.weights_init = self.add_param( weights_init, (1, self.memory_shape[0]), name='weights_init', trainable=learn_init, regularizable=False) def get_output_for(self, h_t, w_tm1, M_t, **kwargs): if self.sign is not None: sign_t = self.sign.get_output_for(h_t, **kwargs) else: sign_t = 1. k_t = self.key.get_output_for(h_t, **kwargs) beta_t = self.beta.get_output_for(h_t, **kwargs) g_t = self.gate.get_output_for(h_t, **kwargs) s_t = self.shift.get_output_for(h_t, **kwargs) gamma_t = self.gamma.get_output_for(h_t, **kwargs) # Content Adressing (3.3.1) beta_t = T.addbroadcast(beta_t, 1) betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t) w_c = lasagne.nonlinearities.softmax(betaK) # Interpolation (3.3.2) g_t = T.addbroadcast(g_t, 1) w_g = g_t * w_c + (1. 
- g_t) * w_tm1 # Convolutional Shift (3.3.2) w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1) conv_filter = s_t.dimshuffle(0, 'x', 'x', 1) pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2) w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3) convolution = T.nnet.conv2d(w_g_padded, conv_filter, input_shape=(self.input_shape[0], 1, 1, self.memory_shape[0] + pad[0] + pad[1]), filter_shape=(self.input_shape[0], 1, 1, self.num_shifts), subsample=(1, 1), border_mode='valid') w_tilde = convolution[:, 0, 0, :] # Sharpening (3.3.2) gamma_t = T.addbroadcast(gamma_t, 1) w = T.pow(w_tilde + 1e-6, gamma_t) w /= T.sum(w) return w def get_params(self, **tags): params = super(Head, self).get_params(**tags) if self.sign is not None: params += self.sign.get_params(**tags) params += self.key.get_params(**tags) params += self.beta.get_params(**tags) params += self.gate.get_params(**tags) params += self.shift.get_params(**tags) params += self.gamma.get_params(**tags) return params
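#The addressing chain in get_output_for follows the docstring equations. A
#plain NumPy sketch of the same four steps for a single example, to make them
#easier to check by hand (function name hypothetical; note the class realizes
#the shift with a zero-padded conv2d, while this sketch uses the NTM paper's
#circular convolution).
import numpy as np

def ntm_address(k, beta, g, s, gamma, w_prev, M):
    # Content addressing (3.3.1): cosine similarity of the key to each memory row.
    sim = M.dot(k) / (np.linalg.norm(M, axis=1) * np.linalg.norm(k) + 1e-6)
    e = np.exp(beta * sim)
    w_c = e / e.sum()
    # Interpolation (3.3.2): blend with the previous weighting via the gate.
    w_g = g * w_c + (1.0 - g) * w_prev
    # Convolutional shift (3.3.2): circular convolution with the shift kernel s.
    n, r = len(w_g), len(s) // 2
    w_tilde = np.array([sum(s[j + r] * w_g[(i - j) % n] for j in range(-r, r + 1))
                        for i in range(n)])
    # Sharpening (3.3.2): raise to gamma >= 1 and renormalize.
    w = (w_tilde + 1e-6) ** gamma
    return w / w.sum()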
def __init__(self, number_words, num_hidden, seq_length, mb_size):
    self.mb_size = mb_size

    x = T.imatrix()

    #sequence x minibatch x index
    one_hot_input = T.ftensor3()

    use_one_hot_input_flag = T.scalar()

    self.indices = x
    self.use_one_hot_input_flag = use_one_hot_input_flag
    self.one_hot_input = one_hot_input

    '''
    flag for input: one-hot or index.
    If index, compute one-hot and use that.
    If one-hot, just use the one-hot input directly.
    '''

    #Time seq x examples x words
    target = T.ivector()

    #word_embeddings = theano.shared(np.random.normal(size=(number_words, 1, num_hidden)).astype('float32'))
    word_embeddings = theano.shared(np.random.normal(size=(number_words, num_hidden)).astype('float32'))

    feature_lst = []
    for i in range(0, seq_length):
        #feature = word_embeddings[x[:, i]]
        #instead of this, multiply by one-hot matrix
        one_hot = T.extra_ops.to_one_hot(x[:, i], number_words)
        #one_hot: 128 x 30k; word_embeddings: 30k x 400 -> feature: 128 x 400
        one_hot_use = ifelse(use_one_hot_input_flag, one_hot_input[i], T.extra_ops.to_one_hot(x[:, i], number_words))
        feature = T.reshape(T.dot(one_hot_use, word_embeddings), (1, mb_size, num_hidden)).transpose(1, 0, 2)
        feature_lst.append(feature)

    #example x sequence_position x feature
    features = T.concatenate(feature_lst, 1)

    l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0)
    l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0, backwards=True)
    l_lstm_3 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0)

    lstm_1_out = l_lstm_1.get_output_for([features])
    lstm_2_out = l_lstm_2.get_output_for([T.concatenate([lstm_1_out, features], axis=2)])
    lstm_3_out = l_lstm_3.get_output_for([T.concatenate([lstm_2_out, features], axis=2)])

    final_out = T.mean(lstm_3_out, axis=1)
    #final_out = T.mean(features, axis=1)

    h_out_1 = DenseLayer((mb_size, num_hidden), num_units=2048, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_2 = DenseLayer((mb_size, 2048), num_units=2048, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_3 = DenseLayer((mb_size, 2048), num_units=1, nonlinearity=None)

    h_out_1_value = h_out_1.get_output_for(final_out)
    h_out_2_value = h_out_2.get_output_for(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)
    classification = T.nnet.sigmoid(h_out_3_value)

    self.loss = T.mean(T.nnet.binary_crossentropy(output=classification.flatten(), target=target))

    self.params = lasagne.layers.get_all_params(h_out_1, trainable=True) + lasagne.layers.get_all_params(h_out_3, trainable=True) + [word_embeddings] + lasagne.layers.get_all_params(l_lstm_1, trainable=True) + lasagne.layers.get_all_params(l_lstm_2, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    self.params += lasagne.layers.get_all_params(l_lstm_3, trainable=True)

    #Replace any NaN gradients with zeros, then rescale to a total norm of 5.0.
    all_grads = T.grad(self.loss, self.params)
    for j in range(0, len(all_grads)):
        all_grads[j] = T.switch(T.isnan(all_grads[j]), T.zeros_like(all_grads[j]), all_grads[j])

    scaled_grads = lasagne.updates.total_norm_constraint(all_grads, 5.0)
    updates = lasagne.updates.adam(scaled_grads, self.params)

    self.train_func = theano.function(inputs=[x, target, use_one_hot_input_flag, one_hot_input], outputs={'l': self.loss, 'c': classification, 'g_w': T.sum(T.sqr(T.grad(self.loss, word_embeddings)))}, updates=updates)
    self.evaluate_func = theano.function(inputs=[x, use_one_hot_input_flag, one_hot_input], outputs={'c': classification})
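#The one-hot matrix multiply above is equivalent to an embedding row lookup;
#a small NumPy check of that identity (illustrative values only, shapes
#shrunk for the example).
import numpy as np
W = np.random.normal(size=(1000, 16)).astype('float32')   # stands in for word_embeddings
idx = np.array([5, 17, 42])
one_hot = np.eye(1000, dtype='float32')[idx]              # like T.extra_ops.to_one_hot
assert np.allclose(one_hot.dot(W), W[idx])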