def __init__(self, num_hidden, num_features, seq_length, mb_size, tf_states, rf_states):
    tf_states = T.specify_shape(tf_states, (seq_length, mb_size, num_features))
    rf_states = T.specify_shape(rf_states, (seq_length, mb_size, num_features))

    hidden_state_features = T.specify_shape(
        T.concatenate([tf_states, rf_states], axis=1),
        (seq_length, mb_size * 2, num_features))

    gru_params_1 = init_tparams(param_init_gru(None, {}, prefix="gru1", dim=num_hidden, nin=num_features))
    #gru_params_2 = init_tparams(param_init_gru(None, {}, prefix="gru2", dim=num_hidden, nin=num_hidden + num_features))
    #gru_params_3 = init_tparams(param_init_gru(None, {}, prefix="gru3", dim=num_hidden, nin=num_hidden + num_features))

    gru_1_out = gru_layer(gru_params_1, hidden_state_features, None, prefix='gru1')[0]
    #gru_2_out = gru_layer(gru_params_2, T.concatenate([gru_1_out, hidden_state_features], axis=2), None, prefix='gru2', backwards=True)[0]
    #gru_3_out = gru_layer(gru_params_3, T.concatenate([gru_2_out, hidden_state_features], axis=2), None, prefix='gru3')[0]

    final_out_recc = T.specify_shape(T.mean(gru_1_out, axis=0), (mb_size * 2, num_hidden))

    h_out_1 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    #h_out_2 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    #h_out_3 = DenseLayer((mb_size * 2, num_hidden), num_units=num_hidden, nonlinearity=lasagne.nonlinearities.rectify)
    h_out_4 = DenseLayer((mb_size * 2, num_hidden), num_units=1, nonlinearity=None)

    h_out_1_value = h_out_1.get_output_for(final_out_recc)
    h_out_4_value = h_out_4.get_output_for(h_out_1_value)
    raw_y = h_out_4_value
    #raw_y = T.clip(h_out_4_value, -10.0, 10.0)
    classification = T.nnet.sigmoid(raw_y)

    # tf comes before rf.
    p_real = classification[:mb_size]
    p_gen = classification[mb_size:]

    #bce = lambda r, t: t * T.nnet.softplus(-r) + (1 - t) * (r + T.nnet.softplus(-r))
    self.d_cost_real = bce(p_real, 0.9 * T.ones(p_real.shape)).mean()
    self.d_cost_gen = bce(p_gen, 0.1 + T.zeros(p_gen.shape)).mean()
    self.g_cost_d = bce(p_gen, 0.9 * T.ones(p_gen.shape)).mean()
    self.d_cost = self.d_cost_real + self.d_cost_gen
    self.g_cost = self.g_cost_d

    self.classification = classification

    self.params = []
    self.params += lasagne.layers.get_all_params(h_out_4, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_out_3, trainable=True)
    #self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    self.params += lasagne.layers.get_all_params(h_out_1, trainable=True)
    self.params += gru_params_1.values()
    #self.params += gru_params_2.values()
    #self.params += gru_params_3.values()

    self.accuracy = T.mean(T.eq(T.ones(p_real.shape).flatten(), T.gt(p_real, 0.5).flatten())) + \
                    T.mean(T.eq(T.ones(p_gen.shape).flatten(), T.lt(p_gen, 0.5).flatten()))
def discriminator(x, z, params, mb_size, num_hidden, num_latent):
    x_z = T.concatenate([x, z], axis=1)

    h_out_1 = DenseLayer((mb_size, num_hidden + num_latent), num_units=num_hidden, nonlinearity=None, W=params['W_disc_1'])
    h_out_2 = DenseLayer((mb_size, num_hidden), num_units=num_hidden, nonlinearity=None, W=params['W_disc_2'])
    h_out_3 = DenseLayer((mb_size, num_hidden), num_units=num_hidden, nonlinearity=None, W=params['W_disc_3'])
    h_out_4 = DenseLayer((mb_size, 1), num_units=1, nonlinearity=None, W=params['W_disc_4'], b=params['b_disc_4'])

    h_out_1_value = h_out_1.get_output_for(x_z)
    h_out_1_value = T.maximum(0.0, (h_out_1_value - T.mean(h_out_1_value, axis=0)) / (1.0 + T.std(h_out_1_value, axis=0)) + params['b_disc_1'])

    h_out_2_value = h_out_2.get_output_for(h_out_1_value)
    h_out_2_value = T.maximum(0.0, (h_out_2_value - T.mean(h_out_2_value, axis=0)) / (1.0 + T.std(h_out_2_value, axis=0)) + params['b_disc_2'])

    h_out_3_value = h_out_3.get_output_for(h_out_2_value)
    h_out_3_value = T.maximum(0.0, (h_out_3_value - T.mean(h_out_3_value, axis=0)) / (1.0 + T.std(h_out_3_value, axis=0)) + params['b_disc_3'])

    h_out_4_value = h_out_4.get_output_for(h_out_3_value)

    raw_y = h_out_4_value
    classification = T.nnet.sigmoid(raw_y)

    results = {'c': classification}
    return results
def __init__(self, n_inputs, n_outputs, regression, multiclass=False, depth=5,
             n_estimators=20, n_hidden=128, learning_rate=0.01, num_epochs=500,
             pi_iters=20, sgd_iters=10, batch_size=1000, momentum=0.0,
             dropout=0.0, loss=None, update=adagrad):
    """
    Parameters
    ----------
    n_inputs : number of input features
    n_outputs : number of classes to predict (1 for regression);
        for 2-class classification n_outputs should be 2, not 1
    regression : True for regression, False for classification
    multiclass : not used
    depth : depth of each tree in the ensemble
    n_estimators : number of trees in the ensemble
    n_hidden : number of neurons in the hidden layer
    pi_iters : number of iterations for the iterative algorithm that updates pi
    sgd_iters : number of full iterations of sgd between two consecutive updates of pi
    loss : theano loss function. If None, squared error will be used for regression
        and cross entropy will be used for classification
    update : theano update function
    """
    self._depth = depth
    self._n_estimators = n_estimators
    self._n_hidden = n_hidden
    self._n_outputs = n_outputs
    self._loss = loss
    self._regression = regression
    self._multiclass = multiclass
    self._learning_rate = learning_rate
    self._num_epochs = num_epochs
    self._pi_iters = pi_iters
    self._sgd_iters = sgd_iters
    self._batch_size = batch_size
    self._momentum = momentum
    self._update = update

    self.t_input = T.matrix('input')
    self.t_label = T.matrix('output')

    self._cached_trainable_params = None
    self._cached_params = None

    self._n_net_out = n_estimators * ((1 << depth) - 1)

    self.l_input = InputLayer((None, n_inputs))
    self.l_dense1 = DenseLayer(self.l_input, self._n_hidden, nonlinearity=rectify)
    if dropout != 0:
        self.l_dense1 = DropoutLayer(self.l_dense1, p=dropout)
    if not __DEBUG_NO_FOREST__:
        self.l_dense2 = DenseLayer(self.l_dense1, self._n_net_out, nonlinearity=sigmoid)
        self.l_forest = NeuralForestLayer(self.l_dense2, self._depth, self._n_estimators,
                                          self._n_outputs, self._pi_iters)
    else:
        self.l_forest = DenseLayer(self.l_dense1, self._n_outputs, nonlinearity=softmax)
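# Worked example (illustrative, not part of the class above): size of the sigmoid
# "decision node" layer for the default hyperparameters. Each of the n_estimators
# trees of depth `depth` has 2**depth - 1 split nodes, and l_dense2 outputs one
# routing probability per split node.
depth, n_estimators = 5, 20
n_net_out = n_estimators * ((1 << depth) - 1)
print(n_net_out)  # 20 * 31 = 620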
def decoder(z, params, config):
    mb_size = config['mb_size']
    num_latent = config['num_latent']
    num_hidden = config['num_hidden']

    h_out_1 = HiddenLayer(num_in=num_latent, num_out=num_hidden, W=params['W_dec_1'], b=params['b_dec_1'],
                          activation='relu', batch_norm=True)
    h_out_2 = HiddenLayer(num_in=num_hidden, num_out=num_hidden, W=params['W_dec_2'], b=params['b_dec_2'],
                          activation='relu', batch_norm=True)
    h_out_3 = DenseLayer((mb_size, num_hidden), num_units=4000, nonlinearity=None,
                         W=params['W_dec_3'], b=params['b_dec_3'])

    h_out_1_value = h_out_1.output(z)
    h_out_2_value = h_out_2.output(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    return {'h': h_out_3_value}
def define_network(x, params, config):
    num_hidden = config['num_hidden']
    mb_size = config['mb_size']
    num_latent = config['num_latent']

    enc = encoder(x, params, config)

    mean_layer = DenseLayer((mb_size, num_hidden), num_units=num_latent, nonlinearity=None,
                            W=params['z_mean_W'], b=params['z_mean_b'])
    #std_layer = DenseLayer((mb_size, num_hidden), num_units=num_latent, nonlinearity=None,
    #                       W=params['z_std_W'], b=params['z_std_b'])

    mean = mean_layer.get_output_for(enc['h'])
    #std = T.exp(std_layer.get_output_for(enc['h']))

    import random as rng
    srng = theano.tensor.shared_randomstreams.RandomStreams(420)

    z_sampled = srng.normal(size=mean.shape, avg=0.0, std=1.0)
    z_extra = 0.0 * srng.normal(size=mean.shape, avg=0.0, std=1.0)

    z_reconstruction = mean

    #z_var = std**2
    z_loss = 0.0 * T.sum(mean)  # 0.001 * 0.5 * T.sum(mean**2 + z_var - T.log(z_var) - 1.0)

    dec_reconstruction = decoder(z_reconstruction, z_extra, params, config)
    dec_sampled = decoder(z_sampled, z_extra, params, config)

    interp_lst = []
    for j in range(0, 128):
        interp_lst.append(z_reconstruction[0] * (j / 128.0) + z_reconstruction[-1] * (1 - j / 128.0))

    z_interp = T.concatenate([interp_lst], axis=1)

    dec_interp = decoder(z_interp, z_extra, params, config)

    results_map = {'reconstruction': dec_reconstruction['h'],
                   'z_loss': z_loss,
                   'sample': dec_sampled['h'],
                   'interp': dec_interp['h'],
                   'z': z_reconstruction}

    return results_map
def test_get_all_params(self):
    from lasagne.layers import (InputLayer, DenseLayer, get_all_params)
    l1 = InputLayer((10, 20))
    l2 = DenseLayer(l1, 30)
    l3 = DenseLayer(l2, 40)

    assert get_all_params(l3) == l2.get_params() + l3.get_params()
    assert (get_all_params(l3, regularizable=False) ==
            (l2.get_params(regularizable=False) +
             l3.get_params(regularizable=False)))
    assert (get_all_params(l3, regularizable=True) ==
            (l2.get_params(regularizable=True) +
             l3.get_params(regularizable=True)))
def __init__(self, controller, num_shifts=3, memory_shape=(128, 20),
             W_hid_to_sign=None,
             b_hid_to_sign=lasagne.init.Constant(0.),
             nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.),
             W_hid_to_key=lasagne.init.GlorotUniform(),
             b_hid_to_key=lasagne.init.Constant(0.),
             nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.),
             W_hid_to_beta=lasagne.init.GlorotUniform(),
             b_hid_to_beta=lasagne.init.Constant(0.),
             nonlinearity_beta=lasagne.nonlinearities.rectify,
             W_hid_to_gate=lasagne.init.GlorotUniform(),
             b_hid_to_gate=lasagne.init.Constant(0.),
             nonlinearity_gate=nonlinearities.hard_sigmoid,
             W_hid_to_shift=lasagne.init.GlorotUniform(),
             b_hid_to_shift=lasagne.init.Constant(0.),
             nonlinearity_shift=lasagne.nonlinearities.softmax,
             W_hid_to_gamma=lasagne.init.GlorotUniform(),
             b_hid_to_gamma=lasagne.init.Constant(0.),
             nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x),
             weights_init=init.OneHot(),
             learn_init=False,
             **kwargs):
    super(Head, self).__init__(controller, **kwargs)

    self.memory_shape = memory_shape
    self.basename = kwargs.get('name', 'head')
    self.learn_init = learn_init

    if W_hid_to_sign is not None:
        self.sign = DenseLayer(controller, num_units=self.memory_shape[1],
                               W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign,
                               name=self.basename + '.sign')
        self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b
    else:
        self.sign = None
        self.W_hid_to_sign, self.b_hid_to_sign = None, None

    self.key = DenseLayer(controller, num_units=self.memory_shape[1],
                          W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key,
                          name=self.basename + '.key')
    self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b

    self.beta = DenseLayer(controller, num_units=1,
                           W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta,
                           name=self.basename + '.beta')
    self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b

    self.gate = DenseLayer(controller, num_units=1,
                           W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate,
                           name=self.basename + '.gate')
    self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b

    self.num_shifts = num_shifts
    self.shift = DenseLayer(controller, num_units=num_shifts,
                            W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift,
                            name=self.basename + '.shift')
    self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b

    self.gamma = DenseLayer(controller, num_units=1,
                            W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma,
                            name=self.basename + '.gamma')
    self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b

    self.weights_init = self.add_param(
        weights_init, (1, self.memory_shape[0]),
        name='weights_init',
        trainable=learn_init, regularizable=False)
class WriteHead(Head):
    r"""
    Write head.

    In addition to the weight vector, the write head also outputs an
    add vector :math:`a_{t}` and an erase vector :math:`e_{t}`, defined by

    .. math ::
        \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\
        a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a})\\
        e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e})

    Parameters
    ----------
    controller: a :class:`Controller` instance
        The controller of the Neural Turing Machine.
    num_shifts: int
        Number of shifts allowed by the convolutional shift operation
        (centered on 0, eg. ``num_shifts=3`` represents shifts in [-1, 0, 1]).
    memory_shape: tuple
        Shape of the NTM's memory.
    W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`.
    W_hid_to_key: callable, Numpy array or Theano shared variable
    b_hid_to_key: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_key: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`k_{t}`.
    W_hid_to_beta: callable, Numpy array or Theano shared variable
    b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_beta: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\beta_{t}`.
    W_hid_to_gate: callable, Numpy array or Theano shared variable
    b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gate: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`g_{t}`.
    W_hid_to_shift: callable, Numpy array or Theano shared variable
    b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_shift: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`s_{t}`.
    W_hid_to_gamma: callable, Numpy array or Theano shared variable
    b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_gamma: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\gamma_{t}`.
    W_hid_to_erase: callable, Numpy array or Theano shared variable
    b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_erase: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`e_{t}`.
    W_hid_to_add: callable, Numpy array or Theano shared variable
    b_hid_to_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`a_{t}`.
    W_hid_to_sign_add: callable, Numpy array, Theano shared variable, or ``None``
    b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None``
    nonlinearity_sign_add: callable or ``None``
        Weights, biases and nonlinearity for parameter :math:`\delta_{t}`.
    weights_init: callable, Numpy array or Theano shared variable
        Initializer for the initial weight vector (:math:`w_{0}`).
    learn_init: bool
        If ``True``, initial hidden values are learned.
""" def __init__(self, controller, num_shifts=3, memory_shape=(128, 20), W_hid_to_sign=None, b_hid_to_sign=lasagne.init.Constant(0.), nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.), W_hid_to_key=lasagne.init.GlorotUniform(), b_hid_to_key=lasagne.init.Constant(0.), nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.), W_hid_to_beta=lasagne.init.GlorotUniform(), b_hid_to_beta=lasagne.init.Constant(0.), nonlinearity_beta=lasagne.nonlinearities.rectify, W_hid_to_gate=lasagne.init.GlorotUniform(), b_hid_to_gate=lasagne.init.Constant(0.), nonlinearity_gate=nonlinearities.hard_sigmoid, W_hid_to_shift=lasagne.init.GlorotUniform(), b_hid_to_shift=lasagne.init.Constant(0.), nonlinearity_shift=lasagne.nonlinearities.softmax, W_hid_to_gamma=lasagne.init.GlorotUniform(), b_hid_to_gamma=lasagne.init.Constant(0.), nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x), W_hid_to_erase=lasagne.init.GlorotUniform(), b_hid_to_erase=lasagne.init.Constant(0.), nonlinearity_erase=nonlinearities.hard_sigmoid, W_hid_to_add=lasagne.init.GlorotUniform(), b_hid_to_add=lasagne.init.Constant(0.), nonlinearity_add=nonlinearities.ClippedLinear(low=0., high=1.), W_hid_to_sign_add=None, b_hid_to_sign_add=lasagne.init.Constant(0.), nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1., high=1.), weights_init=init.OneHot(), learn_init=False, **kwargs): super(WriteHead, self).__init__(controller, num_shifts=num_shifts, memory_shape=memory_shape, W_hid_to_sign=W_hid_to_sign, b_hid_to_sign=b_hid_to_sign, nonlinearity_sign=nonlinearity_sign, W_hid_to_key=W_hid_to_key, b_hid_to_key=b_hid_to_key, nonlinearity_key=nonlinearity_key, W_hid_to_beta=W_hid_to_beta, b_hid_to_beta=b_hid_to_beta, nonlinearity_beta=nonlinearity_beta, W_hid_to_gate=W_hid_to_gate, b_hid_to_gate=b_hid_to_gate, nonlinearity_gate=nonlinearity_gate, W_hid_to_shift=W_hid_to_shift, b_hid_to_shift=b_hid_to_shift, nonlinearity_shift=nonlinearity_shift, W_hid_to_gamma=W_hid_to_gamma, b_hid_to_gamma=b_hid_to_gamma, nonlinearity_gamma=nonlinearity_gamma, weights_init=weights_init, learn_init=learn_init, **kwargs) self.erase = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_erase, b=b_hid_to_erase, nonlinearity=nonlinearity_erase, name=self.basename + '.erase') self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b self.add = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_add, b=b_hid_to_add, nonlinearity=nonlinearity_add, name=self.basename + '.add') self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b if W_hid_to_sign_add is not None: self.sign_add = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_sign_add, b=b_hid_to_sign_add, nonlinearity=nonlinearity_sign_add, name=self.basename + '.sign_add') self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b else: self.sign_add = None self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None def get_params(self, **tags): params = super(WriteHead, self).get_params(**tags) params += self.erase.get_params(**tags) params += self.add.get_params(**tags) if self.sign_add is not None: params += self.sign_add.get_params(**tags) return params
def __init__(self, feature_shape, latent_size, hidden_structure,
             reconstruction_distribution=None,
             number_of_reconstruction_classes=None, use_count_sum=False):

    self.use_count_sum = use_count_sum and (reconstruction_distribution != "bernoulli")

    print("Setting up model.")
    print(" feature size: {}".format(feature_shape))
    print(" latent size: {}".format(latent_size))
    print(" hidden structure: {}".format(", ".join(map(str, hidden_structure))))
    if type(reconstruction_distribution) == str:
        print(" reconstruction distribution: " + reconstruction_distribution)
    else:
        print(" reconstruction distribution: custom")
    if number_of_reconstruction_classes > 0:
        print(" reconstruction classes: {}".format(number_of_reconstruction_classes),
              " (including 0s)")
    if self.use_count_sum:
        print(" using count sums")
    print("")

    # Setup
    super(VariationalAutoEncoderForCounts, self).__init__()

    self.feature_shape = feature_shape
    self.latent_size = latent_size
    self.hidden_structure = hidden_structure

    symbolic_x = T.matrix('x')  # counts
    symbolic_z = T.matrix('z')  # latent variable

    self.number_of_epochs_trained = 0
    symbolic_learning_rate = T.scalar("epsilon")
    self.learning_curves = {
        "training": {"LB": [], "ENRE": [], "KL": []},
        "validation": {"LB": [], "ENRE": [], "KL": []}
    }

    if reconstruction_distribution:
        if type(reconstruction_distribution) == str:
            if number_of_reconstruction_classes > 0:
                reconstruction_distribution = "softmax_" + reconstruction_distribution
                self.k_max = number_of_reconstruction_classes - 1
                reconstruction_distribution = \
                    reconstruction_distributions[reconstruction_distribution]
                reconstruction_distribution = reconstruction_distribution(self.k_max)
            else:
                reconstruction_distribution = \
                    reconstruction_distributions[reconstruction_distribution]
        self.x_parameters = reconstruction_distribution["parameters"]
        self.reconstruction_activation_functions = \
            reconstruction_distribution["activation functions"]
        self.expectedNegativeReconstructionError = reconstruction_distribution["function"]
        self.meanOfReconstructionDistribution = reconstruction_distribution["mean"]
        self.preprocess = reconstruction_distribution["preprocess"]
    else:
        reconstruction_distribution = "Gaussian (default)"
        # Use a Gaussian distribution as standard
        self.x_parameters = ["mu", "sigma"]
        self.reconstruction_activation_functions = {"mu": identity, "sigma": identity}
        self.expectedNegativeReconstructionError = lambda x, x_theta, eps=0.0: \
            log_normal(x, x_theta["mu"], x_theta["sigma"], eps)
        self.meanOfReconstructionDistribution = lambda x_theta: x_theta["mu"]
        self.preprocess = lambda x: x

    # if number_of_reconstruction_classes > 0:
    #
    #     self.x_parameters += ["p_k"]
    #     self.reconstruction_activation_functions["p_k"] = softmax
    #     log_distribution = self.expectedNegativeReconstructionError
    #     self.expectedNegativeReconstructionError = lambda x, x_theta, eps = 0.0: \
    #         log_cross_entropy_extended(x, x_theta,
    #             log_distribution, k_max = number_of_reconstruction_classes - 1,
    #             eps = 0.0)
    #     mean_of_distribution = self.meanOfReconstructionDistribution
    #     self.meanOfReconstructionDistribution = lambda x_theta: \
    #         meanOfCrossEntropyExtendedDistibution(x_theta,
    #             mean_of_distribution, k_max = number_of_reconstruction_classes - 1)
    #     self.k_max = number_of_reconstruction_classes - 1

    if self.use_count_sum:
        symbolic_n = T.matrix('n')  # sum of counts

    # Models

    ## Recognition model q(z|x)
    l_enc_in = InputLayer(shape=(None, feature_shape), name="ENC_INPUT")
    l_enc = l_enc_in
    for i, hidden_size in enumerate(hidden_structure):
        l_enc = DenseLayer(l_enc, num_units=hidden_size, nonlinearity=rectify,
                           name='ENC_DENSE{:d}'.format(i + 1))

    l_z_mu = DenseLayer(l_enc, num_units=latent_size, nonlinearity=None, name='ENC_Z_MU')
    l_z_log_var = DenseLayer(l_enc, num_units=latent_size,
                             nonlinearity=lambda x: T.clip(x, -10, 10), name='ENC_Z_LOG_VAR')

    # Sample a latent representation z \sim q(z|x) = N(mu(x), logvar(x))
    l_z = SimpleSampleLayer(mean=l_z_mu, log_var=l_z_log_var, name="ENC_SAMPLE")

    self.encoder = l_z

    ## Generative model p(x|z)
    l_dec_z_in = InputLayer(shape=(None, latent_size), name="DEC_INPUT")

    if self.use_count_sum:
        l_dec_n_in = InputLayer(shape=(None, 1), name="DEC_N_INPUT")
        l_dec = ConcatLayer([l_dec_z_in, l_dec_n_in], axis=1, name="DEC_MERGE_INPUT")
    else:
        l_dec = l_dec_z_in

    for i, hidden_size in enumerate(reversed(hidden_structure)):
        l_dec = DenseLayer(l_dec, num_units=hidden_size, nonlinearity=rectify,
                           name='DEC_DENSE{:d}'.format(len(hidden_structure) - i))

    l_x_theta = {}

    for p in self.x_parameters:
        p_name = 'DEC_X_' + p.upper()
        if self.reconstruction_activation_functions[p] == softmax:
            l_dense = DenseLayer(l_dec, num_units=feature_shape * (self.k_max + 1),
                                 nonlinearity=identity, name=p_name + "_DENSE")
            l_reshape = ReshapeLayer(l_dense, (-1, (self.k_max + 1)))
            l_softmax = DenseLayer(l_reshape, num_units=(self.k_max + 1),
                                   nonlinearity=softmax, name=p_name + "_SOFTMAX")
            l_x_theta[p] = ReshapeLayer(l_softmax, (-1, feature_shape, (self.k_max + 1)))
        else:
            l_x_theta[p] = DenseLayer(l_dec, num_units=feature_shape,
                                      nonlinearity=self.reconstruction_activation_functions[p],
                                      name=p_name)

    self.decoder = {p: l_x_theta[p] for p in self.x_parameters}

    ## Get outputs from models

    ## Training outputs
    z_train, z_mu_train, z_log_var_train = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x}, deterministic=False)
    inputs = {l_dec_z_in: z_train}
    if self.use_count_sum:
        inputs[l_dec_n_in] = symbolic_n
    x_theta_train = get_output([l_x_theta[p] for p in self.x_parameters],
                               inputs, deterministic=False)
    x_theta_train = {p: o for p, o in zip(self.x_parameters, x_theta_train)}

    ## Evaluation outputs
    z_eval, z_mu_eval, z_log_var_eval = get_output(
        [l_z, l_z_mu, l_z_log_var], {l_enc_in: symbolic_x}, deterministic=True)
    inputs = {l_dec_z_in: z_eval}
    if self.use_count_sum:
        inputs[l_dec_n_in] = symbolic_n
    x_theta_eval = get_output([l_x_theta[p] for p in self.x_parameters],
                              inputs, deterministic=True)
    x_theta_eval = {p: o for p, o in zip(self.x_parameters, x_theta_eval)}

    ## Sample outputs
    inputs = {l_dec_z_in: symbolic_z}
    if self.use_count_sum:
        inputs[l_dec_n_in] = symbolic_n
    x_theta_sample = get_output([l_x_theta[p] for p in self.x_parameters],
                                inputs, deterministic=True)
    x_theta_sample = {p: o for p, o in zip(self.x_parameters, x_theta_sample)}

    # Likelihood
    lower_bound_train, log_p_x_train, KL__train = \
        self.lowerBound(symbolic_x, x_theta_train, z_mu_train, z_log_var_train)
    lower_bound_eval, log_p_x_eval, KL__eval = \
        self.lowerBound(symbolic_x, x_theta_eval, z_mu_eval, z_log_var_eval)

    all_parameters = get_all_params(
        [l_z] + [l_x_theta[p] for p in self.x_parameters], trainable=True)

    print("Parameters to train:")
    for parameter in all_parameters:
        print(" {}: {}".format(parameter, parameter.get_value().shape))

    # Let Theano do its magic and get all the gradients we need for training
    all_gradients = T.grad(-lower_bound_train, all_parameters)

    # Set the update function for parameters. The Adam optimizer works really well with VAEs.
    update_expressions = updates.adam(all_gradients, all_parameters,
                                      learning_rate=symbolic_learning_rate)

    inputs = [symbolic_x]
    if self.use_count_sum:
        inputs.append(symbolic_n)
    inputs.append(symbolic_learning_rate)

    self.f_train = theano.function(
        inputs=inputs,
        outputs=[lower_bound_train, log_p_x_train, KL__train],
        updates=update_expressions)

    inputs = [symbolic_x]
    if self.use_count_sum:
        inputs.append(symbolic_n)

    self.f_eval = theano.function(
        inputs=inputs,
        outputs=[lower_bound_eval, log_p_x_eval, KL__eval])

    self.f_z = theano.function(inputs=[symbolic_x], outputs=[z_eval])

    inputs = [symbolic_z]
    if self.use_count_sum:
        inputs.append(symbolic_n)

    self.f_sample = theano.function(
        inputs=inputs,
        outputs=[x_theta_sample[p] for p in self.x_parameters])

    inputs = [symbolic_x]
    if self.use_count_sum:
        inputs.append(symbolic_n)

    self.f_recon = theano.function(
        inputs=inputs,
        outputs=[x_theta_eval[p] for p in self.x_parameters])
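# Illustrative driver for the compiled Theano functions above (a sketch only:
# assumes a model built without count sums, and `x_train`/`x_valid` count
# matrices of shape (n_examples, feature_shape); these names are hypothetical).
import numpy

learning_rate = 1e-3
n_epochs, n_batches = 10, 100
for epoch in range(n_epochs):
    for x_batch in numpy.array_split(x_train, n_batches):
        lb, enre, kl = model.f_train(x_batch, learning_rate)
    lb_val, enre_val, kl_val = model.f_eval(x_valid)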
    return net


# Load model weights and metadata
d = pickle.load(open('../input/pretrained/vgg19.pkl'))

# Build the network and fill with pretrained weights
net = build_model()

# Define loss function and metrics, and get an updates dictionary
X_sym = T.tensor4()
y_sym = T.ivector()

# We'll connect our output classifier to the last fully connected layer of the network
net['new_output'] = DenseLayer(net['drop7'], num_units=8, nonlinearity=softmax,
                               W=lasagne.init.Normal(0.01))

prediction = lasagne.layers.get_output(net['new_output'], X_sym)
loss = lasagne.objectives.categorical_crossentropy(prediction, y_sym)
loss = loss.mean()

acc = T.mean(T.eq(T.argmax(prediction, axis=1), y_sym), dtype=theano.config.floatX)

learning_rate = theano.shared(np.array(0.0003, dtype=theano.config.floatX))
learning_rate_decay = np.array(0.3, dtype=theano.config.floatX)
updates = OrderedDict()

for name, layer in net.items():
    layer_params = layer.get_params(trainable=True)
def classif(X, y):
    l = InputLayer(shape=(None, X.shape[1]))
    l = DenseLayer(l, num_units=len(np.unique(y)), nonlinearity=softmax)
    net = NeuralNet(l, update_learning_rate=0.01)
    net.fit(X, y)
    print(net.score(X, y))
def test_initialization_with_layer_instance_bad_params(self, NeuralNet):
    layer = DenseLayer(InputLayer(shape=(128, 13)), num_units=2)
    nn = NeuralNet(layers=layer, dense1_num_units=3)
    with pytest.raises(ValueError):
        nn.initialize_layers()
def lasagne_separate(M, P, FE, W1, W2, z1, z2, hh=.0001, ep=5000, d=0, wsp=.0001, plt=True):
    from paris.signal import bss_eval

    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.astype(float32))

    # Input is the learned dictionary set
    lW = hstack((W1.T, W2.T)).astype(float32)
    _lW = Th.matrix('_lW')
    fI = InputLayer(shape=lW.shape, input_var=_lW)

    # Split in two paths
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Compute source modulators
    R1 = DenseLayer(dfW1, num_units=M.shape[1], nonlinearity=lambda x: psoftplus(x, 3.), b=None)
    R2 = DenseLayer(dfW2, num_units=M.shape[1], nonlinearity=lambda x: psoftplus(x, 3.), b=None)

    # Bring to standard orientation
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
    cost = (_M*(Th.log(_M+eps) - Th.log(get_output(R)+eps)) - _M + get_output(R)).mean() \
        + wsp*(Th.mean(abs(R1.W)) + Th.mean(abs(R2.W)))

    # Train it using Lasagne
    opt = downhill.build('rprop', loss=cost, inputs=[_lW], params=get_all_params(R))
    train = downhill.Dataset(lW, batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, _lW, lW) + eps
    _r1 = nget(R1, _lW, lW)
    _r2 = nget(R2, _lW, lW)
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
def create_network(available_actions_count):
    # Create the input variables
    s1 = tensor.tensor4("State")
    a = tensor.vector("Action", dtype="int32")
    q2 = tensor.vector("Q2")
    r = tensor.vector("Reward")
    isterminal = tensor.vector("IsTerminal", dtype="int8")

    # Create the input layer of the network.
    dqn = InputLayer(shape=[None, 1, resolution[0], resolution[1]], input_var=s1)

    # Add 2 convolutional layers with ReLu activation
    dqn = Conv2DLayer(dqn, num_filters=8, filter_size=[6, 6],
                      nonlinearity=rectify, W=HeUniform("relu"),
                      b=Constant(.1), stride=3)
    dqn = Conv2DLayer(dqn, num_filters=8, filter_size=[3, 3],
                      nonlinearity=rectify, W=HeUniform("relu"),
                      b=Constant(.1), stride=2)

    # Add a single fully-connected layer.
    dqn = DenseLayer(dqn, num_units=128, nonlinearity=rectify, W=HeUniform("relu"),
                     b=Constant(.1))

    # Add the output layer (also fully-connected).
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_count, nonlinearity=None)

    # Define the loss function
    q = get_output(dqn)
    # target differs from q only for the selected action. The following means:
    # target_Q(s,a) = r + gamma * max Q(s2,_) if isterminal else r
    target_q = tensor.set_subtensor(q[tensor.arange(q.shape[0]), a],
                                    r + discount_factor * (1 - isterminal) * q2)
    loss = squared_error(q, target_q).mean()

    # Update the parameters according to the computed gradient using RMSProp.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compile the theano functions
    print("Compiling the network ...")
    function_learn = theano.function([s1, q2, a, r, isterminal], loss, updates=updates,
                                     name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1], tensor.argmax(q), name="test_fn")
    print("Network compiled.")

    def simple_get_best_action(state):
        return function_get_best_action(state.reshape([1, 1, resolution[0], resolution[1]]))

    # Returns Theano objects for the net and functions.
    return dqn, function_learn, function_get_q_values, simple_get_best_action
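# Numeric check of the TD target used above (plain NumPy, hypothetical values):
# for non-terminal transitions the target is r + gamma * max_a' Q(s2, a'),
# for terminal ones it collapses to the reward alone.
import numpy as np

r = np.array([1.0, 0.5])
q2 = np.array([5.0, 2.0])
isterminal = np.array([0, 1], dtype=np.int8)
discount_factor = 0.99

print(r + discount_factor * (1 - isterminal) * q2)  # [5.95  0.5 ]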
def _get_l_out(self, input_vars):
    check_options(self.options)
    id_tag = (self.id + '/') if self.id else ''

    prev_output_var, mask_var = input_vars[-2:]
    color_input_vars = input_vars[:-2]

    context_len = self.context_len if hasattr(self, 'context_len') else 1
    l_color_repr, color_inputs = self.color_vec.get_input_layer(
        color_input_vars,
        recurrent_length=self.seq_vec.max_len - 1,
        cell_size=self.options.speaker_cell_size,
        context_len=context_len,
        id=self.id
    )
    l_hidden_color = dimshuffle(l_color_repr, (0, 2, 1))
    for i in range(1, self.options.speaker_hidden_color_layers + 1):
        l_hidden_color = NINLayer(
            l_hidden_color, num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=id_tag + 'hidden_color%d' % i)
    l_hidden_color = dimshuffle(l_hidden_color, (0, 2, 1))

    l_prev_out = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                            input_var=prev_output_var,
                            name=id_tag + 'prev_input')
    l_prev_embed = EmbeddingLayer(l_prev_out,
                                  input_size=len(self.seq_vec.tokens),
                                  output_size=self.options.speaker_cell_size,
                                  name=id_tag + 'prev_embed')
    l_in = ConcatLayer([l_hidden_color, l_prev_embed], axis=2, name=id_tag + 'color_prev')
    l_mask_in = InputLayer(shape=(None, self.seq_vec.max_len - 1),
                           input_var=mask_var,
                           name=id_tag + 'mask_input')
    l_rec_drop = l_in

    cell = CELLS[self.options.speaker_cell]
    cell_kwargs = {
        'mask_input': (None if self.options.speaker_no_mask else l_mask_in),
        'grad_clipping': self.options.speaker_grad_clipping,
        'num_units': self.options.speaker_cell_size,
    }
    if self.options.speaker_cell == 'LSTM':
        cell_kwargs['forgetgate'] = Gate(b=Constant(self.options.speaker_forget_bias))
    if self.options.speaker_cell != 'GRU':
        cell_kwargs['nonlinearity'] = NONLINEARITIES[self.options.speaker_nonlinearity]

    for i in range(1, self.options.speaker_recurrent_layers):
        l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % i, **cell_kwargs)
        if self.options.speaker_dropout > 0.0:
            l_rec_drop = DropoutLayer(l_rec, p=self.options.speaker_dropout,
                                      name=id_tag + 'rec%d_drop' % i)
        else:
            l_rec_drop = l_rec
    l_rec = cell(l_rec_drop, name=id_tag + 'rec%d' % self.options.speaker_recurrent_layers,
                 **cell_kwargs)
    l_shape = ReshapeLayer(l_rec, (-1, self.options.speaker_cell_size), name=id_tag + 'reshape')
    l_hidden_out = l_shape
    for i in range(1, self.options.speaker_hidden_out_layers + 1):
        l_hidden_out = DenseLayer(
            l_hidden_out, num_units=self.options.speaker_cell_size,
            nonlinearity=NONLINEARITIES[self.options.speaker_nonlinearity],
            name=id_tag + 'hidden_out%d' % i)
    l_softmax = DenseLayer(l_hidden_out, num_units=len(self.seq_vec.tokens),
                           nonlinearity=softmax, name=id_tag + 'softmax')
    l_out = ReshapeLayer(l_softmax, (-1, self.seq_vec.max_len - 1, len(self.seq_vec.tokens)),
                         name=id_tag + 'out')

    return l_out, color_inputs + [l_prev_out, l_mask_in]
def _build_disc(self):
    inputs = OrderedDict()
    inputs['x'] = InputLayer((None, 4, 64, 64))
    inputs['c'] = InputLayer((None, 843))
    inputs['v'] = InputLayer((None, 4))
    inputs['t'] = InputLayer((None, 8))

    layer_c = inputs['c']
    layer_c = DenseLayer(layer_c, 512, nonlinearity=leaky_rectify)
    layer_c.params[layer_c.W].add('dense')
    layer_c = DenseLayer(layer_c, 512, nonlinearity=leaky_rectify)
    layer_c.params[layer_c.W].add('dense')

    layer_v = inputs['v']
    layer_v = DenseLayer(layer_v, 512, nonlinearity=leaky_rectify)
    layer_v.params[layer_v.W].add('dense')
    layer_v = DenseLayer(layer_v, 512, nonlinearity=leaky_rectify)
    layer_v.params[layer_v.W].add('dense')

    layer_t = inputs['t']
    layer_t = DenseLayer(layer_t, 512, nonlinearity=leaky_rectify)
    layer_t.params[layer_t.W].add('dense')
    layer_t = DenseLayer(layer_t, 512, nonlinearity=leaky_rectify)
    layer_t.params[layer_t.W].add('dense')

    layer_i = ConcatLayer([layer_c, layer_v, layer_t])
    layer_i = DenseLayer(layer_i, 1024, nonlinearity=leaky_rectify)
    layer_i.params[layer_i.W].add('dense')
    layer_i = DenseLayer(layer_i, 1024, nonlinearity=None)
    layer_i.params[layer_i.W].add('dense')

    layer_x = inputs['x']
    layer_x_n = layer_x
    layer_x = weight_norm(Conv2DLayer(layer_x_n, 64, 5, 2, 'same', nonlinearity=None, b=None))
    if self.reg:
        layer_x = dropout(layer_x)
    layer_x = NonlinearityLayer(layer_x, leaky_rectify)
    layer_x = weight_norm(Conv2DLayer(layer_x, 64, 5, 2, 'same', nonlinearity=None, b=None))
    if self.reg:
        layer_x = dropout(layer_x)
    layer_x = NonlinearityLayer(layer_x, leaky_rectify)
    layer_x = weight_norm(Conv2DLayer(layer_x, 128, 5, 2, 'same', nonlinearity=None, b=None))
    if self.reg:
        layer_x = dropout(layer_x)
    layer_x = NonlinearityLayer(layer_x, leaky_rectify)
    layer_x = weight_norm(Conv2DLayer(layer_x, 256, 5, 2, 'same', nonlinearity=None, b=None))
    layer_x = NonlinearityLayer(layer_x, leaky_rectify)
    layer_x = FlattenLayer(layer_x)
    layer_x = DenseLayer(layer_x, 1024, nonlinearity=leaky_rectify)
    layer_x.params[layer_x.W].add('dense')
    layer_x = DenseLayer(layer_x, 1024, nonlinearity=None)
    layer_x.params[layer_x.W].add('dense')

    layer = ElemwiseMergeLayer([layer_i, layer_x], T.mul)
    layer = ConcatLayer([layer, layer_x, layer_i])
    layer = DenseLayer(layer, 1024, nonlinearity=leaky_rectify)
    layer.params[layer.W].add('dense')

    layer_s = layer
    layer_s = DenseLayer(layer_s, 1, nonlinearity=None)
    layer_s.params[layer_s.W].add('dense')
    layer_s_0 = NonlinearityLayer(layer_s, nonlinearity=sigmoid)
    layer_s_1 = NonlinearityLayer(layer_s, nonlinearity=lambda x: x - T.log(1 + T.exp(x)))
    layer_s_2 = NonlinearityLayer(layer_s, nonlinearity=lambda x: -T.log(1 + T.exp(x)))

    outputs = OrderedDict()
    outputs['s'] = layer_s_0
    outputs['log(s)'] = layer_s_1
    outputs['log(1-s)'] = layer_s_2

    self.disc_inputs = inputs
    self.disc_outputs = outputs
def _build_gen(self):
    size = 64
    s, s2, s4, s8, s16 = size, size // 2, size // 4, size // 8, size // 16

    inputs = OrderedDict()
    inputs['c'] = InputLayer((None, 843))
    inputs['v'] = InputLayer((None, 4))
    inputs['t'] = InputLayer((None, 8))

    layer_c = inputs['c']
    layer_c = DenseLayer(layer_c, 512, nonlinearity=rectify)
    layer_c.params[layer_c.W].add('dense')
    layer_c = DenseLayer(layer_c, 512, nonlinearity=rectify)
    layer_c.params[layer_c.W].add('dense')

    layer_v = inputs['v']
    layer_v = DenseLayer(layer_v, 512, nonlinearity=rectify)
    layer_v.params[layer_v.W].add('dense')
    layer_v = DenseLayer(layer_v, 512, nonlinearity=rectify)
    layer_v.params[layer_v.W].add('dense')

    layer_t = inputs['t']
    layer_t = DenseLayer(layer_t, 512, nonlinearity=rectify)
    layer_t.params[layer_t.W].add('dense')
    layer_t = DenseLayer(layer_t, 512, nonlinearity=rectify)
    layer_t.params[layer_t.W].add('dense')

    layer = ConcatLayer([layer_c, layer_v, layer_t])
    layer = DenseLayer(layer, 1024, nonlinearity=rectify)
    layer.params[layer.W].add('dense')
    layer = DenseLayer(layer, 1024, nonlinearity=rectify)
    layer.params[layer.W].add('dense')
    layer = DenseLayer(layer, 768 * s16 * s16, nonlinearity=rectify)
    layer.params[layer.W].add('dense')
    layer = ReshapeLayer(layer, (-1, 768, s16, s16))
    layer = InstanceNormalization(layer, True)

    layer = weight_norm(
        TransposedConv2DLayer(layer, 384, 5, 2, 'same', output_size=(s8, s8),
                              nonlinearity=None, b=None),
        transposed=True)
    if self.reg:
        layer = dropout(layer)
    layer = NonlinearityLayer(layer, rectify)
    layer = weight_norm(
        TransposedConv2DLayer(layer, 256, 5, 2, 'same', output_size=(s4, s4),
                              nonlinearity=None, b=None),
        transposed=True)
    if self.reg:
        layer = dropout(layer)
    layer = NonlinearityLayer(layer, rectify)
    layer = weight_norm(
        TransposedConv2DLayer(layer, 192, 5, 2, 'same', output_size=(s2, s2),
                              nonlinearity=None, b=None),
        transposed=True)
    if self.reg:
        layer = dropout(layer)
    layer = NonlinearityLayer(layer, rectify)

    layer_img = TransposedConv2DLayer(layer, 3, 5, 2, 'same', output_size=(s, s), nonlinearity=tanh)
    layer_msk = TransposedConv2DLayer(layer, 1, 5, 2, 'same', output_size=(s, s), nonlinearity=sigmoid)
    layer = ConcatLayer([layer_img, layer_msk])

    outputs = OrderedDict()
    outputs['x'] = layer

    self.gen_inputs = inputs
    self.gen_outputs = outputs
def vgg16(input_var=None, image_size=256):
    from lasagne.layers import InputLayer
    from lasagne.layers import DenseLayer
    from lasagne.layers import NonlinearityLayer
    from lasagne.layers import DropoutLayer
    from lasagne.layers import Pool2DLayer as PoolLayer
    from lasagne.layers.dnn import Conv2DDNNLayer as ConvLayer
    from lasagne.nonlinearities import softmax

    net = {}
    net['input'] = InputLayer((None, 4, image_size, image_size), input_var=input_var)
    net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1, flip_filters=False)
    net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1, flip_filters=False)
    net['pool1'] = PoolLayer(net['conv1_2'], 2)
    net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1, flip_filters=False)
    net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1, flip_filters=False)
    net['pool2'] = PoolLayer(net['conv2_2'], 2)
    net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1, flip_filters=False)
    net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1, flip_filters=False)
    net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1, flip_filters=False)
    net['pool3'] = PoolLayer(net['conv3_3'], 2)
    net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1, flip_filters=False)
    net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1, flip_filters=False)
    net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1, flip_filters=False)
    net['pool4'] = PoolLayer(net['conv4_3'], 2)
    net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1, flip_filters=False)
    net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1, flip_filters=False)
    net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1, flip_filters=False)
    net['pool5'] = PoolLayer(net['conv5_3'], 2)
    net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
    net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
    net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
    net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
    net['fc8'] = DenseLayer(net['fc7_dropout'], num_units=1000,
                            nonlinearity=lasagne.nonlinearities.sigmoid)

    return net['fc8']
def build_vgg_model():
    net = {}
    net['input'] = InputLayer((None, 3, 224, 224))
    net['conv1_1'] = ConvLayer(net['input'], 64, 3, pad=1, flip_filters=False)
    net['conv1_2'] = ConvLayer(net['conv1_1'], 64, 3, pad=1, flip_filters=False)
    net['pool1'] = PoolLayer(net['conv1_2'], 2)
    net['conv2_1'] = ConvLayer(net['pool1'], 128, 3, pad=1, flip_filters=False)
    net['conv2_2'] = ConvLayer(net['conv2_1'], 128, 3, pad=1, flip_filters=False)
    net['pool2'] = PoolLayer(net['conv2_2'], 2)
    net['conv3_1'] = ConvLayer(net['pool2'], 256, 3, pad=1, flip_filters=False)
    net['conv3_2'] = ConvLayer(net['conv3_1'], 256, 3, pad=1, flip_filters=False)
    net['conv3_3'] = ConvLayer(net['conv3_2'], 256, 3, pad=1, flip_filters=False)
    net['conv3_4'] = ConvLayer(net['conv3_3'], 256, 3, pad=1, flip_filters=False)
    net['pool3'] = PoolLayer(net['conv3_4'], 2)
    net['conv4_1'] = ConvLayer(net['pool3'], 512, 3, pad=1, flip_filters=False)
    net['conv4_2'] = ConvLayer(net['conv4_1'], 512, 3, pad=1, flip_filters=False)
    net['conv4_3'] = ConvLayer(net['conv4_2'], 512, 3, pad=1, flip_filters=False)
    net['conv4_4'] = ConvLayer(net['conv4_3'], 512, 3, pad=1, flip_filters=False)
    net['pool4'] = PoolLayer(net['conv4_4'], 2)
    net['conv5_1'] = ConvLayer(net['pool4'], 512, 3, pad=1, flip_filters=False)
    net['conv5_2'] = ConvLayer(net['conv5_1'], 512, 3, pad=1, flip_filters=False)
    net['conv5_3'] = ConvLayer(net['conv5_2'], 512, 3, pad=1, flip_filters=False)
    net['conv5_4'] = ConvLayer(net['conv5_3'], 512, 3, pad=1, flip_filters=False)
    net['pool5'] = PoolLayer(net['conv5_4'], 2)
    net['fc6'] = DenseLayer(net['pool5'], num_units=4096)
    net['fc6_dropout'] = DropoutLayer(net['fc6'], p=0.5)
    net['fc7'] = DenseLayer(net['fc6_dropout'], num_units=4096)
    net['fc7_dropout'] = DropoutLayer(net['fc7'], p=0.5)
    net['fc8'] = DenseLayer(net['fc7_dropout'], num_units=1000, nonlinearity=None)
    net['prob'] = NonlinearityLayer(net['fc8'], softmax)

    # Remove the trainable tag from the layers that can potentially have it
    # (the conv and fully-connected layers; input, pooling, dropout and
    # nonlinearity layers have no parameters to freeze).
    for key, val in net.iteritems():
        if not ('dropout' in key or 'pool' in key or 'input' in key or 'prob' in key):
            net[key].params[net[key].W].remove("trainable")
            net[key].params[net[key].b].remove("trainable")

    return net
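# Quick sanity check for the freezing above (illustrative usage, not part of the
# original script): once the 'trainable' tag has been stripped from the VGG
# weights, no parameters of the pretrained stack should be reported as trainable.
import lasagne

net = build_vgg_model()
print(len(lasagne.layers.get_all_params(net['prob'], trainable=True)))  # expected: 0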
def build_convpool_mix(input_vars, nb_classes, grad_clip=110, imsize=32, n_colors=3, n_timewin=3):
    """
    Builds the complete network with LSTM and 1D-conv layers combined

    :param input_vars: list of EEG images (one image per time window)
    :param nb_classes: number of classes
    :param grad_clip: the gradient messages are clipped to the given value during
                      the backward pass.
    :param imsize: size of the input image (assumes a square input)
    :param n_colors: number of color channels in the image
    :param n_timewin: number of time windows in the snippet
    :return: a pointer to the output of last layer
    """
    convnets = []
    w_init = None
    # Build n_timewin parallel CNNs with shared weights
    for i in range(n_timewin):
        if i == 0:
            convnet, w_init = build_cnn(input_vars[i], imsize=imsize, n_colors=n_colors)
        else:
            convnet, _ = build_cnn(input_vars[i], w_init=w_init, imsize=imsize, n_colors=n_colors)
        convnets.append(FlattenLayer(convnet))
    # at this point convnets shape is [numTimeWin][n_samples, features]
    # we want the shape to be [n_samples, features, numTimeWin]
    convpool = ConcatLayer(convnets)
    convpool = ReshapeLayer(convpool, ([0], n_timewin, get_output_shape(convnets[0])[1]))
    #print('1. convpool:', convpool.output_shape)  # [0], 3, 2048
    reformConvpool = DimshuffleLayer(convpool, (0, 2, 1))
    #print('1.5. convpool reshape:', reformConvpool.output_shape)  # None, 2048, 3
    # input to 1D convlayer should be in (batch_size, num_input_channels, input_length)
    conv_out = Conv1DLayer(reformConvpool, 64, 3)
    #print('2. conv_out shape:', conv_out.output_shape)  # None, 64, 1
    conv_out = FlattenLayer(conv_out)
    #print('2.5. conv_out shape:', conv_out.output_shape)  # None, 64
    # Input to LSTM should have the shape as (batch size, SEQ_LENGTH, num_features)
    lstm = LSTMLayer(convpool, num_units=128, grad_clipping=grad_clip,
                     nonlinearity=lasagne.nonlinearities.tanh)
    #print('3. lstm:', lstm.output_shape)  # None, 3, 128
    lstm_out = SliceLayer(lstm, -1, 1)
    #print('3.5. lstm_out:', lstm_out.output_shape)  # None, 128
    # Merge 1D-Conv and LSTM outputs
    dense_input = ConcatLayer([conv_out, lstm_out])  # None, 192
    #print('4. dense:', dense_input.output_shape)
    # A fully-connected layer of 512 units with 50% dropout on its inputs:
    convpool = DenseLayer(lasagne.layers.dropout(dense_input, p=.5),
                          num_units=512, nonlinearity=lasagne.nonlinearities.rectify)
    # And, finally, the output layer with 50% dropout on its inputs:
    convpool = DenseLayer(convpool, num_units=nb_classes,
                          nonlinearity=lasagne.nonlinearities.softmax)
    return convpool
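# Minimal usage sketch (hypothetical shapes): build_convpool_mix expects one 4D
# input variable per time window, each carrying (batch, n_colors, imsize, imsize) images.
import theano.tensor as T
import lasagne

n_timewin, nb_classes = 3, 4
input_vars = [T.tensor4('eeg_win%d' % i) for i in range(n_timewin)]
network = build_convpool_mix(input_vars, nb_classes, imsize=32, n_colors=3, n_timewin=n_timewin)
prediction = lasagne.layers.get_output(network)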
def create_network(available_actions_num):
    # Creates the input variables
    s1 = tensor.tensor4("States")
    a = tensor.vector("Actions", dtype="int32")
    q2 = tensor.vector("Next State best Q-Value")
    r = tensor.vector("Rewards")
    nonterminal = tensor.vector("Nonterminal", dtype="int8")

    # Creates the input layer of the network.
    dqn = InputLayer(shape=[None, 1, downsampled_y, downsampled_x], input_var=s1)

    # Adds 3 convolutional layers, each followed by a max pooling layer.
    dqn = Conv2DLayer(dqn, num_filters=32, filter_size=[8, 8],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[4, 4],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])
    dqn = Conv2DLayer(dqn, num_filters=64, filter_size=[3, 3],
                      nonlinearity=rectify, W=GlorotUniform("relu"),
                      b=Constant(.1))
    dqn = MaxPool2DLayer(dqn, pool_size=[2, 2])

    # Adds a single fully connected layer.
    dqn = DenseLayer(dqn, num_units=512, nonlinearity=rectify, W=GlorotUniform("relu"),
                     b=Constant(.1))

    # Adds a single fully connected layer which is the output layer.
    # (no nonlinearity as it is for approximating an arbitrary real function)
    dqn = DenseLayer(dqn, num_units=available_actions_num, nonlinearity=None)

    # Theano stuff
    q = get_output(dqn)
    # Only q for the chosen actions is updated more or less according to following formula:
    # target Q(s,a,t) = r + gamma * max Q(s2,_,t+1)
    target_q = tensor.set_subtensor(q[tensor.arange(q.shape[0]), a],
                                    r + discount_factor * nonterminal * q2)
    loss = squared_error(q, target_q).mean()

    # Updates the parameters according to the computed gradient using rmsprop.
    params = get_all_params(dqn, trainable=True)
    updates = rmsprop(loss, params, learning_rate)

    # Compiles theano functions
    print "Compiling the network ..."
    function_learn = theano.function([s1, q2, a, r, nonterminal], loss, updates=updates,
                                     name="learn_fn")
    function_get_q_values = theano.function([s1], q, name="eval_fn")
    function_get_best_action = theano.function([s1], tensor.argmax(q), name="test_fn")
    print "Network compiled."

    # Returns Theano objects for the net and functions.
    # We wouldn't need the net anymore but it is nice to save your model.
    return dqn, function_learn, function_get_q_values, function_get_best_action
def lasagne_separate2(M, P, FE, W1, W2, z1, z2, hh=.0001, ep=5000, d=0, wsp=.0001, plt=True):
    from paris.signal import bss_eval

    # Get dictionary shapes
    K = [W1.shape[0], W2.shape[0]]

    # GPU cached data
    _M = theano.shared(M.T.astype(float32))
    dum = Th.vector('dum')

    # We have weights to discover
    H = theano.shared(random.rand(M.T.shape[0], K[0] + K[1]).astype(float32))
    fI = InputLayer(shape=(M.T.shape[0], K[0] + K[1]), input_var=H)

    # Split in two pathways
    fW1 = SliceLayer(fI, indices=slice(0, K[0]), axis=1)
    fW2 = SliceLayer(fI, indices=slice(K[0], K[0] + K[1]), axis=1)

    # Dropout?
    dfW1 = DropoutLayer(fW1, d)
    dfW2 = DropoutLayer(fW2, d)

    # Compute source modulators using previously learned dictionaries
    R1 = DenseLayer(dfW1, num_units=M.shape[0], W=W1.astype(float32),
                    nonlinearity=lambda x: psoftplus(x, 3.), b=None)
    R2 = DenseLayer(dfW2, num_units=M.shape[0], W=W2.astype(float32),
                    nonlinearity=lambda x: psoftplus(x, 3.), b=None)

    # Add the two approximations
    R = ElemwiseSumLayer([R1, R2])

    # Cost function
    cost = (_M*(Th.log(_M+eps) - Th.log(get_output(R)+eps)) - _M + get_output(R)).mean() \
        + wsp*Th.mean(H) + 0*Th.mean(dum)

    # Train it using Lasagne
    opt = downhill.build('rprop', loss=cost, inputs=[dum], params=[H])
    train = downhill.Dataset(array([0]).astype(float32), batch_size=0)
    er = downhill_train(opt, train, hh, ep, None)[-1]

    # Get outputs
    _r = nget(R, dum, array([0]).astype(float32)) + eps
    _r1 = nget(R1, dum, array([0]).astype(float32))
    _r2 = nget(R2, dum, array([0]).astype(float32))
    o1 = FE.ife(_r1 * (M / _r), P)
    o2 = FE.ife(_r2 * (M / _r), P)
    sxr = bss_eval(o1, 0, vstack((z1, z2))) + bss_eval(o2, 1, vstack((z1, z2)))

    return o1, o2, (array(sxr[:3]) + array(sxr[3:])) / 2.
def build_cnn(input_var=None, n=5): # create a residual learning building block with two stacked 3x3 convlayers as in paper def residual_block(l, increase_dim=False, projection=False): input_num_filters = l.output_shape[1] if increase_dim: first_stride = (2, 2) out_num_filters = input_num_filters * 2 else: first_stride = (1, 1) out_num_filters = input_num_filters #print(l.output_shape) l_l = DenseLayer(l, num_units=l.output_shape[3], num_leading_axes=-1, nonlinearity=None) #print(l.output_shape[3]) #print("l_1.output_shape", l_l.output_shape) #stride=first_stride stack_left_1 = batch_norm( ConvLayer(l_l, num_filters=out_num_filters, filter_size=(3, 3), stride=first_stride, nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) stack_left_2 = batch_norm( ConvLayer(stack_left_1, num_filters=out_num_filters, filter_size=(3, 3), stride=(1, 1), nonlinearity=None, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) #stack_right_1 = batch_norm(ConvLayer(ElemwiseSumLayer([l, NegativeLayer(l_l)]), num_filters=out_num_filters, filter_size=(2,2), stride=first_stride, nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) #stack_right_2 = batch_norm(ConvLayer(stack_right_1, num_filters=out_num_filters, filter_size=(2,2), stride=(1,1), nonlinearity=None, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) print("first stack: ", stack_left_2.output_shape) # add shortcut connections if increase_dim: if projection: # projection shortcut, as option B in paper projection = batch_norm( ConvLayer(l, num_filters=out_num_filters, filter_size=(1, 1), stride=(2, 2), nonlinearity=None, pad='same', b=None, flip_filters=False)) print("projection shape: ", projection.output_shape) ##block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, projection]),nonlinearity=rectify) block = NonlinearityLayer(ElemwiseSumLayer( [stack_left_2, projection]), nonlinearity=rectify) else: # identity shortcut, as option A in paper #print(l.output_shape[2]) if (l.output_shape[2] % 2 == 0 and l.output_shape[3] % 2 == 0): identity = ExpressionLayer( l, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], s[2] // 2, s[3] // 2)) elif (l.output_shape[2] % 2 == 0 and l.output_shape[3] % 2 == 1): identity = ExpressionLayer( l, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], s[2] // 2, s[3] // 2 + 1)) elif (l.output_shape[2] % 2 == 1 and l.output_shape[3] % 2 == 0): identity = ExpressionLayer( l, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], s[2] // 2 + 1, s[3] // 2)) else: identity = ExpressionLayer( l, lambda X: X[:, :, ::2, ::2], lambda s: (s[0], s[1], s[2] // 2 + 1, s[3] // 2 + 1)) padding = PadLayer(identity, [(int)(out_num_filters / 4), 0, 0], batch_ndim=1) print('------------------') print(stack_left_2.output_shape) #print(stack_right_2.output_shape) print(identity.output_shape) print(padding.output_shape) #block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, padding]),nonlinearity=rectify) block = NonlinearityLayer(ElemwiseSumLayer( [stack_left_2, padding]), nonlinearity=rectify) else: #block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, stack_right_2, l]),nonlinearity=rectify) print("l output shape: ", l.output_shape) block = NonlinearityLayer(ElemwiseSumLayer([stack_left_2, l]), nonlinearity=rectify) return block # Building the network l_in = InputLayer(shape=(None, 16, 512, 660), input_var=input_var) # first layer, output is 16 x 32 x 32 l = batch_norm( ConvLayer(l_in, 
num_filters=16, filter_size=(3, 3), stride=(4, 4), nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) print(l.output_shape) # first stack of residual blocks, output is 16 x 32 x 32 for _ in range(n): l = residual_block(l) #l = DropoutLayer(l, p = 0.7) #print(l.output_shape) #print(l.output_shape) l = residual_block(l, increase_dim=True) #l = DropoutLayer(l, p = 0.5) for _ in range(n): l = residual_block(l) #l = DropoutLayer(l, p = 0.5) print(l.output_shape) l = batch_norm( ConvLayer(l, num_filters=32, filter_size=(3, 3), stride=(2, 2), nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) #l = residual_block(l, increase_dim=True) #for _ in range(n): # l = residual_block(l) #print(l.output_shape) #second stack of residual blocks, output is 32 x 16 x 16 #l = batch_norm(ConvLayer(l, num_filters = 64, filter_size=(3,3), stride=(2,2), nonlinearity=rectify, pad='same', W=lasagne.init.HeNormal(gain='relu'), flip_filters=False)) #l = residual_block(l, increase_dim=True) #for _ in range(n): # l = residual_block(l) #print(l.output_shape) """ # third stack of residual blocks, output is 64 x 8 x 8 l = residual_block(l, increase_dim=True) for _ in range(1,n): l = residual_block(l) """ # average pooling l = GlobalPoolLayer(l) print("before dense: ", l.output_shape) # fully connected layer network = DenseLayer(l, num_units=1, W=lasagne.init.HeNormal(), nonlinearity=sigmoid) return network
helper.set_all_param_values(l_arc, params[:2]) return embedding_fn worker = OmniglotOS(image_size=32, batch_size=1024) X_test, y_test = worker.fetch_batch('test') for glimpses in range(1, 9): embedding_fn = create_embedder_fn(glimpses) X = T.matrix("embedding") y = T.imatrix("target") l_in = InputLayer(shape=(None, 512), input_var=X) l_y = DenseLayer(l_in, 1, nonlinearity=sigmoid) prediction = get_output(l_y) loss = T.mean(binary_crossentropy(prediction, y)) accuracy = T.mean(binary_accuracy(prediction, y)) params = get_all_params(l_y) updates = adam(loss, params, learning_rate=1e-3) train_fn = theano.function([X, y], outputs=loss, updates=updates) val_fn = theano.function([X, y], outputs=[loss, accuracy]) for i in range(250): X_train, y_train = worker.fetch_batch('train') train_fn(embedding_fn(X_train), y_train) X_train, y_train = worker.fetch_batch('train') train_loss = train_fn(embedding_fn(X_train), y_train) val_loss, val_acc = val_fn(embedding_fn(X_test), y_test)
def regr(X, y): l = InputLayer(shape=(None, X.shape[1])) l = DenseLayer(l, num_units=y.shape[1], nonlinearity=None) net = NeuralNet(l, regression=True, update_learning_rate=0.01) net.fit(X, y) print(net.score(X, y))
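The regr helper above is a one-layer linear regression wrapped in nolearn's NeuralNet. As an illustration only, a hedged usage sketch on synthetic data (the array shapes and names below are made up and not part of the original code):

# Hypothetical usage of regr(): fit a linear map from 5 inputs to 2 outputs on random data.
# X has shape (n_samples, n_features), y has shape (n_samples, n_targets).
import numpy as np

X_demo = np.random.rand(100, 5).astype(np.float32)
true_W = np.random.rand(5, 2).astype(np.float32)
y_demo = X_demo.dot(true_W)

regr(X_demo, y_demo)  # prints whatever net.score(X, y) reports for the fitted model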
def build_network(): net = {} net['input'] = InputLayer((None, 3, 299, 299)) net['conv'] = bn_conv(net['input'], num_filters=32, filter_size=3, stride=2) net['conv_1'] = bn_conv(net['conv'], num_filters=32, filter_size=3) net['conv_2'] = bn_conv(net['conv_1'], num_filters=64, filter_size=3, pad=1) net['pool'] = Pool2DLayer(net['conv_2'], pool_size=3, stride=2, mode='max') net['conv_3'] = bn_conv(net['pool'], num_filters=80, filter_size=1) net['conv_4'] = bn_conv(net['conv_3'], num_filters=192, filter_size=3) net['pool_1'] = Pool2DLayer(net['conv_4'], pool_size=3, stride=2, mode='max') net['mixed/join'] = inceptionA( net['pool_1'], nfilt=((64,), (48, 64), (64, 96, 96), (32,))) net['mixed_1/join'] = inceptionA( net['mixed/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,))) net['mixed_2/join'] = inceptionA( net['mixed_1/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,))) net['mixed_3/join'] = inceptionB( net['mixed_2/join'], nfilt=((384,), (64, 96, 96))) net['mixed_4/join'] = inceptionC( net['mixed_3/join'], nfilt=((192,), (128, 128, 192), (128, 128, 128, 128, 192), (192,))) net['mixed_5/join'] = inceptionC( net['mixed_4/join'], nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,))) net['mixed_6/join'] = inceptionC( net['mixed_5/join'], nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,))) net['mixed_7/join'] = inceptionC( net['mixed_6/join'], nfilt=((192,), (192, 192, 192), (192, 192, 192, 192, 192), (192,))) net['mixed_8/join'] = inceptionD( net['mixed_7/join'], nfilt=((192, 320), (192, 192, 192, 192))) net['mixed_9/join'] = inceptionE( net['mixed_8/join'], nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)), pool_mode='average_exc_pad') net['mixed_10/join'] = inceptionE( net['mixed_9/join'], nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)), pool_mode='max') net['pool3'] = GlobalPoolLayer(net['mixed_10/join']) net['softmax'] = DenseLayer( net['pool3'], num_units=1008, nonlinearity=softmax) return net
class ShallowNeuralForest:
    def __init__(self, n_inputs, n_outputs, regression, multiclass=False, depth=5,
                 n_estimators=20, n_hidden=128, learning_rate=0.01, num_epochs=500,
                 pi_iters=20, sgd_iters=10, batch_size=1000, momentum=0.0,
                 dropout=0.0, loss=None, update=adagrad):
        """
        Parameters
        ----------
        n_inputs : number of input features
        n_outputs : number of classes to predict (1 for regression);
            for 2-class classification n_outputs should be 2, not 1
        regression : True for regression, False for classification
        multiclass : not used
        depth : depth of each tree in the ensemble
        n_estimators : number of trees in the ensemble
        n_hidden : number of neurons in the hidden layer
        pi_iters : number of iterations for the iterative algorithm that updates pi
        sgd_iters : number of full iterations of sgd between two consecutive updates of pi
        loss : theano loss function. If None, squared error will be used for regression
            and cross entropy will be used for classification
        update : theano update function
        """
        self._depth = depth
        self._n_estimators = n_estimators
        self._n_hidden = n_hidden
        self._n_outputs = n_outputs
        self._loss = loss
        self._regression = regression
        self._multiclass = multiclass
        self._learning_rate = learning_rate
        self._num_epochs = num_epochs
        self._pi_iters = pi_iters
        self._sgd_iters = sgd_iters
        self._batch_size = batch_size
        self._momentum = momentum
        self._update = update

        self.t_input = T.matrix('input')
        self.t_label = T.matrix('output')

        self._cached_trainable_params = None
        self._cached_params = None

        self._n_net_out = n_estimators * ((1 << depth) - 1)

        self.l_input = InputLayer((None, n_inputs))
        self.l_dense1 = DenseLayer(self.l_input, self._n_hidden, nonlinearity=rectify)
        if dropout != 0:
            self.l_dense1 = DropoutLayer(self.l_dense1, p=dropout)
        if not __DEBUG_NO_FOREST__:
            self.l_dense2 = DenseLayer(self.l_dense1, self._n_net_out, nonlinearity=sigmoid)
            self.l_forest = NeuralForestLayer(self.l_dense2, self._depth,
                                              self._n_estimators, self._n_outputs,
                                              self._pi_iters)
        else:
            self.l_forest = DenseLayer(self.l_dense1, self._n_outputs, nonlinearity=softmax)

    def _create_functions(self):
        self._update_func = self._update(self._get_loss_function(),
                                         self._get_all_trainable_params(),
                                         self._learning_rate)
        if self._momentum != 0:
            self._update_func = apply_nesterov_momentum(
                self._update_func, self._get_all_trainable_params(), self._momentum)
        self._loss_func = self._get_loss_function()
        self._train_function = theano.function([self.t_input, self.t_label],
                                               self._get_loss_function(),
                                               updates=self._update_func)

    def fit(self, X, y, X_val=None, y_val=None, on_epoch=None, verbose=False):
        """
        Train the model

        Parameters
        ----------
        X : input vector for the training set
        y : output vector for the training set. One-hot encoding is required for classification
        X_val : if not None, input vector for the validation set
        y_val : if not None, output vector for the validation set
        on_epoch : a callback that is called after each epoch
            if X_val is None, the signature is (epoch, training_error)
            if X_val is not None, the signature is (epoch, training_error, validation_error, accuracy)
            on iterations that update pi the training error is reported for the previous iteration
        verbose : if True, prints the current step on each epoch
        """
        self._create_functions()

        X = X.astype(np.float32)
        y = y.astype(np.float32)

        self._x_mean = np.mean(X, axis=0)
        self._x_std = np.std(X, axis=0)
        self._x_std[self._x_std == 0] = 1
        X = (X - self._x_mean) / self._x_std

        if y_val is not None:
            assert X_val is not None
            X_val = X_val.astype(np.float32)
            y_val = y_val.astype(np.float32)
            X_val = (X_val - self._x_mean) / self._x_std
        if X_val is not None:
            assert y_val is not None
            predictions = self._predict_internal(self._get_output())
            accuracy = T.mean(T.eq(predictions, self._predict_internal(self.t_label)))
            test_function = theano.function([self.t_input, self.t_label],
                                            [self._get_loss_function(), accuracy])

        iterator = BatchIterator(self._batch_size)

        loss = 0
        for epoch in range(self._num_epochs):
            # update the values of pi
            if not __DEBUG_NO_FOREST__ and epoch % self._sgd_iters == 0:
                if verbose:
                    print("updating pi")
                self.l_forest.update_pi(X, y)
                if verbose:
                    print("recreating update funcs")
                self._create_functions()
            else:
                if verbose:
                    print("updating theta")
                loss = 0
                deno = 0
                # update the network parameters
                for Xb, yb in iterator(X, y):
                    loss += self._train_function(Xb, yb)
                    deno += 1
                loss /= deno

            if X_val is not None:
                tloss = 0
                accur = 0
                deno = 0
                iterator = BatchIterator(self._batch_size)
                for Xb, yb in iterator(X_val, y_val):
                    tl, ac = test_function(Xb, yb)
                    tloss += tl
                    accur += ac
                    deno += 1
                tloss /= deno
                accur /= deno

            if on_epoch is not None:
                if X_val is None:
                    on_epoch(epoch, loss)
                else:
                    on_epoch(epoch, loss, tloss, accur)

        return self

    def _predict_internal(self, y):
        if not self._regression and not self._multiclass:
            return y.argmax(axis=1)
        else:
            return y >= 0.5

    def predict(self, X):
        ret = self.predict_proba(X)
        return self._predict_internal(ret)

    def predict_proba(self, X):
        X = X.astype(np.float32)
        X = (X - self._x_mean) / self._x_std
        predict_function = theano.function([self.t_input], self._get_output())
        return predict_function(X)

    def _get_loss_function(self):
        if self._loss is None:
            if self._regression:
                self._loss = squared_error
            else:
                self._loss = categorical_crossentropy
        return aggregate(self._loss(self._get_output(), self.t_label), mode='mean')

    def _get_output(self):
        return get_output(self.l_forest, self.t_input)

    def _get_all_trainable_params(self):
        if self._cached_trainable_params is None:
            self._cached_trainable_params = get_all_params(self.l_forest, trainable=True)
        return self._cached_trainable_params

    def _get_all_params(self):
        if self._cached_params is None:
            self._cached_params = get_all_params(self.l_forest)
        return self._cached_params
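As a usage sketch only: ShallowNeuralForest expects one-hot targets for classification, so even a binary problem uses n_outputs=2. The data, hyperparameters, and the log_epoch callback below are illustrative and not part of the original code; the class's own dependencies (NeuralForestLayer, BatchIterator, __DEBUG_NO_FOREST__, etc.) are assumed to be defined in the surrounding module.

# Hypothetical usage of ShallowNeuralForest on a toy 2-class problem.
import numpy as np

X_demo = np.random.rand(500, 10).astype(np.float32)
labels = np.random.randint(0, 2, size=500)
y_demo = np.eye(2, dtype=np.float32)[labels]   # one-hot targets, shape (500, 2)

def log_epoch(epoch, train_err):
    # signature is (epoch, training_error) because no validation set is passed
    print("epoch %d: training loss %.4f" % (epoch, train_err))

forest = ShallowNeuralForest(n_inputs=10, n_outputs=2, regression=False,
                             depth=4, n_estimators=10, n_hidden=64,
                             num_epochs=20, batch_size=100)
forest.fit(X_demo, y_demo, on_epoch=log_epoch)
predicted_classes = forest.predict(X_demo)     # argmax over the class outputs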
def build_model(): net = {} net['input'] = InputLayer((None, 3, 224, 224)) sub_net, parent_layer_name = build_simple_block( net['input'], ['conv1', 'bn_conv1', 'conv1_relu'], 64, 7, 3, 2, use_bias=True) net.update(sub_net) net['pool1'] = PoolLayer(net[parent_layer_name], pool_size=3, stride=2, pad=0, mode='max', ignore_border=False) block_size = list('abc') parent_layer_name = 'pool1' for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1, 1, True, 4, ix='2%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='2%s' % c) net.update(sub_net) block_size = list('abcd') for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 2, 1.0 / 2, True, 4, ix='3%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='3%s' % c) net.update(sub_net) block_size = list('abcdef') for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 2, 1.0 / 2, True, 4, ix='4%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='4%s' % c) net.update(sub_net) block_size = list('abc') for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 2, 1.0 / 2, True, 4, ix='5%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='5%s' % c) net.update(sub_net) net['pool5'] = PoolLayer(net[parent_layer_name], pool_size=7, stride=1, pad=0, mode='average_exc_pad', ignore_border=False) net['fc1000'] = DenseLayer(net['pool5'], num_units=1000, nonlinearity=None) net['prob'] = NonlinearityLayer(net['fc1000'], nonlinearity=softmax) return net
class Head(Layer): r""" The base class :class:`Head` represents a generic head for the Neural Turing Machine. The heads are responsible for the read/write operations on the memory. An instance of :class:`Head` outputs a weight vector defined by .. math :: \alpha_{t} &= \sigma_{alpha}(h_{t} W_{alpha} + b_{alpha})\\ k_{t} &= \sigma_{key}(h_{t} W_{key} + b_{key})\\ \beta_{t} &= \sigma_{beta}(h_{t} W_{beta} + b_{beta})\\ g_{t} &= \sigma_{gate}(h_{t} W_{gate} + b_{gate})\\ s_{t} &= \sigma_{shift}(h_{t} W_{shift} + b_{shift})\\ \gamma_{t} &= \sigma_{gamma}(h_{t} W_{gamma} + b_{gamma}) .. math :: w_{t}^{c} &= softmax(\beta_{t} * K(\alpha_{t} * k_{t}, M_{t}))\\ w_{t}^{g} &= g_{t} * w_{t}^{c} + (1 - g_{t}) * w_{t-1}\\ \tilde{w}_{t} &= s_{t} \ast w_{t}^{g}\\ w_{t} \propto \tilde{w}_{t}^{\gamma_{t}} Parameters ---------- controller: a :class:`Controller` instance The controller of the Neural Turing Machine. num_shifts: int Number of shifts allowed by the convolutional shift operation (centered on 0, eg. ``num_shifts=3`` represents shifts in [-1, 0, 1]). memory_shape: tuple Shape of the NTM's memory W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None`` If callable, initializer of the weights for the parameter :math:`\alpha_{t}`. If ``None``, the parameter :math:`\alpha_{t}` is ignored (:math:`\alpha_{t} = 1`). Otherwise a matrix with shape ``(controller.num_units, memory_shape[1])``. b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None`` If callable, initializer of the biases for the parameter :math:`\alpha_{t}`. If ``None``, no bias. Otherwise a matrix with shape ``(memory_shape[1],)``. nonlinearity_sign: callable or ``None`` The nonlinearity that is applied for parameter :math:`\alpha_{t}`. If ``None``, the nonlinearity is ``identity``. W_hid_to_key: callable, Numpy array or Theano shared variable b_hid_to_key: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_key: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`k_{t}`. W_hid_to_beta: callable, Numpy array or Theano shared variable b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_beta: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`\beta_{t}`. W_hid_to_gate: callable, Numpy array or Theano shared variable b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_gate: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`g_{t}`. W_hid_to_shift: callable, Numpy array or Theano shared variable b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_shift: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`s_{t}`. W_hid_to_gamma: callable, Numpy array or Theano shared variable b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_gamma: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`\gamma_{t}` weights_init: callable, Numpy array or Theano shared variable Initializer for the initial weight vector (:math:`w_{0}`). learn_init: bool If ``True``, initial hidden values are learned. 
""" def __init__(self, controller, num_shifts=3, memory_shape=(128, 20), W_hid_to_sign=None, b_hid_to_sign=lasagne.init.Constant(0.), nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.), W_hid_to_key=lasagne.init.GlorotUniform(), b_hid_to_key=lasagne.init.Constant(0.), nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.), W_hid_to_beta=lasagne.init.GlorotUniform(), b_hid_to_beta=lasagne.init.Constant(0.), nonlinearity_beta=lasagne.nonlinearities.rectify, W_hid_to_gate=lasagne.init.GlorotUniform(), b_hid_to_gate=lasagne.init.Constant(0.), nonlinearity_gate=nonlinearities.hard_sigmoid, W_hid_to_shift=lasagne.init.GlorotUniform(), b_hid_to_shift=lasagne.init.Constant(0.), nonlinearity_shift=lasagne.nonlinearities.softmax, W_hid_to_gamma=lasagne.init.GlorotUniform(), b_hid_to_gamma=lasagne.init.Constant(0.), nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities.rectify(x), weights_init=init.OneHot(), learn_init=False, **kwargs): super(Head, self).__init__(controller, **kwargs) self.memory_shape = memory_shape self.basename = kwargs.get('name', 'head') self.learn_init = learn_init if W_hid_to_sign is not None: self.sign = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_sign, b=b_hid_to_sign, nonlinearity=nonlinearity_sign, name=self.basename + '.sign') self.W_hid_to_sign, self.b_hid_to_sign = self.sign.W, self.sign.b else: self.sign = None self.W_hid_to_sign, self.b_hid_to_sign = None, None self.key = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_key, b=b_hid_to_key, nonlinearity=nonlinearity_key, name=self.basename + '.key') self.W_hid_to_key, self.b_hid_to_key = self.key.W, self.key.b self.beta = DenseLayer(controller, num_units=1, W=W_hid_to_beta, b=b_hid_to_beta, nonlinearity=nonlinearity_beta, name=self.basename + '.beta') self.W_hid_to_beta, self.b_hid_to_beta = self.beta.W, self.beta.b self.gate = DenseLayer(controller, num_units=1, W=W_hid_to_gate, b=b_hid_to_gate, nonlinearity=nonlinearity_gate, name=self.basename + '.gate') self.W_hid_to_gate, self.b_hid_to_gate = self.gate.W, self.gate.b self.num_shifts = num_shifts self.shift = DenseLayer(controller, num_units=num_shifts, W=W_hid_to_shift, b=b_hid_to_shift, nonlinearity=nonlinearity_shift, name=self.basename + '.shift') self.W_hid_to_shift, self.b_hid_to_shift = self.shift.W, self.shift.b self.gamma = DenseLayer(controller, num_units=1, W=W_hid_to_gamma, b=b_hid_to_gamma, nonlinearity=nonlinearity_gamma, name=self.basename + '.gamma') self.W_hid_to_gamma, self.b_hid_to_gamma = self.gamma.W, self.gamma.b self.weights_init = self.add_param( weights_init, (1, self.memory_shape[0]), name='weights_init', trainable=learn_init, regularizable=False) def get_output_for(self, h_t, w_tm1, M_t, **kwargs): if self.sign is not None: sign_t = self.sign.get_output_for(h_t, **kwargs) else: sign_t = 1. k_t = self.key.get_output_for(h_t, **kwargs) beta_t = self.beta.get_output_for(h_t, **kwargs) g_t = self.gate.get_output_for(h_t, **kwargs) s_t = self.shift.get_output_for(h_t, **kwargs) gamma_t = self.gamma.get_output_for(h_t, **kwargs) # Content Adressing (3.3.1) beta_t = T.addbroadcast(beta_t, 1) betaK = beta_t * similarities.cosine_similarity(sign_t * k_t, M_t) w_c = lasagne.nonlinearities.softmax(betaK) # Interpolation (3.3.2) g_t = T.addbroadcast(g_t, 1) w_g = g_t * w_c + (1. 
- g_t) * w_tm1 # Convolutional Shift (3.3.2) w_g_padded = w_g.dimshuffle(0, 'x', 'x', 1) conv_filter = s_t.dimshuffle(0, 'x', 'x', 1) pad = (self.num_shifts // 2, (self.num_shifts - 1) // 2) w_g_padded = padding.pad(w_g_padded, [pad], batch_ndim=3) convolution = T.nnet.conv2d(w_g_padded, conv_filter, input_shape=(self.input_shape[0], 1, 1, self.memory_shape[0] + pad[0] + pad[1]), filter_shape=(self.input_shape[0], 1, 1, self.num_shifts), subsample=(1, 1), border_mode='valid') w_tilde = convolution[:, 0, 0, :] # Sharpening (3.3.2) gamma_t = T.addbroadcast(gamma_t, 1) w = T.pow(w_tilde + 1e-6, gamma_t) w /= T.sum(w) return w def get_params(self, **tags): params = super(Head, self).get_params(**tags) if self.sign is not None: params += self.sign.get_params(**tags) params += self.key.get_params(**tags) params += self.beta.get_params(**tags) params += self.gate.get_params(**tags) params += self.shift.get_params(**tags) params += self.gamma.get_params(**tags) return params
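The addressing equations in the Head docstring are easier to follow outside the Theano graph, so here is a small NumPy sketch of the same four steps (content addressing, interpolation, circular shift, sharpening) for a single example. It is only an illustration of the equations with toy shapes, not the module's Theano implementation.

# NumPy sketch of NTM addressing for one head and one example.
import numpy as np

def np_softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

N, M_dim = 8, 4
M = np.random.rand(N, M_dim)          # memory matrix M_t
k = np.random.rand(M_dim)             # key k_t
beta, g, gamma = 5.0, 0.7, 2.0        # key strength, gate, sharpening exponent
s = np.array([0.1, 0.8, 0.1])         # shift distribution over [-1, 0, 1]
w_prev = np.ones(N) / N               # previous weighting w_{t-1}

# 1) content addressing: cosine similarity against every memory row
cos = M.dot(k) / (np.linalg.norm(M, axis=1) * np.linalg.norm(k) + 1e-8)
w_c = np_softmax(beta * cos)

# 2) interpolation with the previous weighting
w_g = g * w_c + (1. - g) * w_prev

# 3) circular convolution with the shift kernel
w_tilde = np.zeros(N)
for j, shift in enumerate([-1, 0, 1]):
    w_tilde += s[j] * np.roll(w_g, shift)

# 4) sharpening and renormalisation
w = w_tilde ** gamma
w /= w.sum()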
def build_convpool_lstm(input_vars, nb_classes, grad_clip=110, imsize=32, n_colors=3, n_timewin=3): """ Builds the complete network with LSTM layer to integrate time from sequences of EEG images. :param input_vars: list of EEG images (one image per time window) :param nb_classes: number of classes :param grad_clip: the gradient messages are clipped to the given value during the backward pass. :param imsize: size of the input image (assumes a square input) :param n_colors: number of color channels in the image :param n_timewin: number of time windows in the snippet :return: a pointer to the output of last layer """ convnets = [] w_init = None # Build 7 parallel CNNs with shared weights for i in range(n_timewin): if i == 0: convnet, w_init = build_cnn(input_vars[i], imsize=imsize, n_colors=n_colors) else: convnet, _ = build_cnn(input_vars[i], w_init=w_init, imsize=imsize, n_colors=n_colors) convnets.append(FlattenLayer(convnet)) #print(convnet.output_shape) #None, 128, 4, 4 #print('0.:', convnets[0].output_shape) #None, 2048... 128*4*4 # at this point convnets shape is [numTimeWin][n_samples, features] # we want the shape to be [n_samples, features, numTimeWin] convpool = ConcatLayer(convnets) #print('1.concat:', convpool.output_shape) #None, 6144 convpool = ReshapeLayer(convpool, ([0], n_timewin, get_output_shape(convnets[0])[1])) # Input to LSTM should have the shape as (batch size, SEQ_LENGTH, num_features) #print('2.Reshape:', convpool.output_shape) #None, 3, 2048 convpool = LSTMLayer(convpool, num_units=128, grad_clipping=grad_clip, nonlinearity=lasagne.nonlinearities.tanh) # We only need the final prediction, we isolate that quantity and feed it # to the next layer. #print('3.LSTM:', convpool.output_shape) #None, 3, 128 convpool = SliceLayer(convpool, -1, 1) # Selecting the last prediction # A fully-connected layer of 256 units with 50% dropout on its inputs: #print('4.slice:', convpool.output_shape) #None, 128 convpool = DenseLayer(lasagne.layers.dropout(convpool, p=.5), num_units=256, nonlinearity=lasagne.nonlinearities.rectify) # And, finally, the output layer with 50% dropout on its inputs: convpool = DenseLayer(lasagne.layers.dropout(convpool, p=.5), num_units=nb_classes, nonlinearity=lasagne.nonlinearities.softmax) return convpool
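A hedged sketch of how build_convpool_lstm might be wired into a training graph, mirroring the 5-D input tensor used by the train() function later in this file. The class count, learning rate, and variable names here are illustrative only, and lasagne/theano/T are assumed to be imported at module level as elsewhere in this code.

# Hypothetical wiring for the CNN+LSTM EEG model: a 5-D tensor holding one image
# per time window, then cross-entropy training on the softmax output.
nb_classes = 4
input_var = T.TensorType('floatX', ((False,) * 5))()   # (n_timewin, n_samples, n_colors, imsize, imsize)
target_var = T.ivector('targets')

network = build_convpool_lstm(input_var, nb_classes, grad_clip=110,
                              imsize=32, n_colors=3, n_timewin=3)

prediction = lasagne.layers.get_output(network)
loss = lasagne.objectives.categorical_crossentropy(prediction, target_var).mean()
params = lasagne.layers.get_all_params(network, trainable=True)
updates = lasagne.updates.adam(loss, params, learning_rate=1e-3)
train_fn = theano.function([input_var, target_var], loss, updates=updates)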
def build_network(): """ Returns ------- """ input_var = t.tensor4('inputs') target = t.matrix('targets') lr = t.scalar('lr', dtype=theano.config.floatX) poolmode = 'average_exc_pad' new_pool_size2 = 3 net = {'input': InputLayer((None, 3, 299, 299), input_var=input_var)} net['conv'] = bn_conv(net['input'], num_filters=32, filter_size=3, stride=2) net['conv_1'] = bn_conv(net['conv'], num_filters=32, filter_size=3) net['conv_2'] = bn_conv(net['conv_1'], num_filters=64, filter_size=3, pad=1) net['pool'] = Pool2DLayer(net['conv_2'], pool_size=new_pool_size2, stride=2, mode=poolmode) net['conv_3'] = bn_conv(net['pool'], num_filters=80, filter_size=1) net['conv_4'] = bn_conv(net['conv_3'], num_filters=192, filter_size=3) net['pool_1'] = Pool2DLayer(net['conv_4'], pool_size=new_pool_size2, stride=2, mode=poolmode) net['mixed/join'] = inception_a( net['pool_1'], nfilt=((64,), (48, 64), (64, 96, 96), (32,))) net['mixed_1/join'] = inception_a( net['mixed/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,))) net['mixed_2/join'] = inception_a( net['mixed_1/join'], nfilt=((64,), (48, 64), (64, 96, 96), (64,))) net['mixed_3/join'] = inception_b( net['mixed_2/join'], nfilt=((384,), (64, 96, 96))) net['mixed_4/join'] = inception_c( net['mixed_3/join'], nfilt=((192,), (128, 128, 192), (128, 128, 128, 128, 192), (192,))) net['mixed_5/join'] = inception_c( net['mixed_4/join'], nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,))) net['mixed_6/join'] = inception_c( net['mixed_5/join'], nfilt=((192,), (160, 160, 192), (160, 160, 160, 160, 192), (192,))) net['mixed_7/join'] = inception_c( net['mixed_6/join'], nfilt=((192,), (192, 192, 192), (192, 192, 192, 192, 192), (192,))) net['mixed_8/join'] = inception_d( net['mixed_7/join'], nfilt=((192, 320), (192, 192, 192, 192))) net['mixed_9/join'] = inception_e( net['mixed_8/join'], nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)), pool_mode='average_exc_pad') net['mixed_10/join'] = inception_e( net['mixed_9/join'], nfilt=((320,), (384, 384, 384), (448, 384, 384, 384), (192,)), pool_mode=poolmode) net['pool3'] = GlobalPoolLayer(net['mixed_10/join']) net['softmax'] = DenseLayer( net['pool3'], num_units=1008, nonlinearity=softmax) train_output = lasagne.layers.get_output(net['softmax'], deterministic=False) train_loss = lasagne.objectives.categorical_crossentropy(train_output, target) train_loss = lasagne.objectives.aggregate(train_loss) train_err = t.mean(t.neq(t.argmax(train_output, axis=1), t.argmax(target, axis=1)), dtype=theano.config.floatX) params = lasagne.layers.get_all_params(net['softmax'], trainable=True) updates = lasagne.updates.sgd(loss_or_grads=train_loss, params=params, learning_rate=lr) test_output = lasagne.layers.get_output(net['softmax'], deterministic=True) test_loss = lasagne.objectives.categorical_crossentropy(test_output, target) test_loss = lasagne.objectives.aggregate(test_loss) test_err = t.mean(t.neq(t.argmax(test_output, axis=1), t.argmax(target, axis=1)), dtype=theano.config.floatX) # Compile a function performing a training step on a mini-batch (by giving # the updates dictionary) and returning the corresponding training loss: train_fn = theano.function([input_var, target, lr], [train_loss, train_err], updates=updates) # Compile a second function computing the validation loss and accuracy: val_fn = theano.function([input_var, target], [test_loss, test_err]) return {'model': net['softmax'], 'train_fn': train_fn, 'val_fn': val_fn}
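Since this build_network() returns the compiled functions in a dict, a smoke test might look like the sketch below. The random batch is purely to illustrate the expected shapes (NCHW images of size 299x299, one-hot targets over 1008 classes) and is not from the original code.

# Hypothetical smoke test for the dict returned by build_network().
import numpy as np

nets = build_network()
x_batch = np.random.rand(2, 3, 299, 299).astype(theano.config.floatX)
y_batch = np.zeros((2, 1008), dtype=theano.config.floatX)
y_batch[np.arange(2), np.random.randint(0, 1008, size=2)] = 1.   # one-hot targets

loss, err = nets['train_fn'](x_batch, y_batch, np.float32(0.01))  # one SGD step at lr=0.01
val_loss, val_err = nets['val_fn'](x_batch, y_batch)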
def train(images, labels, fold, model_type, mark=0, batch_size=32, num_epochs=5):
    """
    A sample training function which loops over the training set and evaluates the network
    on the validation set after each epoch. Evaluates the network on the test set whenever
    the validation accuracy improves.
    :param images: input images
    :param labels: target labels
    :param fold: tuple of (train, test) index numbers
    :param model_type: model type ('cnn', '1dconv', 'maxpool', 'lstm', 'mix')
    :param batch_size: batch size for training
    :param num_epochs: number of epochs of dataset to go over for training
    :return: none
    """
    #train(images, np.squeeze(feats[:, -1]) - 1, fold_pairs[2], 'cnn')
    num_classes = len(np.unique(labels))
    (X_train, y_train), (X_val, y_val), (X_test, y_test) = reformatInput(images, labels, fold)
    X_train = X_train.astype("float32", casting='unsafe')
    X_val = X_val.astype("float32", casting='unsafe')
    X_test = X_test.astype("float32", casting='unsafe')

    # Prepare Theano variables for inputs and targets
    input_var = T.TensorType('floatX', ((False, ) * 5))()
    target_var = T.ivector('targets')
    print(X_train.shape)

    # Create neural network model (depending on first command line parameter)
    print("Building model and compiling functions...")
    # Building the appropriate model
    if model_type == '1dconv':
        network = build_convpool_conv1d(input_var, num_classes)
    elif model_type == 'maxpool':
        network = build_convpool_max(input_var, num_classes)
    elif model_type == 'lstm':
        network = build_convpool_lstm(input_var, num_classes, 100)
    elif model_type == 'mix':
        network = build_convpool_mix(input_var, num_classes, 100)
    elif model_type == 'cnn':
        input_var = T.tensor4('inputs')
        network, _ = build_cnn(input_var)
        network = DenseLayer(lasagne.layers.dropout(network, p=.5),
                             num_units=256,
                             nonlinearity=lasagne.nonlinearities.rectify)
        network = DenseLayer(lasagne.layers.dropout(network, p=.5),
                             num_units=num_classes,
                             nonlinearity=lasagne.nonlinearities.softmax)
    else:
        raise ValueError(
            "Model not supported ['1dconv', 'maxpool', 'lstm', 'mix', 'cnn']")

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    loss = loss.mean()
    params = lasagne.layers.get_all_params(network, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=0.001)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var)
    test_loss = test_loss.mean()
    # as a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    # compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [test_loss, test_acc])

    # Finally, launch the training loop.
    print("Starting training...")
    best_validation_accu = 0
    # We iterate over epochs:
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=False):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, batch_size, shuffle=False):
            inputs, targets = batch
            err, acc = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1

        av_train_err = train_err / train_batches
        av_val_err = val_err / val_batches
        av_val_acc = val_acc / val_batches

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print(" training loss:\t\t{:.6f}".format(av_train_err))
        print(" validation loss:\t\t{:.6f}".format(av_val_err))
        print(" validation accuracy:\t\t{:.2f} %".format(av_val_acc * 100))

        if av_val_acc > best_validation_accu:
            best_validation_accu = av_val_acc

            # After training, we compute and print the test error:
            test_err = 0
            test_acc = 0
            test_batches = 0
            for batch in iterate_minibatches(X_test, y_test, batch_size, shuffle=False):
                inputs, targets = batch
                err, acc = val_fn(inputs, targets)
                test_err += err
                test_acc += acc
                test_batches += 1
            av_test_err = test_err / test_batches
            av_test_acc = test_acc / test_batches

            print("Final results:")
            print(" test loss:\t\t\t{:.6f}".format(av_test_err))
            print(" test accuracy:\t\t{:.2f} %".format(av_test_acc * 100))

            # Dump the network weights to a file like this:
            np.savez('weights_lasg_{}_{}'.format(model_type, mark),
                     *lasagne.layers.get_all_param_values(network))

    print('-' * 50)
    print("Best validation accuracy:\t\t{:.2f} %".format(best_validation_accu * 100))
    print("Best test accuracy:\t\t{:.2f} %".format(av_test_acc * 100))
class WriteHead(Head): r""" Write head. In addition to the weight vector, the write head also outputs an add vector :math:`a_{t}` and an erase vector :math:`e_{t}` defined by .. math :: \delta_{t} &= \sigma_{delta}(h_{t} W_{delta} + b_{delta})\\ a_{t} &= \delta_{t} * \sigma_{a}(h_{t} W_{a} + b_{a}) e_{t} &= \sigma_{e}(h_{t} W_{e} + b_{e}) Parameters ---------- controller: a :class:`Controller` instance The controller of the Neural Turing Machine. num_shifts: int Number of shifts allowed by the convolutional shift operation (centered on 0, eg. ``num_shifts=3`` represents shifts in [-1, 0, 1]). memory_shape: tuple Shape of the NTM's memory W_hid_to_sign: callable, Numpy array, Theano shared variable or ``None`` b_hid_to_sign: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_sign: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`\alpha_{t}`. W_hid_to_key: callable, Numpy array or Theano shared variable b_hid_to_key: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_key: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`k_{t}`. W_hid_to_beta: callable, Numpy array or Theano shared variable b_hid_to_beta: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_beta: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`\beta_{t}`. W_hid_to_gate: callable, Numpy array or Theano shared variable b_hid_to_gate: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_gate: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`g_{t}`. W_hid_to_shift: callable, Numpy array or Theano shared variable b_hid_to_shift: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_shift: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`s_{t}`. W_hid_to_gamma: callable, Numpy array or Theano shared variable b_hid_to_gamma: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_gamma: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`\gamma_{t}` W_hid_to_erase: callable, Numpy array or Theano shared variable b_hid_to_erase: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_erase: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`e_{t}` W_hid_to_add: callable, Numpy array or Theano shared variable b_hid_to_add: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_add: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`a_{t}` W_hid_to_sign_add: callable, Numpy array, Theano shared variable, or ``None`` b_hid_to_sign_add: callable, Numpy array, Theano shared variable or ``None`` nonlinearity_sign_add: callable or ``None`` Weights, biases and nonlinearity for parameter :math:`\delta_{t}` weights_init: callable, Numpy array or Theano shared variable Initializer for the initial weight vector (:math:`w_{0}`). learn_init: bool If ``True``, initial hidden values are learned. 
""" def __init__(self, controller, num_shifts=3, memory_shape=(128, 20), W_hid_to_sign=None, b_hid_to_sign=lasagne.init.Constant(0.), nonlinearity_sign=nonlinearities.ClippedLinear(low=-1., high=1.), W_hid_to_key=lasagne.init.GlorotUniform(), b_hid_to_key=lasagne.init.Constant(0.), nonlinearity_key=nonlinearities.ClippedLinear(low=0., high=1.), W_hid_to_beta=lasagne.init.GlorotUniform(), b_hid_to_beta=lasagne.init.Constant(0.), nonlinearity_beta=lasagne.nonlinearities.rectify, W_hid_to_gate=lasagne.init.GlorotUniform(), b_hid_to_gate=lasagne.init.Constant(0.), nonlinearity_gate=nonlinearities.hard_sigmoid, W_hid_to_shift=lasagne.init.GlorotUniform(), b_hid_to_shift=lasagne.init.Constant(0.), nonlinearity_shift=lasagne.nonlinearities.softmax, W_hid_to_gamma=lasagne.init.GlorotUniform(), b_hid_to_gamma=lasagne.init.Constant(0.), nonlinearity_gamma=lambda x: 1. + lasagne.nonlinearities. rectify(x), W_hid_to_erase=lasagne.init.GlorotUniform(), b_hid_to_erase=lasagne.init.Constant(0.), nonlinearity_erase=nonlinearities.hard_sigmoid, W_hid_to_add=lasagne.init.GlorotUniform(), b_hid_to_add=lasagne.init.Constant(0.), nonlinearity_add=nonlinearities.ClippedLinear(low=0., high=1.), W_hid_to_sign_add=None, b_hid_to_sign_add=lasagne.init.Constant(0.), nonlinearity_sign_add=nonlinearities.ClippedLinear(low=-1., high=1.), weights_init=init.OneHot(), learn_init=False, **kwargs): super(WriteHead, self).__init__(controller, num_shifts=num_shifts, memory_shape=memory_shape, W_hid_to_sign=W_hid_to_sign, b_hid_to_sign=b_hid_to_sign, nonlinearity_sign=nonlinearity_sign, W_hid_to_key=W_hid_to_key, b_hid_to_key=b_hid_to_key, nonlinearity_key=nonlinearity_key, W_hid_to_beta=W_hid_to_beta, b_hid_to_beta=b_hid_to_beta, nonlinearity_beta=nonlinearity_beta, W_hid_to_gate=W_hid_to_gate, b_hid_to_gate=b_hid_to_gate, nonlinearity_gate=nonlinearity_gate, W_hid_to_shift=W_hid_to_shift, b_hid_to_shift=b_hid_to_shift, nonlinearity_shift=nonlinearity_shift, W_hid_to_gamma=W_hid_to_gamma, b_hid_to_gamma=b_hid_to_gamma, nonlinearity_gamma=nonlinearity_gamma, weights_init=weights_init, learn_init=learn_init, **kwargs) self.erase = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_erase, b=b_hid_to_erase, nonlinearity=nonlinearity_erase, name=self.basename + '.erase') self.W_hid_to_erase, self.b_hid_to_erase = self.erase.W, self.erase.b self.add = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_add, b=b_hid_to_add, nonlinearity=nonlinearity_add, name=self.basename + '.add') self.W_hid_to_add, self.b_hid_to_add = self.add.W, self.add.b if W_hid_to_sign_add is not None: self.sign_add = DenseLayer(controller, num_units=self.memory_shape[1], W=W_hid_to_sign_add, b=b_hid_to_sign_add, nonlinearity=nonlinearity_sign_add, name=self.basename + '.sign_add') self.W_hid_to_sign_add, self.b_hid_to_sign_add = self.sign_add.W, self.sign_add.b else: self.sign_add = None self.W_hid_to_sign_add, self.b_hid_to_sign_add = None, None def get_params(self, **tags): params = super(WriteHead, self).get_params(**tags) params += self.erase.get_params(**tags) params += self.add.get_params(**tags) if self.sign_add is not None: params += self.sign_add.get_params(**tags) return params
num_classes = nClasses + 1 soft = lasagne.nonlinearities.softmax tanh = lasagne.nonlinearities.tanh identity = lasagne.nonlinearities.identity l_in = InputLayer(shape=(num_batch, input_seq_len, num_feat)) batchsize, seqlen, _ = l_in.input_var.shape l_noise = GaussianNoiseLayer(l_in, sigma=0.6) # l_mask = InputLayer(shape=(batchsize, seqlen)) # l_rnn_1 = LSTMLayer(l_noise, num_units=L1_UNITS, mask_input=l_mask) l_rnn_1 = LSTMLayer(l_noise, num_units=L1_UNITS) l_rnn_2 = LSTMLayer(l_rnn_1, num_units=L2_UNITS) l_shp = ReshapeLayer(l_rnn_2, (-1, L2_UNITS)) l_out = DenseLayer(l_shp, num_units=num_classes, nonlinearity=identity) l_out_shp = ReshapeLayer(l_out, (batchsize, seqlen, num_classes)) l_out_softmax = NonlinearityLayer(l_out, nonlinearity=soft) l_out_softmax_shp = ReshapeLayer(l_out_softmax, (batchsize, seqlen, num_classes)) output_lin_ctc = L.get_output(l_out_shp) network_output = L.get_output(l_out_softmax_shp) all_params = L.get_all_params(l_rnn_2, trainable=True) # ## Costs, Gradients & Training Functions # Cost functions target_values = T.imatrix('target_output') input_values = T.imatrix()
def build_network_resnet50(input, nbClasses): net = {} net['input'] = InputLayer(shape=(None, 1, 120, 120), input_var=input) sub_net, parent_layer_name = build_simple_block( net['input'], ['conv1', 'bn_conv1', 'conv1_relu'], 64, 7, 3, 2, use_bias=True) net.update(sub_net) net['pool1'] = PoolLayer(net[parent_layer_name], pool_size=3, stride=2, pad=0, mode='max', ignore_border=False) block_size = list('abc') parent_layer_name = 'pool1' for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1, 1, True, 4, ix='2%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='2%s' % c) net.update(sub_net) block_size = list('abcd') for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 2, 1.0 / 2, True, 4, ix='3%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='3%s' % c) net.update(sub_net) block_size = list('abcdef') for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 2, 1.0 / 2, True, 4, ix='4%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='4%s' % c) net.update(sub_net) block_size = list('abc') for c in block_size: if c == 'a': sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 2, 1.0 / 2, True, 4, ix='5%s' % c) else: sub_net, parent_layer_name = build_residual_block( net[parent_layer_name], 1.0 / 4, 1, False, 4, ix='5%s' % c) net.update(sub_net) net['pool5'] = PoolLayer(net[parent_layer_name], pool_size=7, stride=1, pad=0, mode='average_exc_pad', ignore_border=False) net['fc1000'] = DenseLayer( net['pool5'], num_units=nbClasses, nonlinearity=None) # number output units = nbClasses (global variable) net['prob'] = NonlinearityLayer(net['fc1000'], nonlinearity=softmax) return net, net['prob']
def test_space_invaders( game_title='SpaceInvaders-v0', n_parallel_games=3, replay_seq_len=2, ): """ :param game_title: name of atari game in Gym :param n_parallel_games: how many games we run in parallel :param replay_seq_len: how long is one replay session from a batch """ atari = gym.make(game_title) atari.reset() # Game Parameters n_actions = atari.action_space.n observation_shape = (None, ) + atari.observation_space.shape action_names = atari.get_action_meanings() del atari # ##### Agent observations # image observation at current tick goes here observation_layer = InputLayer(observation_shape, name="images input") # reshape to [batch, color, x, y] to allow for convolutional layers to work correctly observation_reshape = DimshuffleLayer(observation_layer, (0, 3, 1, 2)) # Agent memory states window_size = 3 # prev state input prev_window = InputLayer( (None, window_size) + tuple(observation_reshape.output_shape[1:]), name="previous window state") # our window window = WindowAugmentation(observation_reshape, prev_window, name="new window state") memory_dict = {window: prev_window} # ##### Neural network body # you may use any other lasagne layers, including convolutions, batch_norms, maxout, etc # pixel-wise maximum over the temporal window (to avoid flickering) window_max = ExpressionLayer(window, lambda a: a.max(axis=1), output_shape=(None, ) + window.output_shape[2:]) # a simple lasagne network (try replacing with any other lasagne network and see what works best) nn = DenseLayer(window_max, num_units=50, name='dense0') # Agent policy and action picking q_eval = DenseLayer(nn, num_units=n_actions, nonlinearity=lasagne.nonlinearities.linear, name="QEvaluator") #fakes for a2c policy_eval = DenseLayer(nn, num_units=n_actions, nonlinearity=lasagne.nonlinearities.softmax, name="a2c action probas") state_value_eval = DenseLayer(nn, num_units=1, nonlinearity=None, name="a2c state values") # resolver resolver = ProbabilisticResolver(policy_eval, name="resolver") # agent agent = Agent(observation_layer, memory_dict, (q_eval, policy_eval, state_value_eval), resolver) # Since it's a single lasagne network, one can get it's weights, output, etc weights = lasagne.layers.get_all_params(resolver, trainable=True) # Agent step function # # Create and manage a pool of atari sessions to play with pool = EnvPool(agent, game_title, n_parallel_games) observation_log, action_log, reward_log, _, _, _ = pool.interact(50) print(np.array(action_names)[np.array(action_log)[:3, :5]]) # # experience replay pool # Create an environment with all default parameters env = SessionPoolEnvironment(observations=observation_layer, actions=resolver, agent_memories=agent.agent_states) def update_pool(env, pool, n_steps=100): """ a function that creates new sessions and ads them into the pool throwing the old ones away entirely for simplicity""" preceding_memory_states = list(pool.prev_memory_states) # get interaction sessions observation_tensor, action_tensor, reward_tensor, _, is_alive_tensor, _ = pool.interact( n_steps=n_steps) # load them into experience replay environment env.load_sessions(observation_tensor, action_tensor, reward_tensor, is_alive_tensor, preceding_memory_states) # load first sessions update_pool(env, pool, replay_seq_len) # A more sophisticated way of training is to store a large pool of sessions and train on random batches of them. 
    # ### Training via experience replay

    # get agent's Q-values, policy, etc. obtained via experience replay
    _env_states, _observations, _memories, _imagined_actions, estimators = agent.get_sessions(
        env,
        session_length=replay_seq_len,
        batch_size=env.batch_size,
        optimize_experience_replay=True,
    )
    (q_values_sequence, policy_sequence, value_sequence) = estimators

    # Evaluating the loss function
    scaled_reward_seq = env.rewards
    # For SpaceInvaders, leaving the rewards unscaled works at least reasonably well

    elwise_mse_loss = 0.

    # 1-step algorithms
    for algo in qlearning, sarsa:
        elwise_mse_loss += algo.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
        )

    # n-step Q-learning
    for n in (1, 3, replay_seq_len - 1, replay_seq_len, replay_seq_len + 1, None):
        elwise_mse_loss += qlearning_n_step.get_elementwise_objective(
            q_values_sequence,
            env.actions[0],
            scaled_reward_seq,
            env.is_alive,
            gamma_or_gammas=0.99,
            n_steps=n)

    # n-step advantage actor-critic
    elwise_mse_loss += a2c_n_step.get_elementwise_objective(
        policy_sequence,
        value_sequence[:, :, 0],
        env.actions[0],
        scaled_reward_seq,
        env.is_alive,
        gamma_or_gammas=0.99,
        n_steps=3)

    # compute mean over "alive" fragments
    mse_loss = elwise_mse_loss.sum() / env.is_alive.sum()

    # regularize network weights
    reg_l2 = regularize_network_params(resolver, l2) * 10**-4

    loss = mse_loss + reg_l2

    # Compute weight updates
    updates = lasagne.updates.adadelta(loss, weights, learning_rate=0.01)

    # mean session reward
    mean_session_reward = env.rewards.sum(axis=1).mean()

    # # Compile train and evaluation functions
    print('compiling')
    train_fun = theano.function([], [loss, mean_session_reward], updates=updates)
    evaluation_fun = theano.function(
        [], [loss, mse_loss, reg_l2, mean_session_reward])
    print("I've compiled!")

    # # Training loop
    for epoch_counter in range(10):
        update_pool(env, pool, replay_seq_len)
        loss, avg_reward = train_fun()
        full_loss, q_loss, l2_penalty, avg_reward_current = evaluation_fun()

        print("epoch %i, loss %.5f, rewards: %.5f" %
              (epoch_counter, full_loss, avg_reward_current))
        print("rec %.3f reg %.3f" % (q_loss, l2_penalty))
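# --- Sketch: a larger experience-replay pool with random batches (assumption) ---
# The comment at the end of test_space_invaders mentions storing a large pool of
# sessions and training on random batches of them. This is a minimal sketch of that
# idea; it relies only on pool.interact(...) and env.load_sessions(...) as used above,
# and assumes the logged tensors are numpy arrays whose first axis indexes the
# parallel games. The buffer itself (a plain python list passed by the caller) and the
# batch_sessions value are illustrative, not part of the original code.
import numpy as np


def update_pool_with_replay(env, pool, session_buffer, n_steps=100,
                            max_sessions=1000, batch_sessions=32):
    """Record new sessions, keep at most `max_sessions` of them,
    then load a random batch into the experience-replay environment."""
    prev_memory = list(pool.prev_memory_states)
    obs, act, rew, _, alive, _ = pool.interact(n_steps=n_steps)
    obs, act, rew, alive = map(np.asarray, (obs, act, rew, alive))

    # store one record per parallel game so sessions can be re-sampled later
    for i in range(obs.shape[0]):
        session_buffer.append((obs[i], act[i], rew[i], alive[i],
                               [m[i] for m in prev_memory]))
    del session_buffer[:-max_sessions]

    # sample a random batch of recorded sessions and stack them back together
    idx = np.random.randint(0, len(session_buffer), size=batch_sessions)
    batch = [session_buffer[i] for i in idx]
    obs_b, act_b, rew_b, alive_b = (np.stack([b[k] for b in batch])
                                    for k in range(4))
    memories_b = [np.stack([b[4][j] for b in batch])
                  for j in range(len(batch[0][4]))]

    env.load_sessions(obs_b, act_b, rew_b, alive_b, memories_b)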
def __init__(self, number_words, num_hidden, seq_length, mb_size):
    self.mb_size = mb_size

    x = T.imatrix()  # minibatch x sequence position, word indices
    one_hot_input = T.ftensor3()  # sequence position x minibatch x vocabulary (one-hot)
    use_one_hot_input_flag = T.scalar()

    self.indices = x
    self.use_one_hot_input_flag = use_one_hot_input_flag
    self.one_hot_input = one_hot_input

    '''
    Flag for the input format: one-hot or index.
    If index, compute the one-hot encoding and use that.
    If one-hot, just use the one-hot input directly.
    '''

    target = T.ivector()

    # word_embeddings = theano.shared(np.random.normal(size=(number_words, 1, num_hidden)).astype('float32'))
    word_embeddings = theano.shared(
        np.random.normal(size=(number_words, num_hidden)).astype('float32'))

    feature_lst = []
    for i in range(0, seq_length):
        # Instead of indexing word_embeddings[x[:, i]] directly, multiply by a one-hot matrix:
        #   word_embeddings: number_words x num_hidden
        #   one_hot:         mb_size x number_words
        #   dot product:     mb_size x num_hidden, reshaped to mb_size x 1 x num_hidden
        one_hot = T.extra_ops.to_one_hot(x[:, i], number_words)
        one_hot_use = ifelse(use_one_hot_input_flag, one_hot_input[i], one_hot)
        feature = T.reshape(T.dot(one_hot_use, word_embeddings),
                            (1, mb_size, num_hidden)).transpose(1, 0, 2)
        feature_lst.append(feature)

    # examples x sequence position x features
    features = T.concatenate(feature_lst, 1)

    l_lstm_1 = LSTMLayer((seq_length, mb_size, num_hidden), num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0)
    l_lstm_2 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0,
                         backwards=True)
    l_lstm_3 = LSTMLayer((seq_length, mb_size, num_hidden * 2), num_units=num_hidden,
                         nonlinearity=lasagne.nonlinearities.tanh, grad_clipping=100.0)

    lstm_1_out = l_lstm_1.get_output_for([features])
    lstm_2_out = l_lstm_2.get_output_for([T.concatenate([lstm_1_out, features], axis=2)])
    lstm_3_out = l_lstm_3.get_output_for([T.concatenate([lstm_2_out, features], axis=2)])

    # average over the sequence dimension
    final_out = T.mean(lstm_3_out, axis=1)
    # final_out = T.mean(features, axis=1)

    h_out_1 = DenseLayer((mb_size, num_hidden), num_units=2048,
                         nonlinearity=lasagne.nonlinearities.rectify)
    h_out_2 = DenseLayer((mb_size, 2048), num_units=2048,
                         nonlinearity=lasagne.nonlinearities.rectify)
    h_out_3 = DenseLayer((mb_size, 2048), num_units=1, nonlinearity=None)

    h_out_1_value = h_out_1.get_output_for(final_out)
    h_out_2_value = h_out_2.get_output_for(h_out_1_value)
    h_out_3_value = h_out_3.get_output_for(h_out_2_value)

    classification = T.nnet.sigmoid(h_out_3_value)

    self.loss = T.mean(T.nnet.binary_crossentropy(output=classification.flatten(),
                                                  target=target))

    self.params = (lasagne.layers.get_all_params(h_out_1, trainable=True) +
                   lasagne.layers.get_all_params(h_out_3, trainable=True) +
                   [word_embeddings] +
                   lasagne.layers.get_all_params(l_lstm_1, trainable=True) +
                   lasagne.layers.get_all_params(l_lstm_2, trainable=True))
    self.params += lasagne.layers.get_all_params(h_out_2, trainable=True)
    self.params += lasagne.layers.get_all_params(l_lstm_3, trainable=True)

    # replace NaN gradients with zeros, then clip the global gradient norm
    all_grads = T.grad(self.loss, self.params)
    for j in range(0, len(all_grads)):
        all_grads[j] = T.switch(T.isnan(all_grads[j]),
                                T.zeros_like(all_grads[j]),
                                all_grads[j])
    scaled_grads = lasagne.updates.total_norm_constraint(all_grads, 5.0)
    updates = lasagne.updates.adam(scaled_grads, self.params)

    self.train_func = theano.function(
        inputs=[x, target, use_one_hot_input_flag, one_hot_input],
        outputs={'l': self.loss,
                 'c': classification,
                 'g_w': T.sum(T.sqr(T.grad(self.loss, word_embeddings)))},
        updates=updates)

    self.evaluate_func = theano.function(
        inputs=[x, use_one_hot_input_flag, one_hot_input],
        outputs={'c': classification})
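# --- Usage sketch (assumption, not part of the original code) ---
# Illustrates the index-based input mode of the classifier above
# (use_one_hot_input_flag = 0, with a dummy one-hot tensor of the right shape).
# The class name `Classifier` and all constant values are hypothetical; the
# snippet above only shows the __init__ of the class.
import numpy as np

number_words, num_hidden, seq_length, mb_size = 1000, 128, 20, 16
model = Classifier(number_words, num_hidden, seq_length, mb_size)

x = np.random.randint(0, number_words, size=(mb_size, seq_length)).astype('int32')
target = np.random.randint(0, 2, size=(mb_size,)).astype('int32')
dummy_one_hot = np.zeros((seq_length, mb_size, number_words), dtype='float32')

out = model.train_func(x, target, 0.0, dummy_one_hot)
print(out['l'], out['c'].shape)  # scalar loss and (mb_size, 1) sigmoid outputs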