def create_base_model(self, x, y, input_dim, interim_dim=30):
    # Create the output of the MLP
    mlp = MLP([Tanh(), Tanh(), Tanh()],
              [input_dim, 60, 60, interim_dim],
              weights_init=IsotropicGaussian(0.001),
              biases_init=Constant(0))
    mlp.initialize()
    inter = mlp.apply(x)

    fine_tuner = MLP([Logistic()],
                     [interim_dim, 1],
                     weights_init=IsotropicGaussian(0.001),
                     biases_init=Constant(0))
    fine_tuner.initialize()
    probs = fine_tuner.apply(inter)

    # sq_err = BinaryCrossEntropy()
    err = T.sqr(y.flatten() - probs.flatten())
    # cost = T.mean(err * y.flatten() * (1 - self.p) + err *
    #               (1 - y.flatten()) * self.p)
    cost = T.mean(err)
    # cost = sq_err.apply(probs.flatten(), y.flatten())
    # cost = T.mean(y.flatten() * T.log(probs.flatten()) +
    #               (1 - y.flatten()) * T.log(1 - probs.flatten()))
    cost.name = 'cost'

    pred_out = probs > 0.5
    mis_cost = T.sum(T.neq(y.flatten(), pred_out.flatten()))
    mis_cost.name = 'MisclassificationRate'

    return mlp, fine_tuner, cost, mis_cost
def create_vae(x=None, batch=batch_size):
    x = T.matrix('features') if x is None else x
    x = x / 255.

    encoder = MLP(
        activations=[Rectifier(), Logistic()],
        dims=[img_dim**2, hidden_dim, 2*latent_dim],
        weights_init=IsotropicGaussian(std=0.01, mean=0),
        biases_init=Constant(0.01),
        name='encoder'
    )
    encoder.initialize()
    z_param = encoder.apply(x)
    z_mean, z_log_std = z_param[:, latent_dim:], z_param[:, :latent_dim]
    # pass the `batch` argument through instead of ignoring it
    z = Sampling(theano_seed=seed).apply([z_mean, z_log_std], batch=batch)

    decoder = MLP(
        activations=[Rectifier(), Logistic()],
        dims=[latent_dim, hidden_dim, img_dim**2],
        weights_init=IsotropicGaussian(std=0.01, mean=0),
        biases_init=Constant(0.01),
        name='decoder'
    )
    decoder.initialize()
    x_reconstruct = decoder.apply(z)

    cost = VAEloss().apply(x, x_reconstruct, z_mean, z_log_std)
    cost.name = 'vae_cost'
    return cost
def build_mlp(features_cat, features_int, labels):
    mlp_int = MLP(activations=[Rectifier(), Rectifier()],
                  dims=[19, 50, 50],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(0),
                  name='mlp_interval')
    mlp_int.initialize()
    mlp_cat = MLP(activations=[Logistic()],
                  dims=[320, 50],
                  weights_init=IsotropicGaussian(),
                  biases_init=Constant(0),
                  name='mlp_categorical')
    mlp_cat.initialize()
    mlp = MLP(activations=[Rectifier(), None],
              dims=[50, 50, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(0))
    mlp.initialize()

    gated = mlp_cat.apply(features_cat) * mlp_int.apply(features_int)
    prediction = mlp.apply(gated)
    cost = MAPECost().apply(prediction, labels)

    cg = ComputationGraph(cost)
    print cg.variables
    cg_dropout1 = apply_dropout(
        cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[1],
             VariableFilter(roles=[OUTPUT])(cg.variables)[3]], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost
class AttentionReader(Initializable): def __init__(self, x_dim, dec_dim, channels, height, width, N, **kwargs): super(AttentionReader, self).__init__(name="reader", **kwargs) self.img_height = height self.img_width = width self.N = N self.x_dim = x_dim self.dec_dim = dec_dim self.output_dim = 2*channels*N*N self.zoomer = ZoomableAttentionWindow(channels, height, width, N) self.readout = MLP(activations=[Identity()], dims=[dec_dim, 5], **kwargs) self.children = [self.readout] def get_dim(self, name): if name == 'input': return self.dec_dim elif name == 'x_dim': return self.x_dim elif name == 'output': return self.output_dim else: raise ValueError @application(inputs=['x', 'x_hat', 'h_dec'], outputs=['r']) def apply(self, x, x_hat, h_dec): l = self.readout.apply(h_dec) center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l) w = gamma * self.zoomer.read(x , center_y, center_x, delta, sigma) w_hat = gamma * self.zoomer.read(x_hat, center_y, center_x, delta, sigma) return T.concatenate([w, w_hat], axis=1) @application(inputs=['x', 'x_hat', 'h_dec'], outputs=['r','center_y', 'center_x', 'delta']) def apply_detailed(self, x, x_hat, h_dec): l = self.readout.apply(h_dec) center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l) w = gamma * self.zoomer.read(x , center_y, center_x, delta, sigma) w_hat = gamma * self.zoomer.read(x_hat, center_y, center_x, delta, sigma) r = T.concatenate([w, w_hat], axis=1) return r, center_y, center_x, delta @application(inputs=['x', 'h_dec'], outputs=['r','center_y', 'center_x', 'delta']) def apply_simple(self, x, h_dec): l = self.readout.apply(h_dec) center_y, center_x, delta, sigma, gamma = self.zoomer.nn2att(l) r = gamma * self.zoomer.read(x , center_y, center_x, delta, sigma) return r, center_y, center_x, delta
def setupNN(NNParam): NNWidth = NNParam['NNWidth'] WeightStdDev = NNParam['WeightStdDev'] L2Weight = NNParam['L2Weight'] DropOutProb = NNParam['DropOutProb'] InitialLearningRate = NNParam['InitialLearningRate'] x = theano.tensor.concatenate([x0, x1, x2, x3], axis=1) mlp = MLP(activations=[Rectifier(), Rectifier(), Rectifier(), Rectifier(), Rectifier()], dims=[69*4, NNWidth, NNWidth, NNWidth, NNWidth, 100], weights_init=IsotropicGaussian(WeightStdDev), biases_init=Constant(0)) x_forward = mlp.apply(x) mlp_sm = MLP(activations=[None], dims=[100, 39], weights_init=IsotropicGaussian(WeightStdDev), biases_init=Constant(0)) y_hat_b = Softmax().apply(mlp_sm.apply(x_forward)) mlp.initialize() mlp_sm.initialize() cg = blocks.graph.ComputationGraph(y_hat_b) parameters = list() for p in cg.parameters: parameters.append(p) weights = VariableFilter(roles=[blocks.roles.WEIGHT])(cg.variables) cg_dropout = blocks.graph.apply_dropout(cg,[weights[3]] , DropOutProb) y_hat_b_do = cg_dropout.outputs[0] pred_b = theano.tensor.argmax(cg.outputs[0],axis=1) err_b = theano.tensor.mean(theano.tensor.eq(pred_b,y_b)) cW = 0 for W in weights: cW += (W**2).sum() cost = theano.tensor.mean(theano.tensor.nnet.categorical_crossentropy(y_hat_b_do, y_b)) + cW*L2Weight Learning_Rate_Decay = numpy.float32(0.98) learning_rate_theano = theano.shared(numpy.float32(InitialLearningRate), name='learning_rate') learning_rate_update = theano.function(inputs=[],outputs=learning_rate_theano,updates=[(learning_rate_theano,learning_rate_theano*Learning_Rate_Decay)]) update_proc = momentum_sgd(cost,parameters,0.8, learning_rate_theano) #train training_proc = theano.function( inputs=[shuffIdx], outputs=cost, updates=update_proc, givens={x0:tX[theano.tensor.flatten(shuffIdx[:,0])], x1:tX[theano.tensor.flatten(shuffIdx[:,1])], x2:tX[theano.tensor.flatten(shuffIdx[:,2])], x3:tX[theano.tensor.flatten(shuffIdx[:,3])], y_b:tYb[theano.tensor.flatten(shuffIdx[:,1])]}) #test test_on_testing_proc = theano.function( inputs=[shuffIdx], outputs=[err_b], givens={x0:vX[shuffIdx[:,0]],x1:vX[shuffIdx[:,1]],x2:vX[shuffIdx[:,2]],x3:vX[shuffIdx[:,3]],y_b:vYb[shuffIdx[:,1]]}) test_on_training_proc = theano.function( inputs=[shuffIdx], outputs=[err_b], givens={x0:tX[shuffIdx[:,0]],x1:tX[shuffIdx[:,1]],x2:tX[shuffIdx[:,2]],x3:tX[shuffIdx[:,3]],y_b:tYb[shuffIdx[:,1]]}) forward_proc = theano.function(inputs=[x0,x1,x2,x3],outputs=[x_forward]) return (learning_rate_update, training_proc, test_on_testing_proc,test_on_training_proc,forward_proc)
def test_model(): x = tensor.matrix('x') mlp1 = MLP([Tanh(), Tanh()], [10, 20, 30], name="mlp1") mlp2 = MLP([Tanh()], [30, 40], name="mlp2") h1 = mlp1.apply(x) h2 = mlp2.apply(h1) model = Model(h2) assert model.get_top_bricks() == [mlp1, mlp2] # The order of parameters returned is deterministic but # not sensible. assert list(model.get_parameter_dict().items()) == [ ('/mlp2/linear_0.b', mlp2.linear_transformations[0].b), ('/mlp1/linear_1.b', mlp1.linear_transformations[1].b), ('/mlp1/linear_0.b', mlp1.linear_transformations[0].b), ('/mlp1/linear_0.W', mlp1.linear_transformations[0].W), ('/mlp1/linear_1.W', mlp1.linear_transformations[1].W), ('/mlp2/linear_0.W', mlp2.linear_transformations[0].W) ] # Test getting and setting parameter values mlp3 = MLP([Tanh()], [10, 10]) mlp3.allocate() model3 = Model(mlp3.apply(x)) parameter_values = { '/mlp/linear_0.W': 2 * numpy.ones( (10, 10), dtype=theano.config.floatX), '/mlp/linear_0.b': 3 * numpy.ones(10, dtype=theano.config.floatX) } model3.set_parameter_values(parameter_values) assert numpy.all( mlp3.linear_transformations[0].parameters[0].get_value() == 2) assert numpy.all( mlp3.linear_transformations[0].parameters[1].get_value() == 3) got_parameter_values = model3.get_parameter_values() assert len(got_parameter_values) == len(parameter_values) for name, value in parameter_values.items(): assert_allclose(value, got_parameter_values[name]) # Test exception is raised if parameter shapes don't match def helper(): parameter_values = { '/mlp/linear_0.W': 2 * numpy.ones( (11, 11), dtype=theano.config.floatX), '/mlp/linear_0.b': 3 * numpy.ones(11, dtype=theano.config.floatX) } model3.set_parameter_values(parameter_values) assert_raises(ValueError, helper) # Test name conflict handling mlp4 = MLP([Tanh()], [10, 10]) def helper(): Model(mlp4.apply(mlp3.apply(x))) assert_raises(ValueError, helper)
class MLP_conv_dense(Initializable): def __init__(self, n_layers_conv, n_layers_dense_lower, n_layers_dense_upper, n_hidden_conv, n_hidden_dense_lower, n_hidden_dense_lower_output, n_hidden_dense_upper, spatial_width, n_colors, n_temporal_basis): """ The multilayer perceptron, that provides temporal weighting coefficients for mu and sigma images. This consists of a lower segment with a convolutional MLP, and optionally with a dense MLP in parallel. The upper segment then consists of a per-pixel dense MLP (convolutional MLP with 1x1 kernel). """ super(MLP_conv_dense, self).__init__() self.n_colors = n_colors self.spatial_width = spatial_width self.n_hidden_dense_lower = n_hidden_dense_lower self.n_hidden_dense_lower_output = n_hidden_dense_lower_output self.n_hidden_conv = n_hidden_conv ## the lower layers self.mlp_conv = MultiLayerConvolution(n_layers_conv, n_hidden_conv, spatial_width, n_colors) self.children = [self.mlp_conv] if n_hidden_dense_lower > 0 and n_layers_dense_lower > 0: n_input = n_colors*spatial_width**2 n_output = n_hidden_dense_lower_output*spatial_width**2 self.mlp_dense_lower = MLP([dense_nonlinearity] * n_layers_conv, [n_input] + [n_hidden_dense_lower] * (n_layers_conv-1) + [n_output], name='MLP dense lower', weights_init=Orthogonal(), biases_init=Constant(0)) self.children.append(self.mlp_dense_lower) else: n_hidden_dense_lower_output = 0 ## the upper layers (applied to each pixel independently) n_output = n_colors*n_temporal_basis*2 # "*2" for both mu and sigma self.mlp_dense_upper = MLP([dense_nonlinearity] * (n_layers_dense_upper-1) + [Identity()], [n_hidden_conv+n_hidden_dense_lower_output] + [n_hidden_dense_upper] * (n_layers_dense_upper-1) + [n_output], name='MLP dense upper', weights_init=Orthogonal(), biases_init=Constant(0)) self.children.append(self.mlp_dense_upper) @application def apply(self, X): """ Take in noisy input image and output temporal coefficients for mu and sigma. """ Y = self.mlp_conv.apply(X) Y = Y.dimshuffle(0,2,3,1) if self.n_hidden_dense_lower > 0: n_images = X.shape[0] X = X.reshape((n_images, self.n_colors*self.spatial_width**2)) Y_dense = self.mlp_dense_lower.apply(X) Y_dense = Y_dense.reshape((n_images, self.spatial_width, self.spatial_width, self.n_hidden_dense_lower_output)) Y = T.concatenate([Y/T.sqrt(self.n_hidden_conv), Y_dense/T.sqrt(self.n_hidden_dense_lower_output)], axis=3) Z = self.mlp_dense_upper.apply(Y) return Z
class GRUInitialStateWithInitialStateSumContext(GatedRecurrent):
    """Gated Recurrent with special initial state.

    Initial state of Gated Recurrent is set by an MLP that conditions on the
    last hidden state of the bidirectional encoder, applies an affine
    transformation followed by a tanh non-linearity to set initial state.

    """
    def __init__(self, attended_dim, context_dim, **kwargs):
        super(GRUInitialStateWithInitialStateSumContext,
              self).__init__(**kwargs)
        self.attended_dim = attended_dim
        self.context_dim = context_dim

        # two MLPs which map to the same dimension, then we sum
        # the motivation here is to allow the network to pretrain on the
        # normal MT task, then keep some params static, and continue training
        # with the context-enhanced task

        # the state transformer
        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')

        # the context transformer
        self.context_transformer = MLP(activations=[Tanh(), Tanh(), Tanh()],
                                       dims=[context_dim, 2000, 1000, self.dim],
                                       name='context_initializer')

        self.children.extend([self.initial_transformer,
                              self.context_transformer])

    # THINKING: how to best combine the image info with the source info?
    @application
    def initial_states(self, batch_size, *args, **kwargs):
        attended = kwargs['attended']
        context = kwargs['initial_state_context']
        attended_reverse_final_state = attended[0, :, -self.attended_dim:]
        initial_state_representation = self.initial_transformer.apply(
            attended_reverse_final_state)
        initial_context_representation = self.context_transformer.apply(
            context)
        initial_state = (initial_state_representation +
                         initial_context_representation)
        return initial_state

    def _allocate(self):
        self.parameters.append(
            shared_floatx_nans((self.dim, self.dim), name='state_to_state'))
        self.parameters.append(
            shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        for i in range(2):
            if self.parameters[i]:
                add_role(self.parameters[i], WEIGHT)
def build_mlp(features_car_cat, features_car_int, features_nocar_cat, features_nocar_int, features_cp, features_hascar, means, labels): mlp_car = MLP(activations=[Rectifier(), Rectifier(), None], dims=[8 + 185, 200, 200, 1], weights_init=IsotropicGaussian(.1), biases_init=Constant(0), name='mlp_interval_car') mlp_car.initialize() mlp_nocar = MLP(activations=[Rectifier(), Rectifier(), None], dims=[5 + 135, 200, 200, 1], weights_init=IsotropicGaussian(.1), biases_init=Constant(0), name='mlp_interval_nocar') mlp_nocar.initialize() feature_car = tensor.concatenate((features_car_cat, features_car_int), axis=1) feature_nocar = tensor.concatenate( (features_nocar_cat, features_nocar_int), axis=1) prediction = mlp_nocar.apply(feature_nocar) # gating with the last feature : does the dude own a car prediction += tensor.addbroadcast(features_hascar, 1) * mlp_car.apply(feature_car) prediction_loc, _, _, _, = \ build_mlp_onlyloc(features_car_cat, features_car_int, features_nocar_cat, features_nocar_int, features_cp, features_hascar, means, labels) prediction += prediction_loc # add crm mlp_crm = MLP(activations=[None], dims=[1, 1], weights_init=IsotropicGaussian(.1), biases_init=Constant(0), name='mlp_crm') mlp_crm.initialize() crm = features_nocar_int[:, 0][:, None] prediction = prediction * mlp_crm.apply(crm) cost = MAPECost().apply(labels, prediction) cg = ComputationGraph(cost) input_var = VariableFilter(roles=[INPUT])(cg.variables) print input_var cg_dropout1 = apply_dropout(cg, [input_var[6], input_var[7]], .4) cost_dropout1 = cg_dropout1.outputs[0] return prediction, cost_dropout1, cg_dropout1.parameters, cost
class DGSRNN(BaseRecurrent, Initializable):
    def __init__(self, input_dim, state_dim, act, transition_h,
                 tr_h_activations, **kwargs):
        super(DGSRNN, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.state_dim = state_dim

        logistic = Logistic()
        self.inter = MLP(dims=[input_dim + state_dim] + transition_h,
                         activations=tr_h_activations,
                         name='inter')
        self.reset = MLP(dims=[transition_h[-1], state_dim],
                         activations=[logistic],
                         name='reset')
        self.update = MLP(dims=[transition_h[-1], state_dim],
                          activations=[act],
                          name='update')

        self.children = [self.inter, self.reset, self.update,
                         logistic, act] + tr_h_activations

        # init state
        self.params = [shared_floatx_zeros((state_dim,), name='init_state')]
        add_role(self.params[0], INITIAL_STATE)

    def get_dim(self, name):
        if name == 'state':
            return self.state_dim
        # was super(GFGRU, ...), which references the wrong class
        return super(DGSRNN, self).get_dim(name)

    @recurrent(sequences=['inputs', 'drop_updates_mask'], states=['state'],
               outputs=['state', 'reset'], contexts=[])
    def apply(self, inputs=None, drop_updates_mask=None, state=None):
        inter_v = self.inter.apply(
            tensor.concatenate([inputs, state], axis=1))
        reset_v = self.reset.apply(inter_v)
        update_v = self.update.apply(inter_v)
        reset_v = reset_v * drop_updates_mask
        new_state = state * (1 - reset_v) + reset_v * update_v
        return new_state, reset_v

    @application
    def initial_state(self, state_name, batch_size, *args, **kwargs):
        return tensor.repeat(self.params[0][None, :],
                             repeats=batch_size,
                             axis=0)
class SingleSoftmax(Initializable):
    def __init__(self, hidden_dim, n_classes, **kwargs):
        super(SingleSoftmax, self).__init__(**kwargs)

        self.hidden_dim = hidden_dim
        self.n_classes = n_classes

        self.mlp = MLP(activations=[Rectifier(), Softmax()],
                       dims=[hidden_dim, hidden_dim/2, self.n_classes],
                       weights_init=Orthogonal(),
                       biases_init=Constant(0))
        self.softmax = Softmax()

        self.children = [self.mlp, self.softmax]

    # some day: @application(...) def feedback(self, h)

    @application(inputs=['cs', 'y'], outputs=['cost'])
    def cost(self, cs, y, n_patches):
        energies = [self.mlp.apply(cs[:, t, :])
                    for t in xrange(n_patches)]
        cross_entropies = [self.softmax.categorical_cross_entropy(y.flatten(),
                                                                  energy)
                           for energy in energies]
        error_rates = [T.neq(y, energy.argmax(axis=1)).mean(axis=0)
                       for energy in energies]
        # train on final prediction
        cost = util.named(cross_entropies[-1], "cost")
        # monitor final prediction
        self.add_auxiliary_variable(cross_entropies[-1], name="cross_entropy")
        self.add_auxiliary_variable(error_rates[-1], name="error_rate")
        return cost
def test_pylearn2_training():
    # Construct the model
    mlp = MLP(activations=[Sigmoid(), Sigmoid()], dims=[784, 100, 784],
              weights_init=IsotropicGaussian(), biases_init=Constant(0.01))
    mlp.initialize()
    cost = SquaredError()

    # Load the data
    rng = numpy.random.RandomState(14)
    train_dataset = random_dense_design_matrix(rng, 1024, 784, 10)
    valid_dataset = random_dense_design_matrix(rng, 1024, 784, 10)

    x = tensor.matrix('features')
    block_cost = Pylearn2Cost(cost.apply(x, mlp.apply(x)))
    block_model = Pylearn2Model(mlp)

    # Silence Pylearn2's logger
    logger = logging.getLogger(pylearn2.__name__)
    logger.setLevel(logging.ERROR)

    # Training algorithm
    sgd = SGD(learning_rate=0.01, cost=block_cost, batch_size=128,
              monitoring_dataset=valid_dataset)
    train = Pylearn2Train(train_dataset, block_model, algorithm=sgd)
    train.main_loop(time_budget=3)
class AttentionReader(Initializable):
    def __init__(self, x_dim, dec_dim, width, height, N, **kwargs):
        super(AttentionReader, self).__init__(name="reader", **kwargs)

        self.width = width
        self.height = height
        self.N = N
        self.x_dim = x_dim
        self.dec_dim = dec_dim
        self.output_dim = 2 * N * N

        self.zoomer = ZoomableAttentionWindow(height, width, N,
                                              normalize=True)
        self.readout = MLP(activations=[Identity()], dims=[dec_dim, 5],
                           **kwargs)

        self.children = [self.readout]

    @application(inputs=['x', 'x_hat', 'h_dec'], outputs=['r'])
    def apply(self, x, x_hat, h_dec):
        l = self.readout.apply(h_dec)

        center_y = (l[:, 0] + 1.) / 2.
        center_x = (l[:, 1] + 1.) / 2.
        log_delta = l[:, 2]
        log_sigma = l[:, 3] / 2.
        log_gamma = l[:, 4]

        w = self.zoomer.read(x, center_y, center_x,
                             T.exp(log_delta), T.exp(log_sigma))
        w_hat = self.zoomer.read(x_hat, center_y, center_x,
                                 T.exp(log_delta), T.exp(log_sigma))
        gamma = T.exp(log_gamma).dimshuffle(0, 'x')

        return gamma * T.concatenate([w, w_hat], axis=1)
def main(save_to, num_batches, continue_=False):
    mlp = MLP([Tanh(), Identity()], [1, 10, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0), seed=1)
    mlp.initialize()
    x = tensor.vector('numbers')
    y = tensor.vector('roots')
    cost = SquaredError().apply(y[:, None], mlp.apply(x[:, None]))
    cost.name = "cost"
    main_loop = MainLoop(
        GradientDescent(
            cost=cost, params=ComputationGraph(cost).parameters,
            step_rule=Scale(learning_rate=0.001)),
        get_data_stream(range(100)),
        model=Model(cost),
        extensions=([LoadFromDump(save_to)] if continue_ else []) +
        [Timing(),
         FinishAfter(after_n_batches=num_batches),
         DataStreamMonitoring(
             [cost], get_data_stream(range(100, 200)),
             prefix="test"),
         TrainingDataMonitoring([cost], after_epoch=True),
         Dump(save_to),
         Printing()])
    main_loop.run()
    return main_loop
def generation(z_list, n_latent, hu_decoder, n_out, y):
    logger.info('in generation: n_latent: %d, hu_decoder: %d',
                n_latent, hu_decoder)
    if hu_decoder == 0:
        return generation_simple(z_list, n_latent, n_out, y)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_latent, hu_decoder],
               name='latent_to_hidDecoder')
    initialize([mlp1])
    hid_to_out = Linear(name='hidDecoder_to_output',
                        input_dim=hu_decoder, output_dim=n_out)
    initialize([hid_to_out])
    mysigmoid = Logistic(name='y_hat_vae')

    agg_logpy_xz = 0.
    agg_y_hat = 0.
    for i, z in enumerate(z_list):
        y_hat = mysigmoid.apply(hid_to_out.apply(mlp1.apply(z)))  # reconstructed x
        agg_logpy_xz += cross_entropy_loss(y_hat, y)
        agg_y_hat += y_hat

    agg_logpy_xz /= len(z_list)
    agg_y_hat /= len(z_list)
    return agg_y_hat, agg_logpy_xz
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar,
              means, labels):

    features = tensor.concatenate([features_hascar,
                                   means['cp'][features_cp[:, 0]],
                                   means['dep'][features_cp[:, 1]]], axis=1)

    mlp = MLP(activations=[Rectifier(), Rectifier(), None],
              dims=[5, 50, 50, 1],
              weights_init=IsotropicGaussian(.1),
              biases_init=Constant(0),
              name='mlp')
    mlp.initialize()
    prediction = mlp.apply(features)
    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print input_var

    cg_dropout1 = apply_dropout(cg, [input_var[3], input_var[5]], .4)
    cost_dropout1 = cg_dropout1.outputs[0]

    return prediction, cost_dropout1, cg_dropout1.parameters, cost
class FFMLP(Initializable):
    def __init__(self, config, output_layer=None, **kwargs):
        super(FFMLP, self).__init__(**kwargs)
        self.config = config

        self.context_embedder = ContextEmbedder(config)

        output_activation = [] if output_layer is None else [output_layer()]
        output_dim = [] if output_layer is None else [config.dim_output]
        self.mlp = MLP(activations=[Rectifier() for _ in config.dim_hidden]
                       + output_activation,
                       dims=[config.dim_input] + config.dim_hidden + output_dim)

        self.extremities = {'%s_k_%s' % (side, ['latitude', 'longitude'][axis]): axis
                            for side in ['first', 'last'] for axis in [0, 1]}
        self.inputs = self.context_embedder.inputs + self.extremities.keys()
        self.children = [self.context_embedder, self.mlp]

    def _push_initialization_config(self):
        self.mlp.weights_init = self.config.mlp_weights_init
        self.mlp.biases_init = self.config.mlp_biases_init

    @application(outputs=['prediction'])
    def predict(self, **kwargs):
        embeddings = tuple(self.context_embedder.apply(
            **{k: kwargs[k] for k in self.context_embedder.inputs}))
        extremities = tuple(
            (kwargs[k] - data.train_gps_mean[v]) / data.train_gps_std[v]
            for k, v in self.extremities.items())

        inputs = tensor.concatenate(extremities + embeddings, axis=1)
        outputs = self.mlp.apply(inputs)

        return outputs

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs
class GRUInitialState(GatedRecurrent):
    """Gated Recurrent with special initial state.

    Initial state of Gated Recurrent is set by an MLP that conditions on the
    last hidden state of the bidirectional encoder, applies an affine
    transformation followed by a tanh non-linearity to set initial state.

    """
    def __init__(self, attended_dim, **kwargs):
        super(GRUInitialState, self).__init__(**kwargs)
        self.attended_dim = attended_dim
        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')
        self.children.append(self.initial_transformer)

    @application
    def initial_states(self, batch_size, *args, **kwargs):
        attended = kwargs['attended']
        initial_state = self.initial_transformer.apply(
            attended[0, :, -self.attended_dim:])
        return initial_state

    def _allocate(self):
        self.parameters.append(
            shared_floatx_nans((self.dim, self.dim), name='state_to_state'))
        self.parameters.append(
            shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        for i in range(2):
            if self.parameters[i]:
                add_role(self.parameters[i], WEIGHT)
class GRU2GO(GatedRecurrent):
    def __init__(self, attended_dim, **kwargs):
        super(GRU2GO, self).__init__(**kwargs)
        self.attended_dim = attended_dim
        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')
        self.children.append(self.initial_transformer)

    @application
    def initial_states(self, batch_size, *args, **kwargs):
        attended = kwargs['attended']
        initial_state = self.initial_transformer.apply(
            attended[0, :, -self.attended_dim:])
        return initial_state

    def _allocate(self):
        self.parameters.append(
            shared_floatx_nans((self.dim, self.dim), name='state_to_state'))
        self.parameters.append(
            shared_floatx_nans((self.dim, 2 * self.dim),
                               name='state_to_gates'))
        for i in range(2):
            if self.parameters[i]:
                add_role(self.parameters[i], WEIGHT)
def build_mlp(features_int, features_cat, labels, labels_mean):

    inputs = tensor.concatenate([features_int, features_cat], axis=1)

    mlp = MLP(activations=[Rectifier(), Rectifier(), Rectifier(), None],
              dims=[337, 800, 1200, 1],
              weights_init=IsotropicGaussian(),
              biases_init=Constant(1))
    mlp.initialize()

    prediction = mlp.apply(inputs)
    cost = MAPECost().apply(prediction, labels, labels_mean)

    cg = ComputationGraph(cost)
    # cg_dropout0 = apply_dropout(cg, [VariableFilter(roles=[INPUT])(cg.variables)[1]], .2)
    cg_dropout1 = apply_dropout(
        cg, [VariableFilter(roles=[OUTPUT])(cg.variables)[1],
             VariableFilter(roles=[OUTPUT])(cg.variables)[3],
             VariableFilter(roles=[OUTPUT])(cg.variables)[5]], .2)
    cost_dropout1 = cg_dropout1.outputs[0]

    return cost_dropout1, cg_dropout1.parameters, cost  # cost, cg.parameters, cost
class topicalq_transformer(Initializable): def __init__(self, vocab_size, topical_embedding_dim, state_dim,word_num,batch_size, **kwargs): super(topicalq_transformer, self).__init__(**kwargs) self.vocab_size = vocab_size; self.word_embedding_dim = topical_embedding_dim; self.state_dim = state_dim; self.word_num=word_num; self.batch_size=batch_size; self.look_up=LookupTable(name='topical_embeddings'); self.transformer=MLP(activations=[Tanh()], dims=[self.word_embedding_dim*self.word_num, self.state_dim], name='topical_transformer'); self.children = [self.look_up,self.transformer]; def _push_allocation_config(self): self.look_up.length = self.vocab_size self.look_up.dim = self.word_embedding_dim # do we have to push_config? remain unsure @application(inputs=['source_topical_word_sequence'], outputs=['topical_embedding']) def apply(self, source_topical_word_sequence): # Time as first dimension source_topical_word_sequence=source_topical_word_sequence.T; word_topical_embeddings = self.look_up.apply(source_topical_word_sequence); word_topical_embeddings=word_topical_embeddings.swapaxes(0,1); #requires testing concatenated_topical_embeddings=tensor.reshape(word_topical_embeddings,[word_topical_embeddings.shape[0],word_topical_embeddings.shape[1]*word_topical_embeddings.shape[2]]); topical_embedding=self.transformer.apply(concatenated_topical_embeddings); return topical_embedding
def create_model(self, x, y, input_dim, tol=10e-5):
    # Create the output of the MLP
    mlp = MLP([Rectifier(), Rectifier(), Logistic()],
              [input_dim, 100, 100, 1],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    probs = mlp.apply(x)
    y = y.dimshuffle(0, 'x')

    # Create the if-else cost function
    true_p = (T.sum(y * probs) + tol) * 1.0 / (T.sum(y) + tol)
    true_n = (T.sum((1 - y) * (1 - probs)) + tol) * 1.0 / (T.sum(1 - y) + tol)
    # p = (T.sum(y) + tol) / (y.shape[0] + tol)
    theta = (1 - self.p) / self.p
    numerator = (1 + self.beta**2) * true_p
    denominator = self.beta**2 + theta + true_p - theta * true_n

    Fscore = numerator / denominator

    cost = -1 * Fscore
    cost.name = "cost"

    return mlp, cost, probs
def setup_model():
    # shape: T x B x F
    input_ = T.tensor3('features')
    # shape: B
    target = T.lvector('targets')
    model = LSTMAttention(input_dim=10000,
                          dim=500,
                          mlp_hidden_dims=[2000, 500, 4],
                          batch_size=100,
                          image_shape=(100, 100),
                          patch_shape=(28, 28),
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
    model.initialize()
    h, c = model.apply(input_)
    classifier = MLP([Rectifier(), Softmax()], [500, 100, 10],
                     weights_init=IsotropicGaussian(0.01),
                     biases_init=Constant(0))
    classifier.initialize()
    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    error_rate = MisclassificationRate().apply(target, probabilities)

    return cost, error_rate
def setup_model(configs): tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5) # shape: T x B x C x X x Y input_ = tensor5("features") tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3) locs = tensor3("locs") # shape: B x Classes target = T.ivector("targets") model = LSTMAttention(configs, weights_init=Glorot(), biases_init=Constant(0)) model.initialize() (h, c, location, scale, alpha, patch, downn_sampled_input, conved_part_1, conved_part_2, pre_lstm) = model.apply( input_, locs ) model.location = location model.scale = scale model.alpha = location model.patch = patch classifier = MLP( [Rectifier(), Softmax()], configs["classifier_dims"], weights_init=Glorot(), biases_init=Constant(0) ) classifier.initialize() probabilities = classifier.apply(h[-1]) cost = CategoricalCrossEntropy().apply(target, probabilities) cost.name = "CE" error_rate = MisclassificationRate().apply(target, probabilities) error_rate.name = "ER" model.cost = cost model.error_rate = error_rate model.probabilities = probabilities if configs["load_pretrained"]: blocks_model = Model(model.cost) all_params = blocks_model.parameters with open("VGG_CNN_params.npz") as f: loaded = np.load(f) all_conv_params = loaded.keys() for param in all_params: if param.name in loaded.keys(): assert param.get_value().shape == loaded[param.name].shape param.set_value(loaded[param.name]) all_conv_params.pop(all_conv_params.index(param.name)) print "the following parameters did not match: " + str(all_conv_params) if configs["test_model"]: print "TESTING THE MODEL: CHECK THE INPUT SIZE!" cg = ComputationGraph(model.cost) f = theano.function(cg.inputs, [model.cost], on_unused_input="ignore", allow_input_downcast=True) data = configs["get_streams"](configs["batch_size"])[0].get_epoch_iterator().next() f(data[1], data[0], data[2]) print "Test passed! ;)" model.monitorings = [cost, error_rate] return model
def construct_model(input_dim, output_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[input_dim] + hidden_dims + [2])

    weights = mlp.apply(r)

    final = tensor.dot(x, weights)

    cost = Softmax().categorical_cross_entropy(y, final).mean()

    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in [mlp]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply noise
    cg = ComputationGraph([cost, error_rate])
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    apply_noise(cg, noise_vars, noise_std)
    [cost, error_rate] = cg.outputs

    return cost, error_rate
def build_mlp(features_car_cat, features_car_int, features_nocar_cat,
              features_nocar_int, features_cp, features_hascar,
              means, labels):

    prediction, _, _, _, = \
        build_mlp_onlyloc(features_car_cat, features_car_int,
                          features_nocar_cat, features_nocar_int,
                          features_cp, features_hascar, means, labels)

    mlp_crm = MLP(activations=[None],
                  dims=[1, 1],
                  weights_init=IsotropicGaussian(.1),
                  biases_init=Constant(0),
                  name='mlp_crm')
    mlp_crm.initialize()
    crm = features_nocar_int[:, 0][:, None]

    prediction = prediction * mlp_crm.apply(crm)

    cost = MAPECost().apply(labels, prediction)

    cg = ComputationGraph(cost)
    input_var = VariableFilter(roles=[INPUT])(cg.variables)
    print input_var

    cg_dropout = apply_dropout(cg, [input_var[7], input_var[5]], .4)
    cost_dropout = cg_dropout.outputs[0]

    return prediction, cost_dropout, cg_dropout.parameters, cost
def main(save_to, num_epochs): mlp = MLP([Tanh(), Softmax()], [784, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') probs = mlp.apply(tensor.flatten(x, outdim=2)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cg = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum() cost.name = 'final_cost' mnist_train = MNIST(("train", )) mnist_test = MNIST(("test", )) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], Flatten(DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features', )), prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] if BLOCKS_EXTRAS_AVAILABLE: extensions.append( Plot('MNIST example', channels=[[ 'test_final_cost', 'test_misclassificationrate_apply_error_rate' ], ['train_total_gradient_norm']])) main_loop = MainLoop(algorithm, Flatten(DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features', )), model=Model(cost), extensions=extensions) main_loop.run()
class GMMMLP(Initializable):
    """An mlp brick that branches out to output sigma and mu for GMM

    Parameters
    ----------
    mlp: MLP brick
        the main mlp to wrap around.
    dim:
        output dim
    """
    def __init__(self, mlp, dim, k, const=1e-5, **kwargs):
        super(GMMMLP, self).__init__(**kwargs)

        self.dim = dim
        self.const = const
        self.k = k
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         name=self.name + "_sigma")

        self.coeff = MLP(activations=[Identity()],
                         dims=[input_dim, k],
                         name=self.name + "_coeff")

        self.coeff2 = NDimensionalSoftmax()
        self.mlp = mlp
        self.children = [self.mlp, self.mu,
                         self.sigma, self.coeff, self.coeff2]
        # self.children.extend(self.mlp.children)

    @application
    def apply(self, inputs):
        state = self.mlp.apply(inputs)
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state)
        coeff = self.coeff2.apply(self.coeff.apply(state),
                                  extra_ndim=state.ndim - 2) + self.const
        return mu, sigma, coeff

    @property
    def output_dim(self):
        return self.dim
class SimpleSpeechRecognizer(Initializable): """ Initializable, does nothing more than combining class DeepBidirectional and an MLP as output Parameters ---------- transition: transition of bidirectional (e.g. GatedRecurrent or LSTM) dims_transition: list of dims for RNN in bidirectional num_features: number of features or input dimensionality num_classes """ def __init__(self, transition, dims_transition, num_features, num_classes, **kwargs): super(SimpleSpeechRecognizer, self).__init__(**kwargs) # TODO: think about putting this into conf ? # Owant to use rthogonal in LSTM only for the recurrent weights (W_states), # but 1. blocks concats all 4 recurrents matrices to one. Does Orthogonal Init # know this and do the correct init? 2. peepholes (vectors/diag-mats) are initialized # with the same weight_init ..... cant init vector with Orthogonal. # For now, don't use Orthogonal. # TODO: Maybe implement LSTM by myself # self.rec_weights_init = Orthogonal(scale=1.0) self.rec_weights_init = Uniform(mean=0, width=0.01) self.ff_weights_init = Uniform(mean=0, width=0.01) self.biases_init = Constant(0.0) self.transition = transition # ************ Deep BiRNN ************* self.dblstm = DeepBidirectional( transition=self.transition, dims_hidden=dims_transition, dim_input=num_features, rec_weights_init=self.rec_weights_init, ff_weights_init=self.ff_weights_init, biases_init=self.biases_init, ) # ************ Output *************** self.output = MLP( [None], [2 * dims_transition[-1]] + [num_classes], weights_init=self.ff_weights_init, biases_init=self.biases_init, name="top", ) # Remember child bricks self.children = [self.dblstm, self.output] @application(inputs=['sequence', 'mask'], outputs=['output']) def apply(self, sequence, mask): blstm_processed = self.dblstm.apply(input_=sequence, mask=mask) return self.output.apply(blstm_processed)
def task_ID_layers(x, recurrent_in_size):
    mlp = MLP([Rectifier()] * (len(task_ID_FF_dims) - 1), task_ID_FF_dims,
              name='task_ID_mlp', weights_init=Uniform(width=.2),
              biases_init=Constant(0))
    mlp.push_initialization_config()
    mlp.initialize()
    out_size = task_ID_FF_dims[-1] + recurrent_in_size - len(game_tasks)
    zero_padded_task_IDs = T.concatenate(
        [x[:, :, -len(game_tasks):],
         T.zeros((x.shape[0], x.shape[1],
                  task_ID_FF_dims[0] - len(game_tasks)))], axis=2)
    mlp_out = mlp.apply(zero_padded_task_IDs)
    task_ID_out = T.concatenate([x[:, :, :-len(game_tasks)]] + [mlp_out],
                                axis=2)
    return task_ID_out, out_size
def test_mlp():
    x = tensor.matrix()
    x_val = numpy.random.rand(2, 16).astype(theano.config.floatX)
    mlp = MLP(activations=[Tanh(), None], dims=[16, 8, 4],
              weights_init=Constant(1), biases_init=Constant(1))
    y = mlp.apply(x)
    mlp.initialize()
    assert_allclose(
        numpy.tanh(x_val.dot(numpy.ones((16, 8))) + numpy.ones((2, 8))).dot(
            numpy.ones((8, 4))) + numpy.ones((2, 4)),
        y.eval({x: x_val}), rtol=1e-06)

    mlp = MLP(activations=[None], weights_init=Constant(1), use_bias=False)
    mlp.dims = [16, 8]
    y = mlp.apply(x)
    mlp.initialize()
    assert_allclose(x_val.dot(numpy.ones((16, 8))),
                    y.eval({x: x_val}), rtol=1e-06)
def setup_model(configs): tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5) # shape: T x B x C x X x Y input_ = tensor5('features') # shape: B x Classes target = T.lmatrix('targets') model = LSTMAttention( configs, weights_init=Glorot(), biases_init=Constant(0)) model.initialize() (h, c, location, scale, patch, downn_sampled_input, conved_part_1, conved_part_2, pre_lstm) = model.apply(input_) classifier = MLP( [Rectifier(), Logistic()], configs['classifier_dims'], weights_init=Glorot(), biases_init=Constant(0)) classifier.initialize() probabilities = classifier.apply(h[-1]) cost = BinaryCrossEntropy().apply(target, probabilities) cost.name = 'CE' error_rate = MisclassificationRate().apply(target, probabilities) error_rate.name = 'ER' model.cost = cost if configs['load_pretrained']: blocks_model = Model(model.cost) all_params = blocks_model.parameters with open('VGG_CNN_params.npz') as f: loaded = np.load(f) all_conv_params = loaded.keys() for param in all_params: if param.name in loaded.keys(): assert param.get_value().shape == loaded[param.name].shape param.set_value(loaded[param.name]) all_conv_params.pop(all_conv_params.index(param.name)) print "the following parameters did not match: " + str(all_conv_params) if configs['test_model']: cg = ComputationGraph(model.cost) f = theano.function(cg.inputs, [model.cost], on_unused_input='ignore', allow_input_downcast=True) data = np.random.randn(10, 40, 3, 224, 224) targs = np.random.randn(40, 101) f(data, targs) print "Test passed! ;)" model.monitorings = [cost, error_rate] return model
def test_snapshot():
    x = tensor.matrix('x')
    linear = MLP([Identity(), Identity()], [10, 10, 10],
                 weights_init=Constant(1), biases_init=Constant(2))
    linear.initialize()
    y = linear.apply(x)
    cg = ComputationGraph(y)
    snapshot = cg.get_snapshot(dict(x=numpy.zeros((1, 10), dtype=floatX)))
    assert len(snapshot) == 14
def test_mlp_apply():
    x = tensor.matrix()
    x_val = numpy.random.rand(2, 16).astype(theano.config.floatX)
    mlp = MLP(activations=[Tanh().apply, None], dims=[16, 8, 4],
              weights_init=Constant(1), biases_init=Constant(1))
    y = mlp.apply(x)
    mlp.initialize()
    assert_allclose(
        numpy.tanh(x_val.dot(numpy.ones((16, 8))) + numpy.ones((2, 8))).dot(
            numpy.ones((8, 4))) + numpy.ones((2, 4)),
        y.eval({x: x_val}), rtol=1e-06)

    mlp = MLP(activations=[None], weights_init=Constant(1), use_bias=False)
    mlp.dims = [16, 8]
    y = mlp.apply(x)
    mlp.initialize()
    assert_allclose(x_val.dot(numpy.ones((16, 8))),
                    y.eval({x: x_val}), rtol=1e-06)
    assert mlp.rng == mlp.linear_transformations[0].rng
class LocatorReader(Initializable):
    def __init__(self, x_dim, dec_dim, channels, height, width, N, **kwargs):
        super(LocatorReader, self).__init__(name="reader", **kwargs)

        self.img_height = height
        self.img_width = width
        self.N = N
        self.x_dim = x_dim
        self.dec_dim = dec_dim
        self.output_dim = channels * N * N

        self.zoomer = ZoomableAttentionWindow(channels, height, width, N)
        self.readout = MLP(activations=[Identity()], dims=[dec_dim, 7],
                           **kwargs)

        self.children = [self.readout]

    def get_dim(self, name):
        if name == 'input':
            return self.dec_dim
        elif name == 'x_dim':
            return self.x_dim
        elif name == 'output':
            return self.output_dim
        else:
            raise ValueError

    @application(inputs=['x', 'h_dec'], outputs=['r', 'l'])
    def apply(self, x, h_dec):
        l = self.readout.apply(h_dec)
        center_y, center_x, deltaY, deltaX, sigmaY, sigmaX, gamma = \
            self.zoomer.nn2att(l)
        w = gamma * self.zoomer.read(x, center_y, center_x,
                                     deltaY, deltaX, sigmaY, sigmaX)
        return w, l

    @application(inputs=['h_dec'],
                 outputs=['center_y', 'center_x', 'deltaY', 'deltaX'])
    def apply_l(self, h_dec):
        l = self.readout.apply(h_dec)
        center_y, center_x, deltaY, deltaX = self.zoomer.nn2att_wn(l)
        return center_y, center_x, deltaY, deltaX
def create_model(self):
    x = self.x
    input_dim = self.input_dim
    mlp = MLP([Logistic(), Logistic(), Tanh()],
              [input_dim, 100, 100, 1],
              weights_init=IsotropicGaussian(0.001),
              biases_init=Constant(0))
    mlp.initialize()
    self.mlp = mlp
    probs = mlp.apply(x)
    return probs
class GaussianMLP(Initializable):
    """An mlp brick that branches out to output sigma and mu for Gaussian dist

    Parameters
    ----------
    mlp: MLP brick
        the main mlp to wrap around.
    dim:
        output dim
    """
    def __init__(self, mlp, dim, const=0., **kwargs):
        super(GaussianMLP, self).__init__(**kwargs)

        self.dim = dim
        self.const = const
        input_dim = mlp.output_dim
        self.mu = MLP(activations=[Identity()],
                      dims=[input_dim, dim],
                      weights_init=self.weights_init,
                      biases_init=self.biases_init,
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[input_dim, dim],
                         weights_init=self.weights_init,
                         biases_init=self.biases_init,
                         name=self.name + "_sigma")
        self.mlp = mlp
        self.children = [self.mlp, self.mu, self.sigma]
        self.children.extend(self.mlp.children)

    @application
    def apply(self, inputs):
        state = self.mlp.apply(inputs)
        mu = self.mu.apply(state)
        sigma = self.sigma.apply(state) + self.const
        return mu, sigma

    @property
    def output_dim(self):
        return self.dim
def main(save_to, num_epochs, bokeh=False): mlp = MLP([Tanh(), Softmax()], [784, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') probs = mlp.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cg = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum() cost.name = 'final_cost' mnist_train = MNIST("train") mnist_test = MNIST("test") algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate], DataStream(mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), prefix="test"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), Checkpoint(save_to), Printing()] if bokeh: extensions.append(Plot( 'MNIST example', channels=[ ['test_final_cost', 'test_misclassificationrate_apply_error_rate'], ['train_total_gradient_norm']])) main_loop = MainLoop( algorithm, DataStream(mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), model=Model(cost), extensions=extensions) main_loop.run()
def test_fully_layer(): batch_size=2 x = T.tensor4(); y = T.ivector() V = 200 layer_conv = Convolutional(filter_size=(5,5),num_filters=V, name="toto", weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0)) # try with no bias activation = Rectifier() pool = MaxPooling(pooling_size=(2,2)) convnet = ConvolutionalSequence([layer_conv, activation, pool], num_channels=15, image_size=(10,10), name="conv_section") convnet.push_allocation_config() convnet.initialize() output=convnet.apply(x) batch_size=output.shape[0] output_dim=np.prod(convnet.get_dim('output')) result_conv = output.reshape((batch_size, output_dim)) mlp=MLP(activations=[Rectifier().apply], dims=[output_dim, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0)) mlp.initialize() output=mlp.apply(result_conv) cost = T.mean(Softmax().categorical_cross_entropy(y.flatten(), output)) cg = ComputationGraph(cost) W = VariableFilter(roles=[WEIGHT])(cg.variables) B = VariableFilter(roles=[BIAS])(cg.variables) W = W[0]; b = B[0] inputs_fully = VariableFilter(roles=[INPUT], bricks=[Linear])(cg) outputs_fully = VariableFilter(roles=[OUTPUT], bricks=[Linear])(cg) var_input=inputs_fully[0] var_output=outputs_fully[0] [d_W,d_S,d_b] = T.grad(cost, [W, var_output, b]) d_b = d_b.dimshuffle(('x',0)) d_p = T.concatenate([d_W, d_b], axis=0) x_value = 1e3*np.random.ranf((2,15, 10, 10)) f = theano.function([x,y], [var_input, d_S, d_p], allow_input_downcast=True, on_unused_input='ignore') A, B, C= f(x_value, [5, 0]) A = np.concatenate([A, np.ones((2,1))], axis=1) print 'A', A.shape print 'B', B.shape print 'C', C.shape print lin.norm(C - np.dot(np.transpose(A), B), 'fro') return """
def build_model(self, hidden_dim):
    board_input = T.vector('input')
    mlp = MLP(activations=[LeakyRectifier(0.1), LeakyRectifier(0.1)],
              dims=[9, hidden_dim, 9],
              weights_init=IsotropicGaussian(0.00001),
              biases_init=Constant(0.01))
    output = mlp.apply(board_input)
    masked_output = Softmax().apply(output * T.eq(board_input, 0) * 1000)
    mlp.initialize()
    cost, chosen = self.get_cost(masked_output)
    return board_input, mlp, cost, chosen, output
def apply(self, input_, target):
    mlp = MLP(self.non_lins, self.dims,
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0),
              name=self.name)
    mlp.initialize()
    probs = mlp.apply(T.flatten(input_, outdim=2))
    probs.name = 'probs'
    cost = CategoricalCrossEntropy().apply(target.flatten(), probs)
    cost.name = "CE"
    self.outputs = {}
    self.outputs['probs'] = probs
    self.outputs['cost'] = cost
class LSTM2GO(LSTM): def __init__(self, attended_dim, **kwargs): super(LSTM2GO, self).__init__(**kwargs) self.attended_dim = attended_dim self.initial_transformer_s = MLP(activations=[Tanh()], dims=[attended_dim, self.dim], name='state_initializer') self.children.append(self.initial_transformer_s) self.initial_transformer_c = MLP(activations=[Tanh()], dims=[attended_dim, self.dim], name='cell_initializer') self.children.append(self.initial_transformer_c) @application def initial_states(self, batch_size, *args, **kwargs): attended = kwargs['attended'] initial_state = self.initial_transformer_s.apply( attended[0, :, -self.attended_dim:]) initial_cell = self.initial_transformer_c.apply( attended[0, :, -self.attended_dim:]) return [initial_state, initial_cell] def _allocate(self): self.W_state = shared_floatx_nans((self.dim, 4*self.dim), name='W_state') self.W_cell_to_in = shared_floatx_nans((self.dim,), name='W_cell_to_in') self.W_cell_to_forget = shared_floatx_nans((self.dim,), name='W_cell_to_forget') self.W_cell_to_out = shared_floatx_nans((self.dim,), name='W_cell_to_out') add_role(self.W_state, WEIGHT) add_role(self.W_cell_to_in, WEIGHT) add_role(self.W_cell_to_forget, WEIGHT) add_role(self.W_cell_to_out, WEIGHT) self.parameters = [ self.W_state, self.W_cell_to_in, self.W_cell_to_forget, self.W_cell_to_out]
def prior_network(x, n_input, hu_encoder, n_latent):
    logger.info('In prior_network: n_input: %d, hu_encoder: %d',
                n_input, hu_encoder)
    mlp1 = MLP(activations=[Rectifier()], dims=[n_input, hu_encoder],
               name='prior_in_to_hidEncoder')
    initialize([mlp1])
    h_encoder = mlp1.apply(x)
    h_encoder = debug_print(h_encoder, 'h_encoder', False)
    lin1 = Linear(name='prior_hiddEncoder_to_latent_mu',
                  input_dim=hu_encoder, output_dim=n_latent)
    lin2 = Linear(name='prior_hiddEncoder_to_latent_sigma',
                  input_dim=hu_encoder, output_dim=n_latent)
    initialize([lin1])
    initialize([lin2], rndstd=0.001)
    mu = lin1.apply(h_encoder)
    log_sigma = lin2.apply(h_encoder)
    return mu, log_sigma
def construct_model(input_dim, output_dim): # Construct the model r = tensor.fmatrix('r') x = tensor.fmatrix('x') y = tensor.ivector('y') nx = x.shape[0] nj = x.shape[1] # also is r.shape[0] nr = r.shape[1] # r is nj x nr # x is nx x nj # y is nx x 1 # r_rep is nx x nj x nr r_rep = r[None, :, :].repeat(axis=0, repeats=nx) # x3 is nx x nj x 1 x3 = x[:, :, None] # concat is nx x nj x (nr + 1) concat = tensor.concatenate([r_rep, x3], axis=2) mlp_input = concat.reshape((nx * nj, nr + 1)) # input_dim must be nr mlp = MLP(activations=activation_functions, dims=[input_dim+1] + hidden_dims + [output_dim]) activations = mlp.apply(mlp_input) act_sh = activations.reshape((nx, nj, output_dim)) final = act_sh.mean(axis=1) cost = Softmax().categorical_cross_entropy(y, final).mean() pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in [mlp]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply noise cg = ComputationGraph([cost, error_rate]) noise_vars = VariableFilter(roles=[WEIGHT])(cg) apply_noise(cg, noise_vars, noise_std) [cost_reg, error_rate_reg] = cg.outputs return cost_reg, error_rate_reg, cost, error_rate
def create_model():
    """Create the deep autoencoder model with Blocks, and load MNIST."""
    mlp = MLP(activations=[Logistic(), Logistic(), Logistic(), None,
                           Logistic(), Logistic(), Logistic(), Logistic()],
              dims=[784, 1000, 500, 250, 30, 250, 500, 1000, 784],
              weights_init=Sparse(15, IsotropicGaussian()),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    x_hat = mlp.apply(tensor.flatten(x, outdim=2))
    squared_err = SquaredError().apply(tensor.flatten(x, outdim=2), x_hat)
    cost = BinaryCrossEntropy().apply(tensor.flatten(x, outdim=2), x_hat)

    return x, cost, squared_err
def setup_model(): # shape: T x B x F input_ = T.tensor3('features') # shape: B target = T.lvector('targets') model = LSTMAttention(dim=256, mlp_hidden_dims=[256, 4], batch_size=100, image_shape=(64, 64), patch_shape=(16, 16), weights_init=Glorot(), biases_init=Constant(0)) model.initialize() h, c, location, scale = model.apply(input_) classifier = MLP([Rectifier(), Softmax()], [256 * 2, 200, 10], weights_init=Glorot(), biases_init=Constant(0)) model.h = h model.c = c model.location = location model.scale = scale classifier.initialize() probabilities = classifier.apply(T.concatenate([h[-1], c[-1]], axis=1)) cost = CategoricalCrossEntropy().apply(target, probabilities) error_rate = MisclassificationRate().apply(target, probabilities) model.cost = cost location_x_0_avg = T.mean(location[0, :, 0]) location_x_0_avg.name = 'location_x_0_avg' location_x_10_avg = T.mean(location[10, :, 0]) location_x_10_avg.name = 'location_x_10_avg' location_x_20_avg = T.mean(location[-1, :, 0]) location_x_20_avg.name = 'location_x_20_avg' scale_x_0_avg = T.mean(scale[0, :, 0]) scale_x_0_avg.name = 'scale_x_0_avg' scale_x_10_avg = T.mean(scale[10, :, 0]) scale_x_10_avg.name = 'scale_x_10_avg' scale_x_20_avg = T.mean(scale[-1, :, 0]) scale_x_20_avg.name = 'scale_x_20_avg' monitorings = [error_rate, location_x_0_avg, location_x_10_avg, location_x_20_avg, scale_x_0_avg, scale_x_10_avg, scale_x_20_avg] model.monitorings = monitorings return model
def setup_model(): # shape: T x B x F input_ = T.tensor3('features') # shape: B target = T.lvector('targets') model = LSTMAttention(dim=500, mlp_hidden_dims=[400, 4], batch_size=100, image_shape=(100, 100), patch_shape=(28, 28), weights_init=Glorot(), biases_init=Constant(0)) model.initialize() h, c, location, scale = model.apply(input_) classifier = MLP([Rectifier(), Softmax()], [500, 100, 10], weights_init=Glorot(), biases_init=Constant(0)) model.h = h classifier.initialize() probabilities = classifier.apply(h[-1]) cost = CategoricalCrossEntropy().apply(target, probabilities) error_rate = MisclassificationRate().apply(target, probabilities) location_x_avg = T.mean(location[:, 0]) location_x_avg.name = 'location_x_avg' location_y_avg = T.mean(location[:, 1]) location_y_avg.name = 'location_y_avg' scale_x_avg = T.mean(scale[:, 0]) scale_x_avg.name = 'scale_x_avg' scale_y_avg = T.mean(scale[:, 1]) scale_y_avg.name = 'scale_y_avg' location_x_std = T.std(location[:, 0]) location_x_std.name = 'location_x_std' location_y_std = T.std(location[:, 1]) location_y_std.name = 'location_y_std' scale_x_std = T.std(scale[:, 0]) scale_x_std.name = 'scale_x_std' scale_y_std = T.std(scale[:, 1]) scale_y_std.name = 'scale_y_std' monitorings = [error_rate, location_x_avg, location_y_avg, scale_x_avg, scale_y_avg, location_x_std, location_y_std, scale_x_std, scale_y_std] return cost, monitorings
class GRUInitialState(GatedRecurrent):
    def __init__(self, attended_dim, **kwargs):
        super(GRUInitialState, self).__init__(**kwargs)
        self.attended_dim = attended_dim
        self.initial_transformer = MLP(activations=[Tanh()],
                                       dims=[attended_dim, self.dim],
                                       name='state_initializer')
        self.children.append(self.initial_transformer)

    @application
    def initial_state(self, state_name, batch_size, *args, **kwargs):
        attended = kwargs['attended']
        if state_name == 'states':
            initial_state = self.initial_transformer.apply(
                attended[0, :, -self.attended_dim:])
            return initial_state
        return super(GRUInitialState, self).initial_state(
            state_name, batch_size, *args, **kwargs)