def maxout_vae_mnist_test(path_vae_mnist):
    # Load the VAE model trained on MNIST
    vae_mnist = load(path_vae_mnist)
    maxout = Maxout()

    x = T.matrix('features')
    y = T.imatrix('targets')
    batch_size = 128

    # Encode, sample a latent code and classify it with the maxout network
    z, _ = vae_mnist.sampler.sample(vae_mnist.encoder_mlp.apply(x))
    predict = maxout.apply(z)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    y_hat = Softmax().apply(predict)
    cost.name = 'cost'

    cg = ComputationGraph(cost)
    # Tag the maxout parameters so they can be told apart from the VAE's
    for i, t in enumerate(cg.parameters):
        t.name = t.name + str(i) + "maxout"

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)

    # Training
    step_rule = RMSProp(0.01, 0.9)
    # step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST('test')

    data_stream_train = Flatten(DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))
    data_stream_test = Flatten(DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    # TrainingDataMonitoring reads values from the training algorithm itself,
    # so it takes no data stream (the original erroneously passed one)
    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate],
        data_stream=data_stream_test,
        prefix="test")

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=50),
                  Printing(every_n_epochs=1)]

    main_loop = MainLoop(data_stream=data_stream_train,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()

    # Save the trained classifier
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(maxout, f)
def _get_full_cost(self, input_vec, *args, **kwargs):
    preds = self._get_pred_dist(input_vec)
    cost = Softmax().categorical_cross_entropy(self.hashtag, preds).mean()
    max_index = preds.argmax(axis=1)
    cost.name = 'full_cost'

    # Rank the predicted classes by decreasing score
    ranks = tensor.argsort(preds, axis=1)[:, ::-1]
    top1_accuracy = tensor.eq(self.hashtag, ranks[:, 0]).mean()
    top10_accuracy = tensor.sum(tensor.eq(ranks[:, 0:self.rank],
                                          self.hashtag[:, None]),
                                axis=1).mean()
    top1_accuracy.name = "top1_accuracy"
    top10_accuracy.name = "top10_accuracy"

    cost_drop, top1_accuracy_drop, top10_accuracy_drop = self._apply_dropout(
        [cost, top1_accuracy, top10_accuracy])
    cost_drop.name = cost.name
    top1_accuracy_drop.name = top1_accuracy.name
    top10_accuracy_drop.name = top10_accuracy.name

    self.full_monitor_train_vars = [[cost_drop],
                                    [top1_accuracy_drop],
                                    [top10_accuracy_drop]]
    self.full_cost = cost_drop
def _get_train_cost(self, input_vec, *args, **kwargs):
    preds = self._get_pred_dist(input_vec)
    cost = Softmax().categorical_cross_entropy(self.hashtag, preds).mean()

    # Apply regularization
    cost = self._apply_reg(cost)
    cost.name = 'cost'

    ranks = tensor.argsort(preds, axis=1)[:, ::-1]
    top1_accuracy = tensor.eq(self.hashtag, ranks[:, 0]).mean()
    top10_accuracy = tensor.sum(tensor.eq(ranks[:, 0:self.rank],
                                          self.hashtag[:, None]),
                                axis=1).mean()
    top1_accuracy.name = "top1_accuracy"
    top10_accuracy.name = "top10_accuracy"

    # Apply dropout
    cost_drop, top1_accuracy_drop, top10_accuracy_drop = self._apply_dropout(
        [cost, top1_accuracy, top10_accuracy])
    cost_drop.name = cost.name
    top1_accuracy_drop.name = top1_accuracy.name
    top10_accuracy_drop.name = top10_accuracy.name

    self.monitor_train_vars = [[cost_drop],
                               [top1_accuracy_drop],
                               [top10_accuracy_drop]]
    self.cg_generator = cost_drop
data2 = data2[:, :, d1:d1+data1.shape[2], :]
# max pool
data2 = maxpool2.apply(data2)
# activation
data2 = Tanh(name='act_data2').apply(data2)

# fully connected layers
fc = MLP(dims=[25*50, 100, 100, num_output_classes],
         activations=[Rectifier(name='r1'), Rectifier(name='r2'),
                      Identity()])
output = fc.apply(data2.reshape((data2.shape[0], 25*50)))

# COST AND ERROR MEASURE
cost = Softmax().categorical_cross_entropy(label, output).mean()
cost.name = 'cost'
error_rate = tensor.neq(tensor.argmax(output, axis=1), label).mean()
error_rate.name = 'error_rate'

# REGULARIZATION
cg = ComputationGraph([cost, error_rate])
if weight_noise > 0:
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, weight_noise)
if dropout > 0:
    cg = apply_dropout(cg, [eeg1, eeg2, data1, data2] +
                       VariableFilter(name='output',
                                      bricks=fc.linear_transformations[:-1])(cg),
                       dropout)
# for vfilter, p in dropout_locs:
#     cg = apply_dropout(cg, vfilter(cg), p)
[cost_reg, error_rate_reg] = cg.outputs
def build_model_soft(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for i in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        assert False

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(SoftGatedRecurrent(dim=state_dim,
                                              mlp=mlp,
                                              activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]
    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
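# --- Usage sketch (editorial addition, not from the original source) ---
# A minimal example of how the `updates` returned by build_model_soft would
# be attached to a Blocks training algorithm, so the initial-state shared
# variables are overwritten with the last hidden states after every batch
# (truncated backprop through time). `vocab_size` and `args` are assumed to
# come from the surrounding script; depending on the Blocks version the
# GradientDescent keyword is `params` or `parameters`.
def train_soft_model_sketch(vocab_size, args):
    from blocks.algorithms import GradientDescent, RMSProp
    from blocks.graph import ComputationGraph

    cost, cross_entropy, updates, gate_values = build_model_soft(
        vocab_size, args)
    cg = ComputationGraph(cost)
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=RMSProp(learning_rate=0.001,
                                                  decay_rate=0.9))
    # Feed the (init_state <- last_state) pairs into the update rule
    algorithm.add_updates(updates)
    return algorithm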
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)

    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
    score_diff = scores - target_scores

    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff ** 2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the ground truth should be greater than or equal to its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less than or equal to its actual score
        cost += tensor.maximum(0,
                               score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[indices, y.flatten()]) ** 2).mean()
        cost += ((score_diff[indices, scores.argmax(axis=1)]) ** 2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (- scores[indices,
                         (scores + epsilon * target_scores).argmax(axis=1)]
                + scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)]
                - scores[indices, y.flatten()]).mean()
    else:
        # (the original raised with the symbolic `cost`, not the name)
        raise ValueError("Unknown cost " + cost_name)

    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'

    cg = ComputationGraph([cost])
    cost.name = 'cost'

    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))

    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=rule)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring(
            [cost, error_rate],
            Flatten(
                DataStream.default_stream(
                    mnist_test,
                    iteration_scheme=SequentialScheme(
                        mnist_test.num_examples, 500)),
                which_sources=('features',)),
            prefix="test"),
        # CallbackExtension(
        #     lambda: rule.learning_rate.set_value(
        #         rule.learning_rate.get_value() * 0.9),
        #     after_epoch=True),
        TrainingDataMonitoring(
            [cost, error_rate,
             aggregation.mean(algorithm.total_gradient_norm),
             rule.learning_rate],
            prefix="train",
            after_epoch=True),
        Checkpoint(save_to),
        Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[['test_cost', 'test_error_rate'],
                      ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)
    main_loop.run()

    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {'cost': cost_name,
           'learning_rate': learning_rate,
           'momentum': momentum,
           'train_cost': df.train_cost.iloc[-1],
           'test_cost': df.test_cost.iloc[-1],
           'best_test_cost': df.test_cost.min(),
           'train_error': df.train_error_rate.iloc[-1],
           'test_error': df.test_error_rate.iloc[-1],
           'best_test_error': df.test_error_rate.min()}
    res = {k: float(v) if isinstance(v, numpy.ndarray) else v
           for k, v in res.items()}
    json.dump(res, sys.stdout)
    sys.stdout.flush()
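# --- Driver sketch (editorial addition, not from the original source) ---
# Sweeps the comparison above over every implemented cost function with the
# same hyper-parameters; each run appends one JSON record to stdout. The
# checkpoint file names are hypothetical.
if __name__ == '__main__':
    for name in ['lr', 'mse', 'perceptron', 'minmin', 'minmin_cut',
                 'minmin2', 'direct', 'svm']:
        main(save_to='mnist_%s.tar' % name, cost_name=name,
             learning_rate=0.0001, momentum=0.9, num_epochs=10)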
def maxout_mnist_test():
    # If this works, turn it into a class
    x = T.tensor4('features')
    y = T.imatrix('targets')
    batch_size = 128

    # Maxout convolutional layers
    # layer 0
    filter_size = (8, 8)
    activation = Maxout_(num_pieces=2).apply
    pooling_size = 4
    pooling_step = 2
    pad = 0
    image_size = (28, 28)
    num_channels = 1
    num_filters = 48
    layer0 = ConvolutionalLayer(activation, filter_size, num_filters,
                                pooling_size=(pooling_size, pooling_size),
                                pooling_step=(pooling_step, pooling_step),
                                pad=pad,
                                image_size=image_size,
                                num_channels=num_channels,
                                weights_init=Uniform(width=0.01),
                                biases_init=Uniform(width=0.01),
                                name="layer_0")
    layer0.initialize()

    # layer 1
    num_filters = 48
    filter_size = (8, 8)
    pooling_size = 4
    pooling_step = 2
    pad = 3
    image_size = (layer0.get_dim('output')[1], layer0.get_dim('output')[2])
    num_channels = layer0.get_dim('output')[0]
    layer1 = ConvolutionalLayer(activation, filter_size, num_filters,
                                pooling_size=(pooling_size, pooling_size),
                                pooling_step=(pooling_step, pooling_step),
                                pad=pad,
                                image_size=image_size,
                                num_channels=num_channels,
                                weights_init=Uniform(width=0.01),
                                biases_init=Uniform(width=0.01),
                                name="layer_1")
    layer1.initialize()

    # layer 2
    num_filters = 24
    filter_size = (5, 5)
    pooling_size = 2
    pooling_step = 2
    pad = 3
    activation = Maxout_(num_pieces=4).apply
    image_size = (layer1.get_dim('output')[1], layer1.get_dim('output')[2])
    num_channels = layer1.get_dim('output')[0]
    layer2 = ConvolutionalLayer(activation, filter_size, num_filters,
                                pooling_size=(pooling_size, pooling_size),
                                pooling_step=(pooling_step, pooling_step),
                                pad=pad,
                                image_size=image_size,
                                num_channels=num_channels,
                                weights_init=Uniform(width=0.01),
                                biases_init=Uniform(width=0.01),
                                name="layer_2")
    layer2.initialize()

    conv_layers = [layer0, layer1, layer2]
    output_conv = x
    for layer in conv_layers:
        output_conv = layer.apply(output_conv)
    output_conv = Flattener().apply(output_conv)

    mlp_layer = Linear(54, 10,
                       weights_init=Uniform(width=0.01),
                       biases_init=Uniform(width=0.01),
                       name="layer_5")
    mlp_layer.initialize()
    output_mlp = mlp_layer.apply(output_conv)

    params, names = build_params(conv_layers, [mlp_layer])

    cost = Softmax().categorical_cross_entropy(y.flatten(), output_mlp)
    cost.name = 'cost'
    cg_ = ComputationGraph(cost)
    # L2 weight decay
    weights = VariableFilter(roles=[WEIGHT])(cg_.variables)
    cost = cost + 0.001 * sum([(p ** 2).sum() for p in weights])
    cg = ComputationGraph(cost)

    error_rate = errors(output_mlp, y)
    error_rate.name = 'error'

    # Training
    step_rule = RMSProp(0.01, 0.9)
    # step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST('test')

    data_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))
    data_stream_monitoring = DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))
    data_stream_test = DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(test_set.num_examples, batch_size))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = DataStreamMonitoring(
        variables=[cost, error_rate],
        data_stream=data_stream_monitoring,
        prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate],
        data_stream=data_stream_test,
        prefix="test")

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=50),
                  Printing(every_n_epochs=1)]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()

    # Save the trained model (the original dumped an undefined `vae` here)
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(main_loop.model, f)
def build_model_hard(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for i in range(layers - 1):
        mlp = MLP(activations=[Logistic()],
                  dims=[2 * state_dim, 1],
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(HardGatedRecurrent(dim=state_dim,
                                              mlp=mlp,
                                              activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have correctly:
    # h = [state_1, state_2, state_3 ...]

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
features = Flattener().apply(convnet.apply(x))

mlp = MLP(activations=[Rectifier(), None],
          dims=[output_dim, 100, 10],
          weights_init=IsotropicGaussian(0.01),
          biases_init=Constant(0))
mlp.initialize()
y_hat = mlp.apply(features)

# numerically stable softmax
cost = Softmax().categorical_cross_entropy(y.flatten(), y_hat)
cost.name = 'nll'
error_rate = MisclassificationRate().apply(y.flatten(), y_hat)
# cost = MisclassificationRate().apply(y, y_hat)
# cost.name = 'error_rate'

cg = ComputationGraph(cost)
# pdb.set_trace()
weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)
l2_regularization = 0.005 * sum((W ** 2).sum() for W in weights)
cost_l2 = cost + l2_regularization
# name the regularized cost, not `cost` (the original mislabelled `cost`)
cost_l2.name = 'cost_with_regularization'

# Print sizes to check
print("Representation sizes:")
def test_communication(path_vae_mnist, path_maxout_mnist):
    # Load models
    vae_mnist = load(path_vae_mnist)
    # get params: to be removed from the computation graph

    # Write an object maxout
    classifier = Maxout()
    # get params: to be removed from the computation graph

    # VAE whose prior is a zero-mean, unit-variance normal distribution
    activation = Rectifier()
    full_weights_init = Orthogonal()
    weights_init = full_weights_init

    # SVHN in grayscale
    layers = [32*32, 200, 200, 200, 50]
    encoder_layers = layers[:-1]
    encoder_mlp = MLP([activation] * (len(encoder_layers)-1),
                      encoder_layers,
                      name="MLP_SVHN_encode",
                      biases_init=Constant(0.),
                      weights_init=weights_init)

    enc_dim = encoder_layers[-1]
    z_dim = layers[-1]
    sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim,
                       biases_init=Constant(0.),
                       weights_init=full_weights_init)

    decoder_layers = layers[:]  # includes z_dim as first layer
    decoder_layers.reverse()
    decoder_mlp = MLP([activation] * (len(decoder_layers)-2) + [Rectifier()],
                      decoder_layers,
                      name="MLP_SVHN_decode",
                      biases_init=Constant(0.),
                      weights_init=weights_init)

    vae_svhn = VAEModel(encoder_mlp, sampler, decoder_mlp)
    vae_svhn.initialize()

    # Do the connection
    x = T.tensor4('x')  # SVHN samples preprocessed with local contrast normalization
    x_ = (T.sum(x, axis=1)).flatten(ndim=2)
    y = T.imatrix('y')
    batch_size = 512

    svhn_z, _ = vae_svhn.sampler.sample(vae_svhn.encoder_mlp.apply(x_))
    mnist_decode = vae_mnist.decoder_mlp.apply(svhn_z)
    # Reshape to image format
    shape = mnist_decode.shape
    mnist_decode = mnist_decode.reshape((shape[0], 1, 28, 28))

    prediction = classifier.apply(mnist_decode)
    y_hat = Softmax().apply(prediction)

    x_recons, kl_terms = vae_svhn.reconstruct(x_)
    recons_term = BinaryCrossEntropy().apply(x_,
                                             T.clip(x_recons, 1e-4, 1 - 1e-4))
    recons_term.name = "recons_term"

    cost_A = recons_term + kl_terms.mean()
    cost_A.name = "cost_A"

    cost_B = Softmax().categorical_cross_entropy(y.flatten(), prediction)
    cost_B.name = 'cost_B'

    cost = cost_B
    cost.name = "cost"
    cg = ComputationGraph(cost)

    # Discard the MNIST parameters: only the SVHN side is trained
    parameters = cg.parameters
    params = []
    for t in parameters:
        if not re.match(".*mnist", t.name):
            params.append(t)

    """
    f = theano.function([x], cost_A)
    value_x = np.random.ranf((1, 3, 32, 32)).astype("float32")
    print f(value_x)
    return
    """

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"

    # Training
    step_rule = RMSProp(0.001, 0.99)

    dataset_hdf5_file = "/Tmp/ducoffem/SVHN/"
    train_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"),
                            which_set='train')
    test_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"),
                           which_set='valid')

    data_stream = DataStream.default_stream(
        train_set,
        iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))
    data_stream_test = DataStream.default_stream(
        test_set,
        iteration_scheme=SequentialScheme(2000, batch_size))

    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_batches=10)
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate],
        data_stream=data_stream_test,
        prefix="valid",
        every_n_batches=10)

    # drawing_samples = ImagesSamplesSave("../data_svhn", vae, (3, 32, 32), every_n_epochs=1)
    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_batches=10000),
                  Printing(every_n_batches=10)]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def __init__(self):
    inp = tensor.lmatrix('bytes')

    # Make state vars
    state_vars = {}
    for i, d in enumerate(hidden_dims):
        state_vars['states%d'%i] = theano.shared(
            numpy.zeros((num_seqs, d)).astype(theano.config.floatX),
            name='states%d'%i)
        state_vars['cells%d'%i] = theano.shared(
            numpy.zeros((num_seqs, d)).astype(theano.config.floatX),
            name='cells%d'%i)

    # Construct brick
    cchlstm = CCHLSTM(io_dim=io_dim,
                      hidden_dims=hidden_dims,
                      cond_cert=cond_cert,
                      activation=activation_function)

    # Random pass
    passdict = {}
    for i, p in enumerate(block_prob):
        passdict['pass%d'%i] = rng.binomial(size=(inp.shape[1], inp.shape[0]),
                                            p=1-p)

    # Apply it
    outs = cchlstm.apply(inputs=inp.dimshuffle(1, 0),
                         **dict(state_vars.items() + passdict.items()))
    states = []
    active_prop = []
    for i in range(len(hidden_dims)):
        states.append((state_vars['states%d'%i], outs[3*i+1][-1, :, :]))
        states.append((state_vars['cells%d'%i], outs[3*i+2][-1, :, :]))
        active_prop.append(outs[3*i+3].mean())
        active_prop[-1].name = 'active_prop_%d'%i

    out = outs[0].dimshuffle(1, 0, 2)

    # Do prediction and calculate cost
    pred = out.argmax(axis=2)
    cost = Softmax().categorical_cross_entropy(
        inp[:, 1:].flatten(),
        out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1), io_dim)))
    error_rate = tensor.neq(inp[:, 1:].flatten(),
                            pred[:, :-1].flatten()).mean()

    # Initialize all bricks
    for brick in [cchlstm]:
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # Apply noise and dropout
    cg = ComputationGraph([cost, error_rate])
    if w_noise_std > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, w_noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost, cost_reg],
                         [error_rate, error_rate_reg],
                         active_prop]

    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    error_rate.name = 'error_rate'
    error_rate_reg.name = 'error_rate_reg'

    self.out = out
    self.pred = pred
    self.states = states
def main():
    # # # # # # # # # #
    # Model Building  #
    # # # # # # # # # #

    # ConvOp requires input be a 4D tensor
    x = tensor.tensor4("features")
    y = tensor.ivector("targets")

    # Convolutional Layers
    # ====================
    # "Improving neural networks by preventing co-adaptation of feature detectors"
    # conv_layers = [
    #     # ConvolutionalLayer(activation, filter_size, num_filters, pooling_size, name)
    #     ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l1'),
    #     ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l2'),
    #     ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l3')
    # ]

    # "VGGNet"
    conv_layers = [
        ConvolutionalActivation(Rectifier().apply, (3,3), 64, border_mode='full', name='l1'),
        ConvolutionalLayer(Rectifier().apply, (3,3), 64, (2,2), border_mode='full', name='l2'),
        ConvolutionalActivation(Rectifier().apply, (3,3), 128, border_mode='full', name='l3'),
        ConvolutionalLayer(Rectifier().apply, (3,3), 128, (2,2), border_mode='full', name='l4'),
        ConvolutionalActivation(Rectifier().apply, (3,3), 256, border_mode='full', name='l5'),
        ConvolutionalLayer(Rectifier().apply, (3,3), 256, (2,2), border_mode='full', name='l6')
    ]

    # Bake my own
    # conv_layers = [
    #     ConvolutionalLayer(Rectifier().apply, (5,5), 64, (2,2), border_mode='full', name='l1'),
    #     ConvolutionalLayer(Rectifier().apply, (3,3), 128, (2,2), border_mode='full', name='l2'),
    #     ConvolutionalActivation(Rectifier().apply, (3,3), 256, border_mode='full', name='l3'),
    #     ConvolutionalLayer(Rectifier().apply, (3,3), 256, (2,2), border_mode='full', name='l4')
    # ]

    convnet = ConvolutionalSequence(
        conv_layers, num_channels=3, image_size=(32,32),
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0))
    convnet.initialize()

    output_dim = np.prod(convnet.get_dim('output'))

    # Fully Connected Layers
    # ======================
    conv_features = convnet.apply(x)
    features = Flattener().apply(conv_features)

    mlp = MLP(activations=[Rectifier()]*2 + [None],
              dims=[output_dim, 256, 256, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()

    y_hat = mlp.apply(features)
    # print y_hat.shape.eval({x: np.zeros((1, 3, 32, 32), dtype=theano.config.floatX)})

    # Numerically stable softmax
    cost = Softmax().categorical_cross_entropy(y, y_hat)
    error_rate = MisclassificationRate().apply(y, y_hat)

    cg = ComputationGraph(cost)
    weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)
    l2_regularization = 0.005 * sum((W**2).sum() for W in weights)
    cost = cost + l2_regularization
    cost.name = 'cost_with_regularization'

    # Print sizes to check
    print("Representation sizes:")
    for layer in convnet.layers:
        print(layer.get_dim('input_'))

    # # # # # # # # # #
    # Model Training  #
    # # # # # # # # # #

    # Figure out data source
    train = CIFAR10("train")
    test = CIFAR10("test")

    # Load data using Fuel
    train_stream = DataStream.default_stream(
        dataset=train,
        iteration_scheme=SequentialScheme(train.num_examples, batch_size=128))
    test_stream = DataStream.default_stream(
        dataset=test,
        iteration_scheme=SequentialScheme(test.num_examples, batch_size=1024))

    # Train
    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=Adam(learning_rate=0.0005))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=[
            TrainingDataMonitoring([cost, error_rate],
                                   prefix='train',
                                   after_epoch=True),
            DataStreamMonitoring([cost, error_rate],
                                 test_stream,
                                 prefix='test'),
            ExperimentSaver(dest_directory='...', src_directory='.'),
            Printing(),
            ProgressBar()
        ])
    main_loop.run()
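# --- Entry point (editorial addition, not from the original source):
# run the CIFAR-10 experiment when the script is executed directly.
if __name__ == '__main__':
    main()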
def build_model_vanilla(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state                         if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
def training_model_mnist(learning_rate, momentum, iteration, batch_size,
                         epoch_end, iter_batch):
    x = T.tensor4('features')
    y = T.imatrix('targets')

    classifier = build_model_mnist()
    predict = classifier.apply(x)
    y_hat = Softmax().apply(predict)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    cost.name = "cost"
    cg = ComputationGraph(cost)

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error"

    train_set = MNIST(('train',))
    test_set = MNIST(("test",))

    if iteration == "slice":
        data_stream = DataStream.default_stream(
            train_set,
            iteration_scheme=SequentialScheme_slice(train_set.num_examples,
                                                    batch_size))
        data_stream_test = DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme_slice(test_set.num_examples,
                                                    batch_size))
    else:
        data_stream = DataStream.default_stream(
            train_set,
            iteration_scheme=SequentialScheme(train_set.num_examples,
                                              batch_size))
        data_stream_test = DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme(test_set.num_examples,
                                              batch_size))

    step_rule = Momentum(learning_rate=learning_rate, momentum=momentum)

    start = time.clock()
    time_spent = shared_floatx(np.float32(0.), name="time_spent")
    time_extension = Time_reference(start, time_spent, every_n_batches=1)

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_epochs=iter_batch)
    # Also monitor the time spent so far
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate, time_spent],
        data_stream=data_stream_test,
        prefix="valid",
        every_n_epochs=iter_batch)

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=epoch_end),
                  Printing(every_n_epochs=iter_batch),
                  time_extension]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def build_model_lstm(vocab_size, args, dtype=floatX):
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [LSTM(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=skip_connections * layers * state_dim +
        (1 - skip_connections) * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t in range(len(pre_rnn)):
            pre_rnn[t].name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        init_cells[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #      cell_1, in_1, forget_1, out_1 ...]
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states and cells
    # (the original erroneously fed last_states into the cell updates)
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # Extract the gate values
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {"in_gates": in_gates,
                   "forget_gates": forget_gates,
                   "out_gates": out_gates}

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state]                       if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    presoft = output_layer.apply(h[context:, :, :])

    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * time, feat)))
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')
    fork.initialize()

    # Don't initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
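# --- Monitoring sketch (editorial addition, not from the original source) ---
# Turns the `gate_values` dict returned by build_model_lstm into named scalar
# Theano variables (one mean activation per gate kind and per layer) that can
# be handed to TrainingDataMonitoring alongside the cost.
def mean_gate_monitors(gate_values):
    monitors = []
    for kind, gates in gate_values.items():
        for level, gate in enumerate(gates):
            mean_gate = gate.mean()
            mean_gate.name = 'mean_%s_%d' % (kind, level)
            monitors.append(mean_gate)
    return monitors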
def __init__(self, config):
    inp = tensor.imatrix('bytes')

    embed = theano.shared(
        config.embedding_matrix.astype(theano.config.floatX),
        name='embedding_matrix')
    in_repr = embed[inp.flatten(), :].reshape((inp.shape[0],
                                               inp.shape[1],
                                               config.repr_dim))
    in_repr.name = 'in_repr'

    bricks = []
    states = []

    # Construct predictive GRU hierarchy
    hidden = []
    costs = []
    next_target = in_repr.dimshuffle(1, 0, 2)
    for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                          config.cost_factors,
                                          config.hidden_q)):
        init_state = theano.shared(
            numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
            name='st0_%d'%i)

        linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                        name="lstm_in_%d"%i)
        lstm = GatedRecurrent(dim=hdim,
                              activation=config.activation_function,
                              name="lstm_rec_%d"%i)
        linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim,
                         name='lstm_out_%d'%i)
        tanh = Tanh('lstm_out_tanh_%d'%i)
        bricks += [linear, lstm, linear2, tanh]
        if i > 0:
            linear1 = Linear(input_dim=config.hidden_dims[i-1],
                             output_dim=3*hdim,
                             name='lstm_in2_%d'%i)
            bricks += [linear1]

        next_target = tensor.cast(next_target, dtype=theano.config.floatX)
        inter = linear.apply(theano.gradient.disconnected_grad(next_target))
        if i > 0:
            inter += linear1.apply(
                theano.gradient.disconnected_grad(hidden[-1][:-1, :, :]))
        new_hidden = lstm.apply(inputs=inter[:, :, :hdim],
                                gate_inputs=inter[:, :, hdim:],
                                states=init_state)
        states.append((init_state, new_hidden[-1, :, :]))

        hidden += [tensor.concatenate([init_state[None, :, :], new_hidden],
                                      axis=0)]

        pred = tanh.apply(linear2.apply(hidden[-1][:-1, :, :]))
        costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()]
        costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]

        diff = next_target - pred
        next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)

    # Construct output from hidden states
    hidden = [s.dimshuffle(1, 0, 2) for s in hidden]
    out_parts = []
    out_dims = config.out_hidden + [config.io_dim]
    for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
        pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                             name='pred_linear_%d'%i)
        bricks.append(pred_linear)
        lin = theano.gradient.disconnected_grad(state)
        out_parts.append(pred_linear.apply(lin))

    # Do prediction and calculate cost
    out = sum(out_parts)
    if len(out_dims) > 1:
        out = config.out_hidden_act[0](name='out_act0').apply(out)
        mlp = MLP(dims=out_dims,
                  activations=[x(name='out_act%d'%i)
                               for i, x in enumerate(config.out_hidden_act[1:])]
                              + [Identity()],
                  name='out_mlp')
        bricks.append(mlp)
        out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1), -1))
                        ).reshape((inp.shape[0], inp.shape[1]+1, -1))

    pred = out.argmax(axis=2)

    cost = Softmax().categorical_cross_entropy(
        inp.flatten(),
        out[:, :-1, :].reshape((inp.shape[0]*inp.shape[1],
                                config.io_dim))).mean()
    error_rate = tensor.neq(inp.flatten(), pred[:, :-1].flatten()).mean()

    sgd_cost = cost + sum(costs)

    # Initialize all bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()

    # Apply noise
    cg = ComputationGraph([sgd_cost, cost, error_rate] + costs)
    if config.weight_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.weight_noise)
    sgd_cost = cg.outputs[0]
    cost = cg.outputs[1]
    error_rate = cg.outputs[2]
    costs = cg.outputs[3:]

    # Put stuff into self that is useful for training or extensions
    self.sgd_cost = sgd_cost

    sgd_cost.name = 'sgd_cost'
    for i in range(len(costs)):
        costs[i].name = 'pred_cost_%d'%i
    cost.name = 'cost'
    error_rate.name = 'error_rate'
    self.monitor_vars = [costs, [cost], [error_rate]]

    self.out = out[:, 1:, :]
    self.pred = pred[:, 1:]
    self.states = states
def __init__(self, config):
    inp = tensor.imatrix('bytes')

    in_onehot = tensor.eq(tensor.arange(config.io_dim, dtype='int32')
                                .reshape((1, 1, config.io_dim)),
                          inp[:, :, None]).astype(theano.config.floatX)
    in_onehot.name = 'in_onehot'

    hidden_dim = sum(p['dim'] for p in config.layers)
    recvalues = tensor.concatenate(
        [in_onehot.dimshuffle(1, 0, 2),
         tensor.zeros((inp.shape[1], inp.shape[0], hidden_dim))],
        axis=2)

    # Construct hidden states
    indim = config.io_dim
    bricks = []
    states = []
    # `hidden` restored here: the dropout and L1 code below references it,
    # but the original snippet never defined it (assumed to be the list of
    # per-layer hidden states, as in an earlier revision of this class)
    hidden = []
    for i in xrange(1, len(config.layers)+1):
        p = config.layers[i-1]

        init_state = theano.shared(
            numpy.zeros((config.num_seqs,
                         p['dim'])).astype(theano.config.floatX),
            name='st0_%d'%i)
        init_cell = theano.shared(
            numpy.zeros((config.num_seqs,
                         p['dim'])).astype(theano.config.floatX),
            name='cell0_%d'%i)

        linear = Linear(input_dim=indim, output_dim=4*p['dim'],
                        name="lstm_in_%d"%i)
        bricks.append(linear)
        inter = linear.apply(recvalues[:, :, :indim])

        lstm = RstLSTM(dim=p['dim'], activation=config.activation_function,
                       name="lstm_rec_%d"%i)
        bricks.append(lstm)

        run_mask = None
        if 'run_on' in p:
            run_mask = compare_matrix(inp.T, p['run_on'])
        rst_in_mask = None
        if 'reset_before' in p:
            rst_in_mask = compare_matrix(inp.T, p['reset_before'])
        rst_out_mask = None
        if 'reset_after' in p:
            rst_out_mask = compare_matrix(inp.T, p['reset_after'])

        new_hidden, new_cells, rec_out = \
            lstm.apply_cond(inputs=inter,
                            states=init_state, cells=init_cell,
                            run_mask=run_mask,
                            rst_in_mask=rst_in_mask,
                            rst_out_mask=rst_out_mask)
        hidden.append(new_hidden)
        states.append((init_state, new_hidden[-1, :, :]))
        states.append((init_cell, new_cells[-1, :, :]))

        indim2 = indim + p['dim']
        recvalues = tensor.set_subtensor(recvalues[:, :, indim:indim2],
                                         rec_out)
        indim = indim2

    print "**** recvalues", recvalues.dtype
    for i, (u, v) in enumerate(states):
        print "**** state", i, u.dtype, v.dtype

    recvalues = recvalues.dimshuffle(1, 0, 2)

    # Construct output from hidden states
    top_linear = Linear(input_dim=indim, output_dim=config.io_dim,
                        name="top_linear")
    bricks.append(top_linear)
    out = top_linear.apply(recvalues)
    out.name = 'out'

    # Do prediction and calculate cost
    pred = out.argmax(axis=2).astype('int32')

    print "**** inp", inp.dtype
    print "**** out", out.dtype
    print "**** pred", pred.dtype

    cost = Softmax().categorical_cross_entropy(
        inp[:, 1:].flatten(),
        out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1),
                                config.io_dim))).mean()
    cost.name = 'cost'
    error_rate = tensor.neq(
        inp[:, 1:].flatten(),
        pred[:, :-1].flatten()).astype(theano.config.floatX).mean()

    print "**** cost", cost.dtype
    print "**** error_rate", error_rate.dtype

    # Initialize all bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()

    # Apply noise and dropout
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise_std > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise_std)
    if config.i_dropout > 0:
        cg = apply_dropout(cg, hidden[1:], config.i_dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    print "**** cost_reg", cost_reg.dtype
    print "**** error_rate_reg", error_rate_reg.dtype

    # Add L1 regularization
    if config.l1_reg > 0:
        l1pen = sum(abs(st).mean() for st in hidden[1:])
        cost_reg = cost_reg + config.l1_reg * l1pen
    if config.l1_reg_weight > 0:
        l1pen_w = sum(abs(w).mean()
                      for w in VariableFilter(roles=[WEIGHT])(cg))
        cost_reg = cost_reg + config.l1_reg_weight * l1pen_w

    cost_reg += 1e-10        # so that it is not the same Theano variable as cost
    error_rate_reg += 1e-10

    # Put stuff into self that is useful for training or extensions
    self.sgd_cost = cost_reg

    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    error_rate.name = 'error_rate'
    error_rate_reg.name = 'error_rate_reg'
    self.monitor_vars = [[cost], [cost_reg], [error_rate_reg]]

    self.out = out
    self.pred = pred
    self.states = states
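# --- Reset sketch (editorial addition, not from the original source) ---
# Zeroes the persistent hidden/cell state shared variables between text
# streams; assumes `model.states` holds (shared_variable, new_value) pairs
# as built in the constructors above.
def reset_states(model):
    import numpy
    for shared_var, _ in model.states:
        shared_var.set_value(numpy.zeros_like(shared_var.get_value()))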