def apply(self, input_lb, input_un, target):
    batch_size = input_lb.shape[0]
    get_labeled = lambda x: x[:batch_size] if x is not None else x
    input = T.concatenate([input_lb, input_un], axis=0)
    self.layer_dims = {0: self.input_dim}
    self.lr = self.shared(self.default_lr, "learning_rate", role=None)
    top = len(self.layers) - 1

    clean = self.encoder(input, noise_std=[0])
    corr = self.encoder(input, noise_std=self.noise_std)
    ests, costs = self.decoder(clean, corr, batch_size)

    # Costs
    y = target.flatten()
    costs.class_clean = CategoricalCrossEntropy().apply(y, get_labeled(clean.h[top]))
    costs.class_clean.name = "CE_clean"
    costs.class_corr = CategoricalCrossEntropy().apply(y, get_labeled(corr.h[top]))
    costs.class_corr.name = "CE_corr"

    costs.total = costs.class_corr * 1.0
    for i in range(len(self.layers)):
        costs.total += costs.denois[i] * self.denoising_cost_x[i]
    costs.total.name = "Total_cost"
    self.costs = costs

    # Classification error
    mr = MisclassificationRate()
    self.error = mr.apply(y, get_labeled(clean.h[top])) * np.float32(100.0)
    self.error.name = "Error_rate"
def setup_model(configs):
    tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5)
    # shape: T x B x C x X x Y
    input_ = tensor5("features")
    tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3)
    locs = tensor3("locs")
    # shape: B x Classes
    target = T.ivector("targets")
    model = LSTMAttention(configs, weights_init=Glorot(), biases_init=Constant(0))
    model.initialize()
    (h, c, location, scale, alpha, patch, downn_sampled_input,
     conved_part_1, conved_part_2, pre_lstm) = model.apply(input_, locs)
    model.location = location
    model.scale = scale
    model.alpha = alpha
    model.patch = patch

    classifier = MLP(
        [Rectifier(), Softmax()], configs["classifier_dims"],
        weights_init=Glorot(), biases_init=Constant(0))
    classifier.initialize()

    probabilities = classifier.apply(h[-1])
    cost = CategoricalCrossEntropy().apply(target, probabilities)
    cost.name = "CE"
    error_rate = MisclassificationRate().apply(target, probabilities)
    error_rate.name = "ER"
    model.cost = cost
    model.error_rate = error_rate
    model.probabilities = probabilities

    if configs["load_pretrained"]:
        blocks_model = Model(model.cost)
        all_params = blocks_model.parameters
        with open("VGG_CNN_params.npz", "rb") as f:
            loaded = np.load(f)
            all_conv_params = loaded.keys()
            for param in all_params:
                if param.name in loaded.keys():
                    assert param.get_value().shape == loaded[param.name].shape
                    param.set_value(loaded[param.name])
                    all_conv_params.pop(all_conv_params.index(param.name))
        print "the following parameters did not match: " + str(all_conv_params)

    if configs["test_model"]:
        print "TESTING THE MODEL: CHECK THE INPUT SIZE!"
        cg = ComputationGraph(model.cost)
        f = theano.function(cg.inputs, [model.cost],
                            on_unused_input="ignore",
                            allow_input_downcast=True)
        data = configs["get_streams"](configs["batch_size"])[0].get_epoch_iterator().next()
        f(data[1], data[0], data[2])
        print "Test passed! ;)"

    model.monitorings = [cost, error_rate]

    return model
def maxout_vae_mnist_test(path_vae_mnist):

    # load vae model on mnist
    vae_mnist = load(path_vae_mnist)
    maxout = Maxout()
    x = T.matrix('features')
    y = T.imatrix('targets')
    batch_size = 128

    z, _ = vae_mnist.sampler.sample(vae_mnist.encoder_mlp.apply(x))
    predict = maxout.apply(z)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    y_hat = Softmax().apply(predict)
    cost.name = 'cost'
    cg = ComputationGraph(cost)

    temp = cg.parameters
    for t, i in zip(temp, range(len(temp))):
        t.name = t.name + str(i) + "maxout"

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)

    # training
    step_rule = RMSProp(0.01, 0.9)
    #step_rule = Momentum(0.2, 0.9)
    train_set = MNIST('train')
    test_set = MNIST("test")

    data_stream_train = Flatten(DataStream.default_stream(
        train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)))
    data_stream_test = Flatten(DataStream.default_stream(
        test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size)))

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], data_stream=data_stream_train, prefix="train")
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate], data_stream=data_stream_test, prefix="test")

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=50),
                  Printing(every_n_epochs=1)]

    main_loop = MainLoop(data_stream=data_stream_train,
                         algorithm=algorithm, model=Model(cost),
                         extensions=extensions)
    main_loop.run()

    # save here
    from blocks.serialization import dump
    with closing(open('../data_mnist/maxout', 'w')) as f:
        dump(maxout, f)
def setup_model(configs):
    tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5)
    # shape: T x B x C x X x Y
    input_ = tensor5('features')
    # shape: B x Classes
    target = T.lmatrix('targets')
    model = LSTMAttention(
        configs,
        weights_init=Glorot(),
        biases_init=Constant(0))
    model.initialize()
    (h, c, location, scale, patch, downn_sampled_input,
     conved_part_1, conved_part_2, pre_lstm) = model.apply(input_)
    classifier = MLP(
        [Rectifier(), Logistic()],
        configs['classifier_dims'],
        weights_init=Glorot(),
        biases_init=Constant(0))
    classifier.initialize()

    probabilities = classifier.apply(h[-1])
    cost = BinaryCrossEntropy().apply(target, probabilities)
    cost.name = 'CE'
    error_rate = MisclassificationRate().apply(target, probabilities)
    error_rate.name = 'ER'
    model.cost = cost

    if configs['load_pretrained']:
        blocks_model = Model(model.cost)
        all_params = blocks_model.parameters
        with open('VGG_CNN_params.npz', 'rb') as f:
            loaded = np.load(f)
            all_conv_params = loaded.keys()
            for param in all_params:
                if param.name in loaded.keys():
                    assert param.get_value().shape == loaded[param.name].shape
                    param.set_value(loaded[param.name])
                    all_conv_params.pop(all_conv_params.index(param.name))
        print "the following parameters did not match: " + str(all_conv_params)

    if configs['test_model']:
        cg = ComputationGraph(model.cost)
        f = theano.function(cg.inputs, [model.cost],
                            on_unused_input='ignore',
                            allow_input_downcast=True)
        data = np.random.randn(10, 40, 3, 224, 224)
        # binary targets to match BinaryCrossEntropy and the lmatrix target
        targs = np.random.binomial(1, 0.5, size=(40, 101))
        f(data, targs)
        print "Test passed! ;)"

    model.monitorings = [cost, error_rate]

    return model
def test_misclassification_rate():
    y = tensor.vector(dtype="int32")
    yhat = tensor.matrix(theano.config.floatX)
    top1_brick = MisclassificationRate()
    top2_brick = MisclassificationRate(top_k=2)
    top3_brick = MisclassificationRate(top_k=3)
    f = theano.function([y, yhat],
                        [top1_brick.apply(y, yhat),
                         top2_brick.apply(y, yhat),
                         top3_brick.apply(y, yhat)])
    y_ = numpy.array([2, 1, 0, 1, 2], dtype="int32")
    yhat_ = numpy.array([[3, 2, 1, 0],
                         [1, 8, 2, 1],
                         [3, 8, 1, 2],
                         [1, 6, 4, 2],
                         [9, 7, 5, 5]], dtype="float32")
    top1_error = 0.6
    top2_error = 0.4
    top3_error = 0.2
    assert_allclose([top1_error, top2_error, top3_error], f(y_, yhat_))
def apply(self, input_labeled, target_labeled, input_unlabeled):
    self.layer_counter = 0
    self.layer_dims = {0: self.input_dim}
    self.lr = self.shared(self.default_lr, 'learning_rate', role=None)
    top = len(self.layers) - 1

    num_labeled = input_labeled.shape[0]
    self.join = lambda l, u: T.concatenate([l, u], axis=0)
    self.labeled = lambda x: x[:num_labeled] if x is not None else x
    self.unlabeled = lambda x: x[num_labeled:] if x is not None else x
    self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x))

    input_concat = self.join(input_labeled, input_unlabeled)
    clean = self.encoder(input_concat, 'clean',
                         input_noise_std=0.0,
                         noise_std=[])
    corr = self.encoder(input_concat, 'corr',
                        input_noise_std=self.super_noise_std,
                        noise_std=self.f_local_noise_std)
    est, costs = self.decoder(clean, corr)

    # Costs
    y = target_labeled.flatten()

    costs.class_clean = CategoricalCrossEntropy().apply(y, clean.labeled.h[top])
    costs.class_clean.name = 'CE_clean'
    costs.class_corr = CategoricalCrossEntropy().apply(y, corr.labeled.h[top])
    costs.class_corr.name = 'CE_corr'

    costs.total = costs.class_corr * 1.0
    for i in range(len(self.layers)):
        costs.total += costs.denois[i] * self.denoising_cost_x[i]
    costs.total.name = 'Total_cost'
    self.costs = costs

    # Classification error
    mr = MisclassificationRate()
    self.error = mr.apply(y, clean.labeled.h[top]) * np.float32(100.)
    self.error.name = 'Error_rate'
def main(name, epochs, batch_size, learning_rate, window_size, conv_sizes, num_filters, fc_dim, enc_dim, dec_dim, step, num_digits, num_classes, oldmodel, live_plotting): channels, img_height, img_width = 1, 100, 100 rnninits = { 'weights_init': Uniform(width=0.02), 'biases_init': Constant(0.), } inits = { 'weights_init': IsotropicGaussian(0.001), 'biases_init': Constant(0.), } rec_inits = { 'weights_init': IsotropicGaussian(0.001), 'biases_init': Constant(0.), } convinits = { 'weights_init': Uniform(width=.2), 'biases_init': Constant(0.), } n_iter = step * num_digits filter_size1, filter_size2 = zip(conv_sizes, conv_sizes)[:] w_height, w_width = window_size.split(',') w_height = int(w_height) w_width = int(w_width) subdir = time.strftime("%Y-%m-%d") + "-" + name if not os.path.exists(subdir): os.makedirs(subdir) lines = ["\n Running experiment", " subdirectory: %s" % subdir, " learning rate: %g" % learning_rate, " attention size: %s" % window_size, " n_iterations: %d" % n_iter, " encoder dimension: %d" % enc_dim, " decoder dimension: %d" % dec_dim, " batch size: %d" % batch_size, " epochs: %d" % epochs, ] for line in lines: print(line) print() rectifier = Rectifier() conv1 = Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 2), num_channels=channels, image_size=(w_height, w_width), border_mode='half', name='conv1', **convinits) conv1_bn = SpatialBatchNormalization(input_dim=(64, 26, 26), conserve_memory=False, n_iter=n_iter, name='conv1_bn') conv2 = Convolutional(filter_size=filter_size2, num_channels=int(num_filters / 2), num_filters=int(num_filters / 2), image_size=(26, 26), name='conv2', **convinits) conv2_bn = SpatialBatchNormalization(input_dim=(64, 24, 24), conserve_memory=False, n_iter=n_iter, name='conv2_bn') max_pooling = MaxPooling(pooling_size=(2, 2), step=(2, 2)) conv3 = Convolutional(filter_size=filter_size2, num_filters=num_filters, num_channels=int(num_filters / 2), image_size=(12, 12), border_mode='half', name='conv3', **convinits) conv3_bn = SpatialBatchNormalization(input_dim=(128, 12, 12), conserve_memory=False, n_iter=n_iter, name='conv3_bn') conv4 = Convolutional(filter_size=filter_size2, num_filters=num_filters, num_channels=num_filters, image_size=(12, 12), border_mode='half', name='conv4', **convinits) conv4_bn = SpatialBatchNormalization(input_dim=(128, 12, 12), conserve_memory=False, n_iter=n_iter, name='conv4_bn') # Max Pooling conv5 = Convolutional(filter_size=filter_size2, num_filters=160, num_channels=num_filters, image_size=(6, 6), border_mode='half', name='conv5', **convinits) conv5_bn = SpatialBatchNormalization(input_dim=(160, 6, 6), conserve_memory=False, n_iter=n_iter, name='conv5_bn') conv6 = Convolutional(filter_size=filter_size2, num_filters=192, num_channels=160, image_size=(6, 6), name='conv6', **convinits) conv6_bn = SpatialBatchNormalization(input_dim=(192, 4, 4), conserve_memory=False, n_iter=n_iter, name='conv6_bn') conv_mlp = MLP(activations=[Identity()], dims=[3072, fc_dim], name="MLP_conv", **inits) conv_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='conv_mlp_bn') loc_mlp = MLP(activations=[Identity()], dims=[6, fc_dim], name="MLP_loc", **inits) loc_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='loc_mlp_bn') encoder_mlp = MLP([Identity()], [fc_dim, 4 * enc_dim], name="MLP_enc", **rec_inits) decoder_mlp = MLP([Identity()], [enc_dim, 4 * dec_dim], name="MLP_dec", **rec_inits) encoder_rnn = LSTM(activation=Tanh(), dim=enc_dim, 
name="RNN_enc", **rnninits) conv_init = ConvolutionalSequence( [Convolutional(filter_size=filter_size1, num_filters=int(num_filters / 8), name='conv1_init'), SpatialBatchNormalization(conserve_memory=False, name='conv1_bn_init'), Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 8), name='conv2_init'), SpatialBatchNormalization(conserve_memory=False, name='conv2_bn_init'), Convolutional(filter_size=filter_size2, num_filters=int(num_filters / 4), name='conv3_init'), SpatialBatchNormalization(conserve_memory=False, name='conv3_bn_init'), ], image_size=(12, 12), num_channels=channels, name='conv_seq_init', **convinits) decoder_rnn = LSTM(activation=Tanh(), dim=dec_dim, name="RNN_dec", **rnninits) emit_mlp = MLP(activations=[Tanh()], dims=[dec_dim, 6], name='emit_mlp', weights_init=Constant(0.), biases_init=Constant((1., 0., 0., 0., 1., 0.))) classification_mlp1 = MLP(activations=[Identity()], dims=[enc_dim, fc_dim], name='MPL_class1', **inits) classification_mlp1_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='classification_mlp1_bn') classification_mlp2 = MLP(activations=[Identity()], dims=[fc_dim, fc_dim], name='MPL_class2', **inits) classification_mlp2_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False, n_iter=n_iter, name='classification_mlp2_bn') classification_mlp3 = MLP(activations=[Softmax()], dims=[fc_dim, num_classes], name='MPL_class3', **inits) edram = EDRAM(channels=channels, out_height=w_height, out_width=w_width, n_iter=n_iter, num_classes=num_classes, rectifier=rectifier, conv1=conv1, conv1_bn=conv1_bn, conv2=conv2, conv2_bn=conv2_bn, max_pooling=max_pooling, conv3=conv3, conv3_bn=conv3_bn, conv4=conv4, conv4_bn=conv4_bn, conv5=conv5, conv5_bn=conv5_bn, conv6=conv6, conv6_bn=conv6_bn, conv_mlp=conv_mlp, conv_mlp_bn=conv_mlp_bn, loc_mlp=loc_mlp, loc_mlp_bn=loc_mlp_bn, conv_init=conv_init, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn, classification_mlp1=classification_mlp1, classification_mlp1_bn=classification_mlp1_bn, classification_mlp2=classification_mlp2, classification_mlp2_bn=classification_mlp2_bn, classification_mlp3=classification_mlp3, emit_mlp=emit_mlp) edram.initialize() # ------------------------------------------------------------------------ x = T.ftensor4('features') x_coarse = T.ftensor4('features_coarse') y = T.ivector('labels') wr = T.fmatrix('locations') with batch_normalization(edram): bn_p, bn_l, m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, \ m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn = edram.calculate_train(x, x_coarse) def compute_cost(p, wr, y, l): cost_where = T.dot(T.sqr(wr - l), [1, 0.5, 1, 0.5, 1, 1]) cost_y = T.stack([T.nnet.categorical_crossentropy(T.maximum(p[i, :], 1e-7), y) for i in range(0, n_iter)]) return cost_where, cost_y cost_where, cost_y = compute_cost(bn_p, wr, y, bn_l) bn_cost = cost_y + cost_where bn_cost = bn_cost.sum(axis=0) bn_cost = bn_cost.mean() bn_cost.name = 'cost' bn_error_rate = MisclassificationRate().apply(y, bn_p[-1]) bn_error_rate.name = 'error_rate' # ------------------------------------------------------------ bn_cg = ComputationGraph([bn_cost, bn_error_rate]) # Prepare algorithm algorithm = GradientDescent( cost=bn_cg.outputs[0], on_unused_sources='ignore', parameters=bn_cg.parameters, step_rule=CompositeRule([ RemoveNotFinite(), StepClipping(10.), Adam(learning_rate) ]) ) pop_updates = 
get_batch_normalization_updates(bn_cg) update_params = [conv1_bn.population_mean, conv1_bn.population_stdev, conv2_bn.population_mean, conv2_bn.population_stdev, conv3_bn.population_mean, conv3_bn.population_stdev, conv4_bn.population_mean, conv4_bn.population_stdev, conv5_bn.population_mean, conv5_bn.population_stdev, conv6_bn.population_mean, conv6_bn.population_stdev, conv_mlp_bn.population_mean, conv_mlp_bn.population_stdev, loc_mlp_bn.population_mean, loc_mlp_bn.population_stdev, classification_mlp1_bn.population_mean, classification_mlp1_bn.population_stdev, classification_mlp2_bn.population_mean, classification_mlp2_bn.population_stdev] update_values = [m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn] pop_updates.extend([(p, m) for p, m in zip(update_params, update_values)]) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] algorithm.add_updates(extra_updates) # ------------------------------------------------------------------------ # Setup monitors p, l = edram.calculate_test(x, x_coarse) cost_where, cost_y = compute_cost(p, wr, y, l) cost = cost_y + cost_where cost = cost.sum(axis=0) cost = cost.mean() cost.name = 'cost' error_rate = MisclassificationRate().apply(y, p[-1]) error_rate.name = 'error_rate' monitors = [cost, error_rate] plotting_extensions = [] # Live plotting... if live_plotting: plot_channels = [ ['train_cost', 'test_cost'], ['train_error_rate', 'test_error_rate'], ] plotting_extensions = [ Plot(subdir, channels=plot_channels, server_url='http://155.69.150.60:80/') ] # ------------------------------------------------------------ mnist_cluttered_train = MNISTCluttered(which_sets=['train'], sources=('features', 'locations', 'labels')) mnist_cluttered_test = MNISTCluttered(which_sets=['test'], sources=('features', 'locations', 'labels')) main_loop = MainLoop( model=Model([bn_cost]), data_stream=DataStream.default_stream(mnist_cluttered_train, iteration_scheme=ShuffledScheme(mnist_cluttered_train.num_examples, batch_size)), algorithm=algorithm, extensions=[Timing(), FinishAfter(after_n_epochs=epochs), DataStreamMonitoring(monitors, DataStream.default_stream(mnist_cluttered_train, iteration_scheme=SequentialScheme(mnist_cluttered_train.num_examples, batch_size)), prefix='train'), DataStreamMonitoring(monitors, DataStream.default_stream(mnist_cluttered_test, iteration_scheme=SequentialScheme(mnist_cluttered_test.num_examples, batch_size)), prefix="test"), PartsOnlyCheckpoint("{}/{}".format(subdir, name), before_training=False, after_epoch=True, save_separately=['log', ]), TrackTheBest('test_error_rate', 'best_test_error_rate'), BestCheckpount("{}/{}".format(subdir, name), 'best_test_error_rate', save_separately=['model', ]), Printing(), ProgressBar(), PrintingTo("\n".join(lines), "{}/{}_log.txt".format(subdir, name)), ] + plotting_extensions) if oldmodel is not None: print("Initializing parameters with old model %s" % oldmodel) with open(oldmodel, "rb") as f: oldmodel = pickle.load(f) main_loop.model.set_parameter_values(oldmodel.get_parameter_values()) main_loop.model.get_top_bricks()[0].conv1_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv1_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_stdev.get_value()) 
main_loop.model.get_top_bricks()[0].conv2_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv2_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv3_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv3_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv4_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv4_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv5_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv5_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv6_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv6_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_mean.set_value( oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_stdev.set_value( oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_mean.set_value( oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_mean.get_value()) main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_stdev.set_value( oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_mean.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[1].population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_stdev.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[1].population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_mean.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[3].population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_stdev.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[3].population_stdev.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_mean.set_value( oldmodel.get_top_bricks()[0].conv_init.layers[5].population_mean.get_value()) main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_stdev.set_value( 
oldmodel.get_top_bricks()[0].conv_init.layers[5].population_stdev.get_value()) del oldmodel main_loop.run()
    biases_init=IsotropicGaussian(),
    prototype=input_mlp,
)
parallel_nets.initialize()
l_h, r_h = parallel_nets.apply(l_x=l_x, r_x=r_x)

# Concatenate the inputs from the two hidden subnets into a single variable
# for input into the next layer.
merge = tensor.concatenate([l_h, r_h], axis=1)
y_hat = output_mlp.apply(merge)

# Define a cost function to optimize, and a classification error rate.
# Also apply the outputs from the net, and corresponding targets:
cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
error = MisclassificationRate().apply(y.flatten(), y_hat)
error.name = 'error'

# Need to define the computation graph:
graph = ComputationGraph(cost)

# This returns a list of weight vectors for each layer
W = VariableFilter(roles=[WEIGHT])(graph.variables)

# Add some regularization to this model:
lam = 0.001
cost += lam * l2_norm(W)
cost.name = 'entropy'

# This is the model without dropout, but with l2 reg.
model = Model(cost)
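# The snippet above stops at the l2-regularised model. If dropout were also
# wanted (as the sibling scripts in this collection apply it), a minimal
# sketch using Blocks' apply_dropout on the pre-regularisation graph built
# above could look like this; the 0.2 rate and the 'linear_' name prefix are
# assumptions, not part of the original snippet.
from blocks.filter import VariableFilter
from blocks.graph import apply_dropout
from blocks.roles import INPUT

inputs = VariableFilter(roles=[INPUT])(graph.variables)
# Only drop the inputs that feed Linear bricks.
linear_inputs = [v for v in inputs if v.name and v.name.startswith('linear_')]
dropout_graph = apply_dropout(graph, linear_inputs, 0.2)
dropout_cost = dropout_graph.outputs[0]
dropout_cost.name = 'dropout_entropy'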
def main(job_id, params): config = ConfigParser.ConfigParser() config.readfp(open('./params')) max_epoch = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 512)) test_batch = int(config.get('hyperparams', 'valid_batch', 512)) hidden_units = int(config.get('hyperparams', 'hidden_units', 16)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) b_sd = float(config.get('hyperparams', 'b_sd', 0.01)) b_mu = float(config.get('hyperparams', 'b_mu', 0.0)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) max_norm = float(config.get('hyperparams', 'max_norm', 100.0)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') fine_tune = config.getboolean('hyperparams', 'fine_tune') # Spearmint optimization parameters: if params: base_lr = float(params['base_lr'][0]) dropout_ratio = float(params['dropout_ratio'][0]) hidden_units = params['hidden_units'][0] weight_decay = params['weight_decay'][0] if 'adagrad' in solver: solver_type = CompositeRule([AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) else: solver_type = CompositeRule([RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13' ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45' right_dim = 10519 left_dim = 11427 train = H5PYDataset(data_file, which_set='train') valid = H5PYDataset(data_file, which_set='valid') test = H5PYDataset(data_file, which_set='test') l_x = tensor.matrix('l_features') r_x = tensor.matrix('r_features') y = tensor.lmatrix('targets') lnet = load(ln_file).model.get_top_bricks()[0] rnet = load(rn_file).model.get_top_bricks()[0] # Pre-trained layers: # Inputs -> hidden_1 -> hidden 2 for side, net in zip(['l', 'r'], [lnet, rnet]): for child in net.children: child.name = side + '_' + child.name ll1 = lnet.children[0] lr1 = lnet.children[1] ll2 = lnet.children[2] lr2 = lnet.children[3] rl1 = rnet.children[0] rr1 = rnet.children[1] rl2 = rnet.children[2] rr2 = rnet.children[3] l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x)))) r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x)))) input_dim = ll2.output_dim + rl2.output_dim # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output output_mlp = MLP(activations=[ Rectifier(name='h3'), Rectifier(name='h4'), Softmax(name='output'), ], dims=[ input_dim, hidden_units, hidden_units, 2, ], weights_init=IsotropicGaussian(std=W_sd, mean=W_mu), biases_init=IsotropicGaussian(std=W_sd, mean=W_mu)) output_mlp.initialize() # # Concatenate the inputs from the two hidden subnets into a single variable # # for input into the next layer. merge = tensor.concatenate([l_h, r_h], axis=1) # y_hat = output_mlp.apply(merge) # Define a cost function to optimize, and a classification error rate. 
# Also apply the outputs from the net and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # This is the model: before applying dropout model = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [input for input in inputs if input.name.startswith('linear_')] dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2) dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # If no fine-tuning of l-r models is wanted, find the params for only # the joint layers: if fine_tune: params_to_update = dropout_graph.parameters else: params_to_update = VariableFilter([PARAMETER], bricks=output_mlp.children)(cost_graph) # Learning Algorithm: algo = GradientDescent( step_rule=solver_type, params=params_to_update, cost=dropout_cost) # algo.step_rule.learning_rate.name = 'learning_rate' # Data stream used for training model: training_stream = Flatten( DataStream.default_stream( dataset=train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size=train_batch))) training_monitor = TrainingDataMonitoring([dropout_cost, aggregation.mean(error), aggregation.mean(algo.total_gradient_norm)], after_batch=True) # Use the 'valid' set for validation during training: validation_stream = Flatten( DataStream.default_stream( dataset=valid, iteration_scheme=ShuffledScheme( valid.num_examples, batch_size=valid_batch))) validation_monitor = DataStreamMonitoring( variables=[cost, error], data_stream=validation_stream, prefix='validation', after_epoch=True) test_stream = Flatten( DataStream.default_stream( dataset=test, iteration_scheme=ShuffledScheme( test.num_examples, batch_size=test_batch))) test_monitor = DataStreamMonitoring( variables=[error], data_stream=test_stream, prefix='test', after_training=True) plotting = Plot('AdniNet_LeftRight', channels=[ ['dropout_entropy'], ['error', 'validation_error'], ], ) # Checkpoint class used to save model and log: stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M') checkpoint = Checkpoint('./models/{}'.format(stamp), save_separately=['model', 'log'], every_n_epochs=1) # The main loop will train the network and output reports, etc main_loop = MainLoop( data_stream=training_stream, model=model, algorithm=algo, extensions=[ validation_monitor, training_monitor, plotting, FinishAfter(after_n_epochs=max_epoch), FinishIfNoImprovementAfter(notification_name='validation_error', epochs=1), Printing(), ProgressBar(), checkpoint, test_monitor, ]) main_loop.run() ve = float(main_loop.log.last_epoch_row['validation_error']) te = float(main_loop.log.last_epoch_row['error']) spearmint_loss = ve + abs(te - ve) print 'Spearmint Loss: {}'.format(spearmint_loss) return spearmint_loss
from blocks.bricks import Linear, Logistic, Softmax

# In[10]:

hidden_layer_size = 100
input_to_hidden = Linear(name='input_to_hidden', input_dim=117,
                         output_dim=hidden_layer_size)
h = Logistic().apply(input_to_hidden.apply(x))
hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_layer_size,
                          output_dim=2)
y_hat = Softmax().apply(hidden_to_output.apply(h))

y = tensor.lmatrix('targets')
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
cost = CategoricalCrossEntropy().apply(y, y_hat)
error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = "error_rate"

# >>> from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
# >>> from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
# >>> W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# >>> cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
# >>> cost.name = 'cost_with_regularization'
cost.name = 'cost_simple_xentropy'

from blocks.initialization import IsotropicGaussian, Constant
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
hidden_to_output.initialize()
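# The '>>>' comments in the cell above sketch weight-decay regularisation.
# A minimal uncommented version, assuming the same two Linear bricks and the
# `cg` built above, might be:
from blocks.roles import WEIGHT
from blocks.filter import VariableFilter

W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
cost_with_reg = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
cost_with_reg.name = 'cost_with_regularization'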
def training_model_mnist(learning_rate, momentum, iteration, batch_size, epoch_end, iter_batch):

    x = T.tensor4('features')
    y = T.imatrix('targets')

    classifier = build_model_mnist()
    predict = classifier.apply(x)
    y_hat = Softmax().apply(predict)

    cost = Softmax().categorical_cross_entropy(y.flatten(), predict)
    cost.name = "cost"
    cg = ComputationGraph(cost)

    error_brick = MisclassificationRate()
    error_rate = error_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error"

    train_set = MNIST(('train', ))
    test_set = MNIST(("test", ))

    if iteration == "slice":
        data_stream = DataStream.default_stream(
            train_set,
            iteration_scheme=SequentialScheme_slice(train_set.num_examples, batch_size))
        data_stream_test = DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme_slice(test_set.num_examples, batch_size))
    else:
        data_stream = DataStream.default_stream(
            train_set,
            iteration_scheme=SequentialScheme(train_set.num_examples, batch_size))
        data_stream_test = DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme(test_set.num_examples, batch_size))

    step_rule = Momentum(learning_rate=learning_rate, momentum=momentum)

    start = time.clock()
    time_spent = shared_floatx(np.float32(0.), name="time_spent")
    time_extension = Time_reference(start, time_spent, every_n_batches=1)

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=step_rule)

    monitor_train = TrainingDataMonitoring(
        variables=[cost], prefix="train", every_n_epochs=iter_batch)
    # add a monitor variable about the time
    monitor_valid = DataStreamMonitoring(
        variables=[cost, error_rate, time_spent],
        data_stream=data_stream_test,
        prefix="valid", every_n_epochs=iter_batch)

    extensions = [monitor_train,
                  monitor_valid,
                  FinishAfter(after_n_epochs=epoch_end),
                  Printing(every_n_epochs=iter_batch),
                  time_extension]

    main_loop = MainLoop(data_stream=data_stream,
                         algorithm=algorithm, model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def main(save_to, num_epochs, subset=None, num_batches=None, batch_size=None, regularization=None, annealing=None, histogram=None, resume=False): output_size = 10 convnet = create_noisy_all_conv_net(batch_size, True) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) test_components = (ComponentwiseCrossEntropy().apply( y.flatten(), probs).copy(name='components')) test_error_rate = (MisclassificationRate().apply( y.flatten(), probs).copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), probs).copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate, test_components]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg train_cg = test_cg train_cost, train_error_rate, train_components = (test_cost, test_error_rate, test_components) # Apply regularization to the cost trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])( train_cg.parameters) noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters) biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables) test_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables) test_nit_rate = tensor.concatenate([n.flatten() for n in test_nits]).mean() test_nit_rate.name = 'nit_rate' train_nit_rate = test_nit_rate l2_norm = sum([(W**2).sum() for W in weights]) l2_norm.name = 'l2_norm' l2_regularization = 0.0001 * l2_norm l2_regularization.name = 'l2_regularization' # test_cost = test_cost + l2_regularization # test_cost.name = 'cost_with_regularization' mean_log_sigma = tensor.concatenate([n.flatten() for n in logsigma]).mean() mean_log_sigma.name = 'log_sigma' # Training version of cost nit_penalty = theano.shared( numpy.asarray(regularization, dtype=theano.config.floatX)) nit_penalty.name = 'nit_penalty' train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' nit_regularization = nit_penalty * train_nit_rate nit_regularization.name = 'nit_regularization' train_cost = train_cost + nit_regularization + l2_regularization train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train", )) cifar10_train_stream = RandomPadCropFlip(NormalizeBatchLevels( DataStream.default_stream(cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features', )), (32, 32), pad=5, which_sources=('features', )) # cifar10_train_stream = NormalizeBatchLevels(DataStream.default_stream( # cifar10_train, iteration_scheme=ShuffledScheme( # cifar10_train.num_examples, batch_size)), # which_sources=('features',)) cifar10_test = CIFAR10(("test", )) cifar10_test_stream = 
NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme(cifar10_test.num_examples, batch_size)), which_sources=('features', )) momentum = Momentum(0.002, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) step_rule = CompositeRule([StepClipping(10), momentum]) # Train with simple SGD algorithm = GradientDescent(cost=train_cost, parameters=trainable_parameters, step_rule=step_rule) add_noise = NoiseExtension(noise_parameters=noise_parameters) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [ Timing(), add_noise, FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [(1, 0.005), (3, 0.01), (5, 0.02), (200, 0.002), (250, 0.0002), (300, 0.00002)]), NoisyDataStreamMonitoring( [test_cost, test_error_rate, test_nit_rate, test_confusion], cifar10_test_stream, noise_parameters=noise_parameters, prefix="test"), TrainingDataMonitoring([ train_cost, train_error_rate, train_nit_rate, train_cost_without_regularization, l2_norm, nit_penalty, l2_regularization, nit_regularization, mean_log_sigma, momentum.learning_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", every_n_batches=100, after_epoch=True), Plot('Training performance for ' + save_to, channels=[ [ 'train_cost_with_regularization', 'train_cost_without_regularization', 'train_l2_regularization', 'train_nit_regularization' ], ['train_error_rate'], ['train_total_gradient_norm'], ], every_n_batches=100), Plot('Test performance for ' + save_to, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing() ] if annealing: extensions.append(EpochExponentiation(nit_penalty, 1 - annealing)) if histogram: attribution = AttributionExtension(components=train_components, parameters=trainable_parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(save_to, True, True)) model = Model(train_cost) main_loop = MainLoop(algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None,
         batch_size=None, num_batches=None):
    if feature_maps is None:
        feature_maps = [32, 48, 64, 80, 128, 128]
    if mlp_hiddens is None:
        mlp_hiddens = [1000]
    if conv_sizes is None:
        conv_sizes = [7, 5, 5, 5, 5, 4]
    if pool_sizes is None:
        pool_sizes = [3, 2, 2, 2, 2, 1]
    if batch_size is None:
        batch_size = 64
    conv_steps = [2, 1, 1, 1, 1, 1]  # same as stride
    image_size = (256, 256)
    output_size = 2
    learningRate = 0.001
    drop_prob = 0.5
    weight_noise = 0.75
    num_epochs = 250
    num_batches = None

    # ... (part of the function is missing in the source; it resumes inside a
    # try/except block that appends a Plot extension) ...
    host_plot='http://*****:*****@ %s' % (graph_name, datetime.datetime.now(),
                                          socket.gethostname()),
                 channels=[['train_error_rate', 'valid_error_rate'],
                           ['train_total_gradient_norm']],
                 after_epoch=True, server_url=host_plot))
        PLOT_AVAILABLE = True
    except ImportError:
        PLOT_AVAILABLE = False

    extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True,
                                 save_separately=['log']))

    logger.info("Building the model")
    model = Model(cost)

    ########### Loading images #####################
    main_loop = MainLoop(
        algorithm,
        stream_data_train,
        model=model,
        extensions=extensions)

    main_loop.run()
def training_committee_member(instance, learning_rate, train, batch_size, valid, valid_full=1.): valid_full+=0.1 x = T.tensor4('x') y = T.imatrix('y') y_prev = instance.apply(x) #cost = CategoricalCrossEntropy().apply(y.flatten(), y_prev).mean() cost = Softmax().categorical_cross_entropy(y.flatten(), y_prev).mean() error = MisclassificationRate().apply(y.flatten(), Softmax().apply(y_prev)).mean() # take only the last parameters to avoid having the same members among the committee W, B = get_Params(y_prev) # take all the parameters for now !!!! params=W[:2]+B[:2] """ w = W[0]; layer_number=None for w_tmp in W: if layer_number is None: (_, layer_number, _) = analyze_param_name(w.name) (_, layer_number_tmp, _) = analyze_param_name(w_tmp.name) if layer_number_tmp > layer_number: w = w_tmp layer_number=layer_number_tmp for b_tmp in B: (_, layer_number_tmp, _) = analyze_param_name(b_tmp.name) if layer_number == layer_number_tmp: b = b_tmp break params = [w,b] """ #updates, _ = Adam(cost, params, learning_rate) updates,_ = RMSProp(cost, params, learning_rate, decay_rate=0.9) #updates, _ = Sgd(cost, params, learning_rate) train_function = theano.function([x,y], cost, updates=updates, allow_input_downcast=True) test_function = theano.function([x,y], cost, allow_input_downcast=True) error_function = theano.function([x,y], error, allow_input_downcast=True) x_train, y_train = train x_valid, y_valid = valid n_train = len(y_train)/batch_size stop=False init_increment = 5 increment = init_increment error_cost=[]; train_cost=[] n_valid = len(y_valid)/batch_size for minibatch in range(n_valid): x_value = x_valid[minibatch*batch_size:(minibatch+1)*batch_size] y_value = y_valid[minibatch*batch_size:(minibatch+1)*batch_size] error_cost.append(error_function(x_value, y_value)) train_cost.append(test_function(x_value, y_value)) before = np.mean(error_cost) if before <=valid_full: stop=True best_score = np.mean(train_cost) best_error = before while not stop: train_cost=[]; error_cost=[] for minibatch in range(n_train): x_value = x_train[minibatch*batch_size:(minibatch+1)*batch_size] y_value = y_train[minibatch*batch_size:(minibatch+1)*batch_size] train_function(x_value, y_value) if minibatch ==10: if increment !=0: for minibatch in range(n_valid): x_value = x_valid[minibatch*batch_size:(minibatch+1)*batch_size] y_value = y_valid[minibatch*batch_size:(minibatch+1)*batch_size] train_cost.append(test_function(x_value, y_value)) error_cost.append(error_function(x_value, y_value)) #print np.mean(train_cost)*100 error = np.mean(error_cost) score = np.mean(train_cost) if error <= valid_full: #print 'A' stop=True break elif score < best_score*0.995: best_score = score best_error = error increment = init_increment #print (best_score, np.mean(error_cost)*100) else: #print 'hihi', increment increment-=1 #print (best_score, score, np.mean(error_cost)*100) else: stop=True break # evaluation validation ! """ error_cost=[] n_valid = len(y_valid)/batch_size for minibatch in range(n_valid): x_value = x_valid[minibatch*batch_size:(minibatch+1)*batch_size] y_value = y_valid[minibatch*batch_size:(minibatch+1)*batch_size] error_cost.append(error_function(x_value, y_value)) """ # new_round with more precision if np.mean(error_cost) <=valid_full or learning_rate <=1e-5: print (before, best_error*100) print '#####' return instance else: #print 'restart' return training_committee_member(instance, learning_rate*0.1, train, batch_size, valid, valid_full)
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=200, num_batches=None): if feature_maps is None: feature_maps = [32, 32, 64, 64, 128, 128] if mlp_hiddens is None: mlp_hiddens = [1000] if conv_sizes is None: conv_sizes = [7, 5, 5, 5, 3, 3] if pool_sizes is None: pool_sizes = [2, 2, 2, 2, 2, 2] image_size = (128, 128) batch_size = 64 output_size = 2 learningRate = 0.01 drop_prob = 0.4 weight_noise = 0.75 num_epochs = 150 num_batches = None # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) # Save on csv # numpy.save(probs) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) error_rate2 = error_rate.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables) ############# Dropout ############# logger.info('Applying dropout') cg = apply_dropout(cg, weights[0:3], drop_prob) dropped_out = VariableFilter(roles=[DROPOUT])(cg.variables) ############# Guaussian Noise ############# logger.info('Applying Gaussian noise') cg = apply_noise(cg, weights, weight_noise) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream(data, iteration_scheme=ShuffledScheme( data.num_examples, batch_size)) stream = MinimumImageDimensions(stream, image_size, which_sources=('image_features', )) stream = MaximumImageDimensions(stream, image_size, which_sources=('image_features', )) stream = RandomHorizontalSwap(stream, which_sources=('image_features', )) stream = Random2DRotation(stream, which_sources=('image_features', )) #stream = ScikitResize(stream, image_size, which_sources=('image_features',)) stream = ScaleAndShift(stream, 1. 
/ 255, 0, which_sources=('image_features', )) stream = Cast(stream, dtype='float32', which_sources=('image_features', )) return stream stream_data_train = create_data( DogsVsCats(('train', ), subset=slice(0, 22500))) stream_data_test = create_data( DogsVsCats(('train', ), subset=slice(22500, 25000))) #stream_data_train = create_data(DogsVsCats(('train',), subset=slice(0, 10))) #stream_data_test = create_data(DogsVsCats(('train',), subset=slice(10, 12))) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum(learning_rate=learningRate, momentum=0.7)) #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Scale(learning_rate=learningRate)) #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Adam(0.001)) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [] extensions.append(Timing()) extensions.append( FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches)) extensions.append( DataStreamMonitoring([cost, error_rate], stream_data_test, prefix="valid")) extensions.append( TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True)) extensions.append( Checkpoint("Model1_uniform_init.pkl", after_epoch=True, after_training=True, save_separately=['log'])) extensions.append(ProgressBar()) extensions.append(Printing()) host_plot = 'http://hades.calculquebec.ca:5090' extensions.append( Plot('5C 3*3C 2*2P 204080...F 004LR 09Mom %s %s @ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) logger.info("Building the model") model = Model(cost) ########### Loading images ##################### main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
def error(self, x, y):
    y_pred = Softmax().apply(self.apply(x))
    return MisclassificationRate().apply(y.flatten(), y_pred).mean()
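# Hypothetical usage sketch for the error() method above: compile it into a
# callable monitor. The symbolic inputs and the `model` instance are
# assumptions, not part of the original snippet.
import theano
from theano import tensor

x = tensor.matrix('features')
y = tensor.imatrix('targets')
error_fn = theano.function([x, y], model.error(x, y),
                           allow_input_downcast=True)
# error_fn(features_batch, targets_batch) -> scalar misclassification rate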
def main(job_id, params, config_file='params.ec'): config = ConfigParser.ConfigParser() config.readfp(open('./configs/{}'.format(config_file))) pr = pprint.PrettyPrinter(indent=4) pr.pprint(config) net_name = config.get('hyperparams', 'net_name', 'adni') struct_name = net_name.split('_')[0] max_epoch = int(config.get('hyperparams', 'max_iter', 100)) base_lr = float(config.get('hyperparams', 'base_lr', 0.01)) train_batch = int(config.get('hyperparams', 'train_batch', 256)) valid_batch = int(config.get('hyperparams', 'valid_batch', 512)) test_batch = int(config.get('hyperparams', 'valid_batch', 512)) W_sd = float(config.get('hyperparams', 'W_sd', 0.01)) W_mu = float(config.get('hyperparams', 'W_mu', 0.0)) b_sd = float(config.get('hyperparams', 'b_sd', 0.01)) b_mu = float(config.get('hyperparams', 'b_mu', 0.0)) hidden_units = int(config.get('hyperparams', 'hidden_units', 32)) input_dropout_ratio = float(config.get('hyperparams', 'input_dropout_ratio', 0.2)) dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2)) weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001)) max_norm = float(config.get('hyperparams', 'max_norm', 100.0)) solver = config.get('hyperparams', 'solver_type', 'rmsprop') data_file = config.get('hyperparams', 'data_file') side = config.get('hyperparams', 'side', 'b') input_dim = input_dims[struct_name] # Spearmint optimization parameters: if params: base_lr = float(params['base_lr'][0]) dropout_ratio = float(params['dropout_ratio'][0]) hidden_units = params['hidden_units'][0] weight_decay = params['weight_decay'][0] if 'adagrad' in solver: solver_type = CompositeRule([AdaGrad(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) else: solver_type = CompositeRule([RMSProp(learning_rate=base_lr), VariableClipping(threshold=max_norm)]) data_file = config.get('hyperparams', 'data_file') if 'b' in side: train = H5PYDataset(data_file, which_set='train') valid = H5PYDataset(data_file, which_set='valid') test = H5PYDataset(data_file, which_set='test') x_l = tensor.matrix('l_features') x_r = tensor.matrix('r_features') x = tensor.concatenate([x_l, x_r], axis=1) else: train = H5PYDataset(data_file, which_set='train', sources=['{}_features'.format(side), 'targets']) valid = H5PYDataset(data_file, which_set='valid', sources=['{}_features'.format(side), 'targets']) test = H5PYDataset(data_file, which_set='test', sources=['{}_features'.format(side), 'targets']) x = tensor.matrix('{}_features'.format(side)) y = tensor.lmatrix('targets') # Define a feed-forward net with an input, two hidden layers, and a softmax output: model = MLP(activations=[ Rectifier(name='h1'), Rectifier(name='h2'), Softmax(name='output'), ], dims=[ input_dim[side], hidden_units, hidden_units, 2], weights_init=IsotropicGaussian(std=W_sd, mean=W_mu), biases_init=IsotropicGaussian(b_sd, b_mu)) # Don't forget to initialize params: model.initialize() # y_hat is the output of the neural net with x as its inputs y_hat = model.apply(x) # Define a cost function to optimize, and a classification error rate. 
# Also apply the outputs from the net and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # This is the model: before applying dropout model = Model(cost) # Need to define the computation graph for the cost func: cost_graph = ComputationGraph([cost]) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(cost_graph.variables) # Add some regularization to this model: cost += weight_decay * l2_norm(W) cost.name = 'entropy' # computational graph with l2 reg cost_graph = ComputationGraph([cost]) # Apply dropout to inputs: inputs = VariableFilter([INPUT])(cost_graph.variables) dropout_inputs = [input for input in inputs if input.name.startswith('linear_')] dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], input_dropout_ratio) dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:], dropout_ratio) dropout_cost = dropout_graph.outputs[0] dropout_cost.name = 'dropout_entropy' # Learning Algorithm (notice: we use the dropout cost for learning): algo = GradientDescent( step_rule=solver_type, params=dropout_graph.parameters, cost=dropout_cost) # algo.step_rule.learning_rate.name = 'learning_rate' # Data stream used for training model: training_stream = Flatten( DataStream.default_stream( dataset=train, iteration_scheme=ShuffledScheme( train.num_examples, batch_size=train_batch))) training_monitor = TrainingDataMonitoring([dropout_cost, aggregation.mean(error), aggregation.mean(algo.total_gradient_norm)], after_batch=True) # Use the 'valid' set for validation during training: validation_stream = Flatten( DataStream.default_stream( dataset=valid, iteration_scheme=ShuffledScheme( valid.num_examples, batch_size=valid_batch))) validation_monitor = DataStreamMonitoring( variables=[cost, error], data_stream=validation_stream, prefix='validation', after_epoch=True) test_stream = Flatten( DataStream.default_stream( dataset=test, iteration_scheme=ShuffledScheme( test.num_examples, batch_size=test_batch))) test_monitor = DataStreamMonitoring( variables=[error], data_stream=test_stream, prefix='test', after_training=True) plotting = Plot('{}_{}'.format(net_name, side), channels=[ ['dropout_entropy'], ['error', 'validation_error'], ], after_batch=False) # Checkpoint class used to save model and log: stamp = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M') checkpoint = Checkpoint('./models/{}/{}/{}'.format(struct_name, side, stamp), save_separately=['model', 'log'], every_n_epochs=1) # Home-brewed class for early stopping when we detect we have started to overfit: # And by that I mean if the means of the val error and training error over the # previous 'epochs' is greater than the 'threshold', we are overfitting. early_stopper = FinishIfOverfitting(error_name='error', validation_name='validation_error', threshold=0.05, epochs=5, burn_in=100) # The main loop will train the network and output reports, etc main_loop = MainLoop( data_stream=training_stream, model=model, algorithm=algo, extensions=[ validation_monitor, training_monitor, plotting, FinishAfter(after_n_epochs=max_epoch), early_stopper, Printing(), ProgressBar(), checkpoint, test_monitor, ]) main_loop.run() ve = float(main_loop.log.last_epoch_row['validation_error']) te = float(main_loop.log.last_epoch_row['error']) spearmint_loss = ve + abs(te - ve) print 'Spearmint Loss: {}'.format(spearmint_loss) return spearmint_loss
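The weight-decay-plus-dropout recipe used in the two training functions above is easier to see in isolation. Below is a minimal, self-contained sketch of the same Blocks pattern on a toy MLP: collect the WEIGHT variables for an L2 penalty, then drop out the INPUT variables of the Linear bricks. The dimensions, the ratios, and the explicit L2 sum (standing in for the l2_norm helper used above) are placeholders, not values from these experiments.

from theano import tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.graph import ComputationGraph, apply_dropout
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT, INPUT
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('features')
y = tensor.lmatrix('targets')

# Toy two-hidden-layer classifier; dims are placeholders.
mlp = MLP(activations=[Rectifier(), Rectifier(), Softmax()],
          dims=[100, 50, 50, 2],
          weights_init=IsotropicGaussian(0.01),
          biases_init=Constant(0))
mlp.initialize()

cost = CategoricalCrossEntropy().apply(y.flatten(), mlp.apply(x))
cg = ComputationGraph([cost])

# L2 penalty over every WEIGHT parameter in the graph.
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 0.001 * sum((W ** 2).sum() for W in weights)
cost.name = 'entropy'

# Drop out the inputs of the Linear bricks: a lighter ratio on the first
# (data-facing) input, a heavier one on the hidden-layer inputs.
cg = ComputationGraph([cost])
inputs = VariableFilter(roles=[INPUT],
                        bricks=mlp.linear_transformations)(cg.variables)
dropout_cg = apply_dropout(cg, inputs[:1], 0.2)
dropout_cg = apply_dropout(dropout_cg, inputs[1:], 0.5)
dropout_cost = dropout_cg.outputs[0]
dropout_cost.name = 'dropout_entropy'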
def train_net(net, train_stream, test_stream, L1=None, L2=None, early_stopping=False, finish=None, dropout=False, jobid=None, update=None, duration=None, **ignored): x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') y_hat = net.apply(x) #Cost cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat) cost_before.name = "cost_without_regularization" #Error #Taken from brodesf error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = "Misclassification rate" #Regularization cg = ComputationGraph(cost_before) WS = VariableFilter(roles=[WEIGHT])(cg.variables) if dropout: print("Dropout") cg = apply_dropout(cg, WS, 0.5) if L1: print("L1 with lambda ", L1) L1_reg = L1 * sum([abs(W).sum() for W in WS]) L1_reg.name = "L1 regularization" cost_before += L1_reg if L2: print("L2 with lambda ", L2) L2_reg = L2 * sum([(W**2).sum() for W in WS]) L2_reg.name = "L2 regularization" cost_before += L2_reg cost = cost_before cost.name = 'cost_with_regularization' #Initialization print("Initilization") net.initialize() #Algorithm step_rule = Scale(learning_rate=0.1) if update is not None: if update == "rmsprop": print("Using RMSProp") step_rule = RMSProp() remove_not_finite = RemoveNotFinite(0.9) step_rule = CompositeRule([step_rule, remove_not_finite]) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=step_rule) print("Extensions") extensions = [] #Monitoring monitor = DataStreamMonitoring(variables=[cost, error], data_stream=test_stream, prefix="test") extensions.append(monitor) def filename(suffix=""): prefix = jobid if jobid else str(os.getpid()) ctime = str(time.time()) return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip" #Serialization #serialization = Checkpoint(filename()) #extensions.append(serialization) notification = "test_" + error.name track = TrackTheBest(notification) best_notification = track.notification_name checkpointbest = SaveBest(best_notification, filename("best")) extensions.extend([track, checkpointbest]) if early_stopping: print("Early stopping") stopper = FinishIfNoImprovementAfterPlus(best_notification) extensions.append(stopper) #Other extensions if finish != None: print("Force finish ", finish) extensions.append(FinishAfter(after_n_epochs=finish)) if duration != None: print("Stop after ", duration, " seconds") extensions.append(FinishAfterTime(duration)) extensions.extend([Timing(), Printing()]) #Main loop main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions) print("Main loop start") main_loop.run()
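SaveBest and FinishIfNoImprovementAfterPlus in the function above are project-specific helpers that are not shown here. A sketch of the same best-model tracking using only stock Blocks extensions follows; the checkpoint path and the patience of 10 epochs are placeholders.

from blocks.extensions.training import TrackTheBest
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.predicates import OnLogRecord

# The DataStreamMonitoring above logs the error under "test_<error.name>".
record_name = 'test_' + 'Misclassification rate'

track = TrackTheBest(record_name)        # logs '<record_name>_best_so_far'
best_record = track.notification_name

# Save a checkpoint whenever a new best value is recorded, and stop training
# if no improvement is seen for a while.
save_best = Checkpoint('checkpoints/best.zip').add_condition(
    ['after_epoch'], predicate=OnLogRecord(best_record))
stop_early = FinishIfNoImprovementAfter(best_record, epochs=10)

extensions = [track, save_best, stop_early]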
def apply(self, input_labeled, target_labeled, input_unlabeled): self.target_labeled = target_labeled self.layer_counter = 0 input_dim = self.p.encoder_layers[0] # Store the dimension tuples in the same order as layers. layers = self.layers self.layer_dims = {0: input_dim} self.lr = self.default_lr self.costs = costs = AttributeDict() self.costs.denois = AttributeDict() self.act = AttributeDict() self.error = AttributeDict() top = len(layers) - 1 N = input_labeled.shape[0] self.join = lambda l, u: T.concatenate([l, u], axis=0) self.labeled = lambda x: x[:N] if x is not None else x self.unlabeled = lambda x: x[N:] if x is not None else x self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x)) input_concat = self.join(input_labeled, input_unlabeled) def encoder(input_, path_name, input_noise_std=0, noise_std=[]): h = input_ logger.info(' 0: noise %g' % input_noise_std) if input_noise_std > 0.: h = h + self.noise_like(h) * input_noise_std d = AttributeDict() d.unlabeled = self.new_activation_dict() d.labeled = self.new_activation_dict() d.labeled.z[0] = self.labeled(h) d.unlabeled.z[0] = self.unlabeled(h) prev_dim = input_dim for i, (spec, _, act_f) in layers[1:]: d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h) noise = noise_std[i] if i < len(noise_std) else 0. curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f, path_name=path_name, noise_std=noise) assert self.layer_dims.get(i) in (None, curr_dim) self.layer_dims[i] = curr_dim d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z) d.unlabeled.s[i] = s d.unlabeled.m[i] = m prev_dim = curr_dim d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h) return d # Clean, supervised logger.info('Encoder: clean, labeled') clean = self.act.clean = encoder(input_concat, 'clean') # Corrupted, supervised logger.info('Encoder: corr, labeled') corr = self.act.corr = encoder(input_concat, 'corr', input_noise_std=self.p.super_noise_std, noise_std=self.p.f_local_noise_std) est = self.act.est = self.new_activation_dict() # Decoder path in opposite order logger.info('Decoder: z_corr -> z_est') for i, ((_, spec), l_type, act_f) in layers[::-1]: z_corr = corr.unlabeled.z[i] z_clean = clean.unlabeled.z[i] z_clean_s = clean.unlabeled.s.get(i) z_clean_m = clean.unlabeled.m.get(i) fspec = layers[i+1][1][0] if len(layers) > i+1 else (None, None) if i == top: ver = corr.unlabeled.h[i] ver_dim = self.layer_dims[i] top_g = True else: ver = est.z.get(i + 1) ver_dim = self.layer_dims.get(i + 1) top_g = False z_est = self.g(z_lat=z_corr, z_ver=ver, in_dims=ver_dim, out_dims=self.layer_dims[i], l_type=l_type, num=i, fspec=fspec, top_g=top_g) if z_est is not None: # Denoising cost if z_clean_s and self.p.zestbn == 'bugfix': z_est_norm = (z_est - z_clean_m) / T.sqrt(z_clean_s + np.float32(1e-10)) elif z_clean_s is None or self.p.zestbn == 'no': z_est_norm = z_est else: assert False, 'Not supported path' se = SquaredError('denois' + str(i)) costs.denois[i] = se.apply(z_est_norm.flatten(2), z_clean.flatten(2)) \ / np.prod(self.layer_dims[i], dtype=floatX) costs.denois[i].name = 'denois' + str(i) denois_print = 'denois %.2f' % self.p.denoising_cost_x[i] else: denois_print = '' # Store references for later use est.h[i] = self.apply_act(z_est, act_f) est.z[i] = z_est est.s[i] = None est.m[i] = None logger.info(' g%d: %10s, %s, dim %s -> %s' % ( i, l_type, denois_print, self.layer_dims.get(i+1), self.layer_dims.get(i) )) # Costs y = target_labeled.flatten() costs.class_clean = CategoricalCrossEntropy().apply(y, clean.labeled.h[top]) costs.class_clean.name = 
'cost_class_clean' costs.class_corr = CategoricalCrossEntropy().apply(y, corr.labeled.h[top]) costs.class_corr.name = 'cost_class_corr' # This will be used for training costs.total = costs.class_corr * 1.0 for i in range(top + 1): if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0: costs.total += costs.denois[i] * self.p.denoising_cost_x[i] costs.total.name = 'cost_total' # Classification error mr = MisclassificationRate() self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.) self.error.clean.name = 'error_rate_clean'
mlp.weights_init = Uniform(0.0, 0.01) mlp.biases_init = Constant(0.0) mlp.initialize() lin = Linear(200, 10, use_bias=True) lin.weights_init = Uniform(0.0, 0.01) lin.biases_init = Constant(0.0) lin.initialize() train_out = lin.apply(mlp.apply(flat_x)) test_out = lin.apply(mlp.apply(flat_x)) sm = Softmax(name='softmax') loss = sm.categorical_cross_entropy(flat_y, train_out).mean() loss.name = 'nll' misclass = MisclassificationRate().apply(flat_y, train_out) misclass.name = 'misclass' test_loss = sm.categorical_cross_entropy(flat_y, test_out).mean() test_loss.name = 'nll' test_misclass = MisclassificationRate().apply(flat_y, test_out) test_misclass.name = 'misclass' model = Model(loss) ###################### # Data ###################### import numpy #from mnist import MNIST from fuel.datasets.mnist import MNIST
def train(train_set, test_set): x = tensor.matrix('features') y = tensor.lmatrix('targets') l1 = Linear( name='input_to_hidden', input_dim=2, output_dim=3, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0) ) l1.initialize() h = Logistic().apply(l1.apply(x)) l2 = Linear( name='hidden_to_output', input_dim=l1.output_dim, output_dim=2, weights_init=IsotropicGaussian(0.1), biases_init=Constant(0) ) l2.initialize() y_hat = Softmax().apply(l2.apply(h)) cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'misclassification_rate' cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 1e-8 * (W1 ** 2).sum() + 1e-8 * (W2 ** 2).sum() cost.name = 'cost_with_regularization' print('W1', W1.get_value()) print('W2', W2.get_value()) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=RMSProp() ) data_stream_train = Flatten( DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, batch_size=4) ) ) data_stream_test = Flatten( DataStream.default_stream( test_set, iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=1) ) ) monitor = DataStreamMonitoring( variables=[cost, error], data_stream=data_stream_test, prefix="test" ) main_loop = MainLoop( data_stream=data_stream_train, algorithm=algorithm, extensions=[ monitor, FinishAfter(after_n_epochs=100), Printing(), # ProgressBar() ] ) main_loop.run()
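Once a loop like the one above has run, the learned weights live inside the bricks, so predictions can be compiled straight from the same symbolic graph. A minimal sketch, assuming the x and y_hat variables from the function above are kept in scope (or returned from it):

import numpy
import theano

# Compile the forward pass; y_hat is the Softmax output built above.
predict = theano.function([x], y_hat.argmax(axis=1))

# Classify one 2-dimensional input (placeholder values).
sample = numpy.array([[0.3, -1.2]], dtype=theano.config.floatX)
print(predict(sample))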
def train_snli_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed, model='simple'): if config['exclude_top_k'] > config['num_input_words'] and config[ 'num_input_words'] > 0: raise Exception("Some words have neither word nor def embedding") c = config logger = configure_logger(name="snli_baseline_training", log_file=os.path.join(save_path, "log.txt")) if not os.path.exists(save_path): logger.info("Start a new job") os.mkdir(save_path) else: logger.info("Continue an existing job") with open(os.path.join(save_path, "cmd.txt"), "w") as f: f.write(" ".join(sys.argv)) # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) main_loop_path = os.path.join(save_path, 'main_loop.tar') main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar') stream_path = os.path.join(save_path, 'stream.pkl') # Save config to save_path json.dump(config, open(os.path.join(save_path, "config.json"), "w")) if model == 'simple': nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data( c) elif model == 'esim': nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data( c) else: raise NotImplementedError() # Compute cost s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: assert os.path.exists(c['dict_path']) s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') y = T.ivector('label') cg = {} for train_phase in [True, False]: # NOTE: Please don't change outputs of cg if train_phase: with batch_normalization(nli_model): pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) else: pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) cost = CategoricalCrossEntropy().apply(y.flatten(), pred) error_rate = MisclassificationRate().apply(y.flatten(), pred) cg[train_phase] = ComputationGraph([cost, error_rate]) # Weight decay (TODO: Make it less bug prone) if model == 'simple': weights_to_decay = VariableFilter( bricks=[dense for dense, relu, bn in nli_model._mlp], roles=[WEIGHT])(cg[True].variables) weight_decay = np.float32(c['l2']) * sum( (w**2).sum() for w in weights_to_decay) elif model == 'esim': weight_decay = 0.0 else: raise NotImplementedError() final_cost = cg[True].outputs[0] + weight_decay final_cost.name = 'final_cost' # Add updates for population parameters if c.get("bn", True): pop_updates = get_batch_normalization_updates(cg[True]) extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates] else: pop_updates = [] extra_updates = [] if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: loaded_params = load_parameters(src) cg[True].set_parameter_values(loaded_params) for param, m in pop_updates: param.set_value(loaded_params[get_brick( param).get_hierarchical_name(param)]) if os.path.exists(os.path.join(save_path, "main_loop.tar")): logger.warning("Manually loading BN stats :(") with open(os.path.join(save_path, "main_loop.tar")) as src: loaded_params = 
load_parameters(src) for param, m in pop_updates: param.set_value( loaded_params[get_brick(param).get_hierarchical_name(param)]) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4).get_epoch_iterator()) s1.tag.test_value = test_value_data[0] s1_mask.tag.test_value = test_value_data[1] s2.tag.test_value = test_value_data[2] s2_mask.tag.test_value = test_value_data[3] y.tag.test_value = test_value_data[4] # Freeze embeddings if not c['train_emb']: frozen_params = [ p for E in nli_model.get_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params) & set(train_params)) > 0 else: frozen_params = [] if not c.get('train_def_emb', 1): frozen_params_def = [ p for E in nli_model.get_def_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params_def) & set(train_params)) > 0 frozen_params += frozen_params_def train_params = [p for p in cg[True].parameters if p not in frozen_params] train_params_keys = [ get_brick(p).get_hierarchical_name(p) for p in train_params ] # Optimizer algorithm = GradientDescent(cost=final_cost, on_unused_sources='ignore', parameters=train_params, step_rule=Adam(learning_rate=c['lr'])) algorithm.add_updates(extra_updates) m = Model(final_cost) parameters = m.get_parameter_dict() # Blocks version mismatch logger.info("Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(train_params_keys)], width=120)) logger.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted(train_params_keys) ]))) ### Monitored args ### train_monitored_vars = [final_cost] + cg[True].outputs monitored_vars = cg[False].outputs val_acc = monitored_vars[1] to_monitor_names = [ 'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2', 's1_gate_rootmean2', 's1_compose_gate_rootmean2' ] for k in to_monitor_names: train_v, valid_v = VariableFilter(name=k)( cg[True]), VariableFilter(name=k)(cg[False]) if len(train_v): logger.info("Adding {} tracking".format(k)) train_monitored_vars.append(train_v[0]) monitored_vars.append(valid_v[0]) else: logger.warning("Didnt find {} in cg".format(k)) if c['monitor_parameters']: for name in train_params_keys: param = parameters[name] num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements grad_norm = algorithm.gradients[param].norm(2) / num_elements step_norm = algorithm.steps[param].norm(2) / num_elements stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' train_monitored_vars.append(stats) regular_training_stream = data.get_stream('train', batch_size=c['batch_size'], seed=seed) if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=regular_training_stream.sources, hwm=100, produces_examples=regular_training_stream.produces_examples) else: training_stream = regular_training_stream ### Build extensions ### extensions = [ # Load(main_loop_path, load_iteration_state=True, load_log=True) # .set_conditions(before_training=not new_training_job), StartFuelServer(regular_training_stream, stream_path, hwm=100, script_path=os.path.join( os.path.dirname(__file__), "../bin/start_fuel_server.py"), before_training=fuel_server), Timing(every_n_batches=c['mon_freq']), ProgressBar(), RetrievalPrintStats(retrieval=used_retrieval, 
every_n_batches=c['mon_freq_valid'], before_training=not fast_start), Timestamp(), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq']), ] if c['layout'] == 'snli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid', batch_size=14, seed=seed), before_training=not fast_start, on_resumption=True, after_training=True, every_n_batches=c['mon_freq_valid'], prefix='valid') extensions.append(validation) elif c['layout'] == 'mnli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid_matched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], on_resumption=True, after_training=True, prefix='valid_matched') validation_mismatched = DataStreamMonitoring( monitored_vars, data.get_stream('valid_mismatched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], before_training=not fast_start, on_resumption=True, after_training=True, prefix='valid_mismatched') extensions.extend([validation, validation_mismatched]) else: raise NotImplementedError() # Similarity trackers for embeddings if len(c.get('vocab_def', '')): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval_all = Retrieval(vocab_text=retrieval_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) for name in [ 's1_word_embeddings', 's1_dict_word_embeddings', 's1_translated_word_embeddings' ]: variables = VariableFilter(name=name)(cg[False]) if len(variables): s1_emb = variables[0] logger.info("Adding similarity tracking for " + name) # A bit sloppy about downcast if "dict" in name: embedder = construct_dict_embedder(theano.function( [s1, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), vocab=data.vocab, retrieval=retrieval_all) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) else: embedder = construct_embedder(theano.function( [s1], s1_emb, allow_input_downcast=True), vocab=data.vocab) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) track_the_best = TrackTheBest(validation.record_name(val_acc), before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start, every_n_batches=c['mon_freq_valid'], choose_best=min) extensions.append(track_the_best) # Special care for serializing embeddings if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')): extensions.insert( 0, LoadNoUnpickling(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=train_params + [p for p, m in pop_updates], save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) else: extensions.insert( 0, Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=cg[True].parameters + [p for p, m in pop_updates], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 
'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) extensions.extend([ DumpCSVSummaries(save_path, every_n_batches=c['mon_freq_valid'], after_training=True), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_valid'], after_training=True), Printing(every_n_batches=c['mon_freq_valid']), PrintMessage(msg="save_path={}".format(save_path), every_n_batches=c['mon_freq']), FinishAfter(after_n_batches=c['n_batches']).add_condition( ['after_batch'], OnLogStatusExceed('iterations_done', c['n_batches'])) ]) logger.info(extensions) ### Run training ### if "VISDOM_SERVER" in os.environ: print("Running visdom server") ret = subprocess.Popen([ os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"), "--visdom-server={}".format(os.environ['VISDOM_SERVER']), "--folder={}".format(save_path) ]) time.sleep(0.1) if ret.returncode is not None: raise Exception() atexit.register(lambda: os.kill(ret.pid, signal.SIGINT)) model = Model(cost) for p, m in pop_updates: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p main_loop = MainLoop(algorithm, training_stream, model=model, extensions=extensions) assert os.path.exists(save_path) main_loop.run()
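The train/inference split in the function above hinges on Blocks' batch_normalization context manager and get_batch_normalization_updates. The same pattern in isolation, with a stand-in BatchNormalizedMLP rather than the NLI model (all dimensions are placeholders):

from theano import tensor
from blocks.bricks import BatchNormalizedMLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.graph import (ComputationGraph, batch_normalization,
                          get_batch_normalization_updates)
from blocks.initialization import IsotropicGaussian, Constant

x = tensor.matrix('features')
y = tensor.ivector('label')

mlp = BatchNormalizedMLP([Rectifier(), Softmax()], [50, 30, 3],
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0))
mlp.initialize()

# Training graph: inside the context manager the bricks normalize with
# minibatch statistics.
with batch_normalization(mlp):
    train_cost = CategoricalCrossEntropy().apply(y, mlp.apply(x))
train_cg = ComputationGraph([train_cost])

# Inference graph: the same apply outside the context manager uses the
# population statistics instead.
test_cost = CategoricalCrossEntropy().apply(y, mlp.apply(x))
test_cg = ComputationGraph([test_cost])

# Moving-average updates for the population statistics (decay 0.1, as above);
# these are handed to the optimizer via algorithm.add_updates(extra_updates).
pop_updates = get_batch_normalization_updates(train_cg)
extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates]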
from blocks.roles import WEIGHT from blocks.bricks.cost import MisclassificationRate x = tensor.matrix('features') y = tensor.lmatrix('targets') lin1 = Linear(name='lin1', input_dim=126, output_dim=50, weights_init=Constant(0.005), biases_init=Constant(0)) act1_sigmoid = Logistic().apply(lin1.apply(x)) lin2 = Linear(name='lin2', input_dim=50, output_dim=2, weights_init=Constant(0.001), biases_init=Constant(0)) act2_softmax = Softmax().apply(lin2.apply(act1_sigmoid)) lin1.initialize() lin2.initialize() missclass = MisclassificationRate().apply(y.argmax(axis=1), act2_softmax) missclass.name = 'missclassification' cost = CategoricalCrossEntropy().apply(y, act2_softmax) comp_graph = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHT])(comp_graph.variables) cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum() cost.name = 'cost' from blocks.algorithms import GradientDescent, Scale from blocks.extensions import FinishAfter, Printing, ProgressBar from blocks.extensions.monitoring import DataStreamMonitoring from fuel.transformers import Flatten
def apply(self, input_labeled, target_labeled, input_unlabeled): self.layer_counter = 0 input_dim = self.p.encoder_layers[0] # Store the dimension tuples in the same order as layers. layers = self.layers self.layer_dims = {0: input_dim} self.lr = self.shared(self.default_lr, 'learning_rate', role=None) self.costs = costs = AttributeDict() self.costs.denois = AttributeDict() self.act = AttributeDict() self.error = AttributeDict() top = len(layers) - 1 N = input_labeled.shape[0] self.join = lambda l, u: T.concatenate([l, u], axis=0) self.labeled = lambda x: x[:N] if x is not None else x self.unlabeled = lambda x: x[N:] if x is not None else x self.split_lu = lambda x: (self.labeled(x), self.unlabeled(x)) input_concat = self.join(input_labeled, input_unlabeled) def encoder(input_, path_name, input_noise_std=0, noise_std=[]): h = input_ logger.info(' 0: noise %g' % input_noise_std) if input_noise_std > 0.: h = h + self.noise_like(h) * input_noise_std d = AttributeDict() d.unlabeled = self.new_activation_dict() d.labeled = self.new_activation_dict() d.labeled.z[0] = self.labeled(h) d.unlabeled.z[0] = self.unlabeled(h) prev_dim = input_dim for i, (spec, _, act_f) in layers[1:]: d.labeled.h[i - 1], d.unlabeled.h[i - 1] = self.split_lu(h) noise = noise_std[i] if i < len(noise_std) else 0. curr_dim, z, m, s, h = self.f(h, prev_dim, spec, i, act_f, path_name=path_name, noise_std=noise) assert self.layer_dims.get(i) in (None, curr_dim) self.layer_dims[i] = curr_dim d.labeled.z[i], d.unlabeled.z[i] = self.split_lu(z) d.unlabeled.s[i] = s d.unlabeled.m[i] = m prev_dim = curr_dim d.labeled.h[i], d.unlabeled.h[i] = self.split_lu(h) return d # Clean, supervised logger.info('Encoder: clean, labeled') clean = self.act.clean = encoder(input_concat, 'clean') # Corrupted, supervised logger.info('Encoder: corr, labeled') corr = self.act.corr = encoder(input_concat, 'corr', input_noise_std=self.p.super_noise_std, noise_std=self.p.f_local_noise_std) est = self.act.est = self.new_activation_dict() # Decoder path in opposite order logger.info('Decoder: z_corr -> z_est') for i, ((_, spec), l_type, act_f) in layers[::-1]: z_corr = corr.unlabeled.z[i] z_clean = clean.unlabeled.z[i] z_clean_s = clean.unlabeled.s.get(i) z_clean_m = clean.unlabeled.m.get(i) fspec = layers[i + 1][1][0] if len(layers) > i + 1 else (None, None) if i == top: ver = corr.unlabeled.h[i] ver_dim = self.layer_dims[i] top_g = True else: ver = est.z.get(i + 1) ver_dim = self.layer_dims.get(i + 1) top_g = False z_est = self.g(z_lat=z_corr, z_ver=ver, in_dims=ver_dim, out_dims=self.layer_dims[i], l_type=l_type, num=i, fspec=fspec, top_g=top_g) if z_est is not None: # Denoising cost if z_clean_s and self.p.zestbn == 'bugfix': z_est_norm = (z_est - z_clean_m ) / T.sqrt(z_clean_s + np.float32(1e-10)) elif z_clean_s is None or self.p.zestbn == 'no': z_est_norm = z_est else: assert False, 'Not supported path' se = SquaredError('denois' + str(i)) costs.denois[i] = se.apply(z_est_norm.flatten(2), z_clean.flatten(2)) \ / np.prod(self.layer_dims[i], dtype=floatX) costs.denois[i].name = 'denois' + str(i) denois_print = 'denois %.2f' % self.p.denoising_cost_x[i] else: denois_print = '' # Store references for later use est.h[i] = self.apply_act(z_est, act_f) est.z[i] = z_est est.s[i] = None est.m[i] = None logger.info(' g%d: %10s, %s, dim %s -> %s' % (i, l_type, denois_print, self.layer_dims.get(i + 1), self.layer_dims.get(i))) # Costs y = target_labeled.flatten() costs.class_clean = CategoricalCrossEntropy().apply( y, clean.labeled.h[top]) 
costs.class_clean.name = 'cost_class_clean' costs.class_corr = CategoricalCrossEntropy().apply( y, corr.labeled.h[top]) costs.class_corr.name = 'cost_class_corr' # This will be used for training costs.total = costs.class_corr * 1.0 for i in range(top + 1): if costs.denois.get(i) and self.p.denoising_cost_x[i] > 0: costs.total += costs.denois[i] * self.p.denoising_cost_x[i] costs.total.name = 'cost_total' # Classification error mr = MisclassificationRate() self.error.clean = mr.apply(y, clean.labeled.h[top]) * np.float32(100.) self.error.clean.name = 'error_rate_clean'
def train_paired_dnn(train_x, train_y, dev_x, dev_y, test_x, test_y): train_y = train_y.flatten().astype(int) dev_y = dev_y.flatten().astype(int) test_y = test_y.flatten().astype(int) batch_size = 256 n_train, in_dim = train_x.shape n_dev = dev_x.shape[0] n_test = test_x.shape[0] hid_dims = 2 * np.array([512, 512, 512, 512]) out_dim = 1 ds_train = make_ds(train_x, train_y, batch_size, n_train, SequentialScheme) ds_dev = make_ds(dev_x, dev_y, batch_size, n_dev, SequentialScheme) ds_test = make_ds(test_x, test_y, batch_size, n_test, SequentialScheme) mlp = MLP( activations=[Rectifier(), Rectifier(), Rectifier(), Rectifier(), Logistic()], dims=[in_dim, hid_dims[0], hid_dims[1], hid_dims[2], hid_dims[3], out_dim], weights_init=Uniform(mean=0, width=1/32), biases_init=Constant(0) ) mlp.initialize() x = tensor.matrix('features') y = tensor.matrix('targets', dtype='int64') y_hat = mlp.apply(x) model = Model(y_hat) cost = MyBinaryCrossEntropy().apply(y, y_hat) cost.name = 'cost' misrate = MisclassificationRate().apply(y.flatten(), y_hat) misrate.name = 'misclassfication' cg = ComputationGraph([cost, misrate]) drop_vars = VariableFilter( roles=[INPUT], bricks=mlp.linear_transformations[1:] )(cg.variables) cg_dropout = apply_dropout(cg, drop_vars, 0.2) cost_dropout, error_rate_dropout = cg_dropout.outputs learning_rate = 0.0015 momentum = 0.9 step_rule = CompositeRule([ Momentum(learning_rate=learning_rate, momentum=momentum), AdaGrad(learning_rate=learning_rate) ]) algorithm = GradientDescent(cost=cost_dropout, parameters=cg.parameters, step_rule=step_rule) monitor_train = TrainingDataMonitoring( variables=[cost_dropout, error_rate_dropout, aggregation.mean(algorithm.total_gradient_norm)], after_epoch=True, prefix="train" ) monitor_dev = DataStreamMonitoring( # variables=[cost_dropout, error_rate_dropout], variables=[cost, misrate], data_stream=ds_dev, prefix="dev" ) monitor_test = DataStreamMonitoring( # variables=[cost_dropout, error_rate_dropout], variables=[cost, misrate], data_stream=ds_test, prefix="test" ) track_str = 'train_{0}'.format(cost_dropout.name) track_best_str = '{0}_best_so_far'.format(track_str) print track_str, track_best_str n_epochs = 2 print 'n_epochs:', n_epochs main_loop = MainLoop( model=model, data_stream=ds_train, algorithm=algorithm, extensions=[Timing(), monitor_train, monitor_dev, monitor_test, TrackTheBest(track_str), Checkpoint("best_model.pkl", use_cpickle = True ).add_condition(['after_epoch'], predicate=OnLogRecord(track_best_str)), FinishAfter(after_n_epochs=n_epochs), # FinishIfNoImprovementAfter(track_best_str, epochs=n_epochs), Printing()] ) main_loop.run() acc([x], y_hat, train_x, train_y, 'train') acc([x], y_hat, dev_x, dev_y, 'dev') acc([x], y_hat, test_x, test_y, 'test')
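make_ds is used above but not defined in this snippet. A plausible minimal version, assuming it simply wraps the numpy arrays in a Fuel IndexableDataset and returns a batched DataStream (the real helper may differ):

from collections import OrderedDict

from fuel.datasets import IndexableDataset
from fuel.streams import DataStream

def make_ds(x, y, batch_size, num_examples, scheme_cls):
    """Wrap numpy arrays as a Fuel stream of (features, targets) batches."""
    dataset = IndexableDataset(indexables=OrderedDict([
        ('features', x.astype('float32')),
        ('targets', y.reshape(-1, 1).astype('int64')),
    ]))
    scheme = scheme_cls(examples=num_examples, batch_size=batch_size)
    return DataStream(dataset, iteration_scheme=scheme)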
weights_init=Uniform(width=0.2), biases_init=Constant(0)) ########## hyper parameters ########################################### # We push initialization config to set different initialization schemes convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=0.2) convnet.layers[1].weights_init = Uniform(width=0.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.11) convnet.initialize() ######################################################### # Generate output and error signal predict = convnet.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict) # Small trick to plot the error rate in two different plots (we cannot use the same variable twice in the plots, for an unknown reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### ALGORITHM of training ############# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid"), TrainingDataMonitoring(
def test_communication(path_vae_mnist, path_maxout_mnist): # load models vae_mnist = load(path_vae_mnist) # get params : to be remove from the computation graph # write an object maxout classifier = Maxout() # get params : to be removed from the computation graph # vae whose prior is a zero mean unit variance normal distribution activation = Rectifier() full_weights_init = Orthogonal() weights_init = full_weights_init # SVHN en niveau de gris layers = [32*32, 200, 200, 200, 50] encoder_layers = layers[:-1] encoder_mlp = MLP([activation] * (len(encoder_layers)-1), encoder_layers, name="MLP_SVHN_encode", biases_init=Constant(0.), weights_init=weights_init) enc_dim = encoder_layers[-1] z_dim = layers[-1] sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, biases_init=Constant(0.), weights_init=full_weights_init) decoder_layers = layers[:] ## includes z_dim as first layer decoder_layers.reverse() decoder_mlp = MLP([activation] * (len(decoder_layers)-2) + [Rectifier()], decoder_layers, name="MLP_SVHN_decode", biases_init=Constant(0.), weights_init=weights_init) vae_svhn = VAEModel(encoder_mlp, sampler, decoder_mlp) vae_svhn.initialize() # do the connection x = T.tensor4('x') # SVHN samples preprocessed with local contrast normalization x_ = (T.sum(x, axis=1)).flatten(ndim=2) y = T.imatrix('y') batch_size = 512 svhn_z, _ = vae_svhn.sampler.sample(vae_svhn.encoder_mlp.apply(x_)) mnist_decode = vae_mnist.decoder_mlp.apply(svhn_z) # reshape shape = mnist_decode.shape mnist_decode = mnist_decode.reshape((shape[0], 1, 28, 28)) prediction = classifier.apply(mnist_decode) y_hat = Softmax().apply(prediction) x_recons, kl_terms = vae_svhn.reconstruct(x_) recons_term = BinaryCrossEntropy().apply(x_, T.clip(x_recons, 1e-4, 1 - 1e-4)) recons_term.name = "recons_term" cost_A = recons_term + kl_terms.mean() cost_A.name = "cost_A" cost_B = Softmax().categorical_cross_entropy(y.flatten(), prediction) cost_B.name = 'cost_B' cost = cost_B cost.name = "cost" cg = ComputationGraph(cost) # probably discard some of the parameters parameters = cg.parameters params = [] for t in parameters: if not re.match(".*mnist", t.name): params.append(t) """ f = theano.function([x], cost_A) value_x = np.random.ranf((1, 3, 32, 32)).astype("float32") print f(value_x) return """ error_brick = MisclassificationRate() error_rate = error_brick.apply(y.flatten(), y_hat) error_rate.name = "error_rate" # training here step_rule = RMSProp(0.001,0.99) dataset_hdf5_file="/Tmp/ducoffem/SVHN/" train_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"), which_set='train') test_set = H5PYDataset(os.path.join(dataset_hdf5_file, "all.h5"), which_set='valid') data_stream = DataStream.default_stream( train_set, iteration_scheme=SequentialScheme(train_set.num_examples, batch_size)) data_stream_test = DataStream.default_stream( test_set, iteration_scheme=SequentialScheme(2000, batch_size)) algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule) monitor_train = TrainingDataMonitoring( variables=[cost], prefix="train", every_n_batches=10) monitor_valid = DataStreamMonitoring( variables=[cost, error_rate], data_stream=data_stream_test, prefix="valid", every_n_batches=10) # drawing_samples = ImagesSamplesSave("../data_svhn", vae, (3, 32, 32), every_n_epochs=1) extensions = [ monitor_train, monitor_valid, FinishAfter(after_n_batches=10000), Printing(every_n_batches=10) ] main_loop = MainLoop(data_stream=data_stream, algorithm=algorithm, model = Model(cost), extensions=extensions) main_loop.run()
def build_and_run(experimentconfig, modelconfig, save_to=None): #modelconfig, """ part of this is adapted from lasagne tutorial""" # Prepare Theano variables for inputs and targets input_var = T.tensor4('image_features') target_var = T.lmatrix('targets') target_vec = T.extra_ops.to_one_hot(target_var[:,0],2) # Create vgg model print("Building model...") image_size = modelconfig['image_size'] network = vgg16.build_small_model() prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network["prob"],input_var)) # test_prediction = lasagne.layers.get_output(network["prob"],input_var,deterministic=True) # Loss function -> The objective to minimize print("Instantiation of loss function...") # loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()) loss = lasagne.objectives.squared_error(prediction,target_vec) # test_loss = lasagne.objectives.squared_error(test_prediction,target_vec) loss = loss.mean() # layers = network.values() #l1 and l2 regularization # pondlayers = {x:0.01 for x in layers} # l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2) # l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 1e-4 # reg_penalty = l1_penality + l2_penality # reg_penalty.name = 'reg_penalty' #loss = loss + reg_penalty loss.name = 'loss' error_rate = MisclassificationRate().apply(target_var.flatten(), prediction).copy( name='error_rate') # Load the dataset print("Loading data...") if 'test' in experimentconfig.keys() and experimentconfig['test'] is True: train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=True) else : train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=False) # Defining step rule and algorithm if 'step_rule' in experimentconfig.keys() and not experimentconfig['step_rule'] is None : step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate']) else : step_rule=Scale(learning_rate=experimentconfig['learning_rate']) params = map(lasagne.utils.as_theano_expression,lasagne.layers.get_all_params(network['prob'], trainable=True)) algorithm = GradientDescent( cost=loss, gradients={var:T.grad(loss,var) for var in params}, step_rule=step_rule) grad_norm = aggregation.mean(algorithm.total_gradient_norm) grad_norm.name='grad_norm' print("Initializing extensions...") plot = Plot(save_to, channels=[['train_loss','valid_loss','train_grad_norm'],['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042') checkpoint = Checkpoint('models/best_'+save_to+'.tar') # checkpoint.add_condition(['after_n_batches=25'], checkpoint.add_condition(['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far')) #Defining extensions extensions = [Timing(), FinishAfter(after_n_epochs=experimentconfig['num_epochs'], after_n_batches=experimentconfig['num_batches']), TrainingDataMonitoring([loss, error_rate, grad_norm], prefix="train", after_epoch=True), #after_n_epochs=1; add reg_penalty here only if the commented-out regularization block above is re-enabled DataStreamMonitoring([loss, error_rate],valid_stream,prefix="valid", after_epoch=True), #after_n_epochs=1 #Checkpoint(save_to,after_n_epochs=5), #ProgressBar(), plot, # after_batch=True), Printing(after_epoch=True), TrackTheBest('valid_error_rate',min), #Keep best checkpoint, #Save best FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)] # Early-stopping # model = Model(ComputationGraph(network)) main_loop = MainLoop( algorithm, train_stream, # model=model, extensions=extensions) print("Starting main loop...") main_loop.run()
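A hypothetical call to build_and_run, using the configuration keys the function actually reads; every value here is a placeholder, and RMSProp merely stands in for whichever step rule the experiment used:

from blocks.algorithms import RMSProp

experimentconfig = {
    'batch_size': 32,
    'num_epochs': 50,
    'num_batches': None,
    'learning_rate': 1e-3,
    'step_rule': RMSProp,   # instantiated inside build_and_run
    'test': False,
}
modelconfig = {'image_size': (256, 256)}

build_and_run(experimentconfig, modelconfig, save_to='vgg_small_run')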
def main(save_to, cost_name, learning_rate, momentum, num_epochs): mlp = MLP([None], [784, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') scores = mlp.apply(x) batch_size = y.shape[0] indices = tensor.arange(y.shape[0]) target_scores = tensor.set_subtensor( tensor.zeros((batch_size, 10))[indices, y.flatten()], 1) score_diff = scores - target_scores # Logistic Regression if cost_name == 'lr': cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean() # MSE elif cost_name == 'mse': cost = (score_diff ** 2).mean() # Perceptron elif cost_name == 'perceptron': cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean() # TLE elif cost_name == 'minmin': cost = abs(score_diff[indices, y.flatten()]).mean() cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean() # TLEcut elif cost_name == 'minmin_cut': # Score of the groundtruth should be greater or equal than its target score cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean() # Score of the prediction should be less or equal than its actual score cost += tensor.maximum(0, score_diff[indices, scores.argmax(axis=1)]).mean() # TLE2 elif cost_name == 'minmin2': cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()]) ** 2).mean() cost += ((score_diff[tensor.arange(y.shape[0]), scores.argmax(axis=1)]) ** 2).mean() # Direct loss minimization elif cost_name == 'direct': epsilon = 0.1 cost = (- scores[indices, (scores + epsilon * target_scores).argmax(axis=1)] + scores[indices, scores.argmax(axis=1)]).mean() cost /= epsilon elif cost_name == 'svm': cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] - scores[indices, y.flatten()]).mean() else: raise ValueError("Unknown cost " + cost) error_rate = MisclassificationRate().apply(y.flatten(), scores) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) cost.name = 'cost' mnist_train = MNIST(("train",)) mnist_test = MNIST(("test",)) if learning_rate == None: learning_rate = 0.0001 if momentum == None: momentum = 0.0 rule = Momentum(learning_rate=learning_rate, momentum=momentum) algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=rule) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate], Flatten( DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features',)), prefix="test"), # CallbackExtension( # lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9), # after_epoch=True), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate], prefix="train", after_epoch=True), Checkpoint(save_to), Printing()] if BLOCKS_EXTRAS_AVAILABLE: extensions.append(Plot( 'MNIST example', channels=[ ['test_cost', 'test_error_rate'], ['train_total_gradient_norm']])) main_loop = MainLoop( algorithm, Flatten( DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features',)), model=Model(cost), extensions=extensions) main_loop.run() df = pandas.DataFrame.from_dict(main_loop.log, orient='index') res = {'cost' : cost_name, 'learning_rate' : learning_rate, 'momentum' : momentum, 'train_cost' : df.train_cost.iloc[-1], 'test_cost' : df.test_cost.iloc[-1], 'best_test_cost' : df.test_cost.min(), 'train_error' : df.train_error_rate.iloc[-1], 'test_error' : df.test_error_rate.iloc[-1], 
'best_test_error' : df.test_error_rate.min()} res = {k: float(v) if isinstance(v, numpy.ndarray) else v for k, v in res.items()} json.dump(res, sys.stdout) sys.stdout.flush()
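The target_scores construction near the top of this function builds one-hot targets with tensor.set_subtensor; the indexing is easier to follow in a standalone sketch with made-up labels:

import numpy
import theano
from theano import tensor

y = tensor.lvector('labels')            # e.g. [2, 0, 1]
num_classes = 10
indices = tensor.arange(y.shape[0])

# Start from a zero matrix and put a 1 at (row i, column y[i]).
one_hot = tensor.set_subtensor(
    tensor.zeros((y.shape[0], num_classes))[indices, y], 1)

f = theano.function([y], one_hot)
print(f(numpy.array([2, 0, 1], dtype='int64')))   # three rows, one 1 per row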
def main(save_to, save_freq, num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None): if feature_maps is None: feature_maps = [20, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5] if pool_sizes is None: pool_sizes = [2, 2] image_size = (28, 28) output_size = 10 # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 1, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if (isinstance(layer, Activation)): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost') error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate') cg = ComputationGraph([cost, error_rate]) mnist_train = MNIST(("train", )) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test", )) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size)) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), DataStreamMonitoring([cost, error_rate], mnist_test_stream, prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), CheckpointBlock(save_to, every_n_batches=save_freq), ProgressBar(), Printing() ] model = Model(cost) main_loop = MainLoop(algorithm, mnist_train_stream, model=model, extensions=extensions) main_loop.run()
def main(num_epochs=50, batch_normalized=True, alpha=0.1): """Run the example. Parameters ---------- num_epochs : int, optional Number of epochs for which to train. batch_normalized : bool, optional Batch-normalize the training graph. Defaults to `True`. alpha : float, optional Weight to apply to a new sample when calculating running averages for population statistics (1 - alpha weight is given to the existing average). """ if batch_normalized: # Add an extra keyword argument that only BatchNormalizedMLP takes, # in order to speed things up at the cost of a bit of extra memory. mlp_class = BatchNormalizedMLP extra_kwargs = {'conserve_memory': False} else: mlp_class = MLP extra_kwargs = {} mlp = mlp_class([Logistic(), Logistic(), Logistic(), Softmax()], [2, 5, 5, 5, 3], weights_init=IsotropicGaussian(0.2), biases_init=Constant(0.), **extra_kwargs) mlp.initialize() # Generate a dataset with 3 spiral arms, using 8000 examples for # training and 2000 for testing. dataset = Spiral(num_examples=10000, classes=3, sources=['features', 'label'], noise=0.05) train_stream = DataStream(dataset, iteration_scheme=ShuffledScheme(examples=8000, batch_size=20)) test_stream = DataStream(dataset, iteration_scheme=SequentialScheme( examples=list(range(8000, 10000)), batch_size=2000)) # Build a cost graph; this contains BatchNormalization bricks that will # by default run in inference mode. features = tensor.matrix('features') label = tensor.lvector('label') prediction = mlp.apply(features) cost = CategoricalCrossEntropy().apply(label, prediction) misclass = MisclassificationRate().apply(label, prediction) misclass.name = 'misclass' # The default name for this is annoyingly long original_cg = ComputationGraph([cost, misclass]) if batch_normalized: cg = apply_batch_normalization(original_cg) # Add updates for population parameters pop_updates = get_batch_normalization_updates(cg) extra_updates = [(p, m * alpha + p * (1 - alpha)) for p, m in pop_updates] else: cg = original_cg extra_updates = [] algorithm = GradientDescent(step_rule=Adam(0.001), cost=cg.outputs[0], parameters=cg.parameters) algorithm.add_updates(extra_updates) main_loop = MainLoop(algorithm=algorithm, data_stream=train_stream, # Use the original cost and misclass variables so # that we monitor the (original) inference-mode graph. extensions=[DataStreamMonitoring([cost, misclass], train_stream, prefix='train'), DataStreamMonitoring([cost, misclass], test_stream, prefix='test'), Printing(), FinishAfter(after_n_epochs=num_epochs)]) main_loop.run() return main_loop
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=None, num_batches=None): if feature_maps is None: feature_maps = [32, 48, 64, 80, 128, 128] if mlp_hiddens is None: mlp_hiddens = [1000] if conv_sizes is None: conv_sizes = [7, 5, 5, 5, 5, 4] if pool_sizes is None: pool_sizes = [3, 2, 2, 2, 2, 1] if batch_size is None: batch_size = 64 conv_steps = [2, 1, 1, 1, 1, 1] # same as stride image_size = (256, 256) output_size = 2 learningRate = 0.001 drop_prob = 0.5 weight_noise = 0.75 num_epochs = 250 num_batches = None host_plot = "http://*****:*****@ %s" % (graph_name, datetime.datetime.now(), socket.gethostname()), channels=[["train_error_rate", "valid_error_rate"], ["train_total_gradient_norm"]], after_epoch=True, server_url=host_plot, ) ) PLOT_AVAILABLE = True except ImportError: PLOT_AVAILABLE = False extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=["log"])) logger.info("Building the model") model = Model(cost) ########### Loading images ##################### main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
class LookUpTrain(Initializable, Feedforward): @lazy(allocation=['dwin', 'n_mot', 'vect_size', 'n_hidden']) def __init__(self, dwin, n_mot, vect_size, n_hidden, n_out=2, **kwargs): self.dwin = dwin self.n_mot = n_mot self.vect_size = vect_size if isinstance(n_hidden, int): self.n_hidden = [n_hidden] else: self.n_hidden = n_hidden self.n_out = n_out self.window = Window(self.dwin, self.n_mot, self.vect_size, self.n_hidden, self.n_out, weights_init=IsotropicGaussian(0.001)) super(LookUpTrain, self).__init__(**kwargs) self.softmax = Softmax() self.error = MisclassificationRate() self.children = [self.window, self.softmax, self.error] @application(inputs=['input_'], outputs=['output']) def apply(self, input_): return self.window.apply(input_) @application(inputs=['x', 'y'], outputs=['output']) def cost(self, x, y): return self.softmax.categorical_cross_entropy(y, self.apply(x)) @application(inputs=['x', 'y'], outputs=['output']) def errors(self, x, y): return self.error.apply(y, self.apply(x)) @application(inputs=['x'], outputs=['output']) def predict(self, x): return T.argmax(self.apply(x), axis=1) @application(inputs=['x'], outputs=['output']) def predict_confidency(self, x): return T.max(self.apply(x), axis=1) def update_lookup_weights(self): self.window.update_lookup_weights() @application(inputs=['input_', 'input_corrupt'], outputs=['output']) def score(self, input_, input_corrupt): # modify the input_ with an incorrect central word ? return (1 - -self.apply(input_)).norm(2) + ( self.apply(input_corrupt)).norm(2) #return T.maximum(0,1 - self.apply(input_)+self.apply(input_corrupt) )[0] return T.maximum(0, 1 - self.apply(input_))[0] + 0.1 * T.maximum( 0, 1 + self.apply(input_corrupt))[0] + 0.1 * T.maximum( 0, 1 - self.apply(input_) + self.apply(input_corrupt))[0] # change that !!!! def _initialize(self): self.window.initialize() @application(inputs=['input_'], outputs=['output']) def embedding(self, input_): return self.window.embedding(input_) def _allocate(self): self.window.allocate() def load(self, repo, filename): params = getParams(self, T.itensor3()) with closing(open(os.path.join(repo, filename), 'rb')) as f: params_value = pickle.load(f) for p, p_value in zip(params, params_value): p.set_value(p_value.get_value()) def get_Params(self): return self.window.get_Params() def save(self, repo, filename): params = getParams(self, T.itensor3()) index = 0 while os.path.isfile(os.path.join(repo, filename + "_" + str(index))): index += 1 filename = filename + "_" + str(index) with closing(open(os.path.join(repo, filename), 'wb')) as f: pickle.dump(params, f, protocol=pickle.HIGHEST_PROTOCOL)
def build_submodel(input_shape, output_dim, L_dim_conv_layers, L_filter_size, L_pool_size, L_activation_conv, L_dim_full_layers, L_activation_full, L_exo_dropout_conv_layers, L_exo_dropout_full_layers, L_endo_dropout_conv_layers, L_endo_dropout_full_layers, L_border_mode=None, L_filter_step=None, L_pool_step=None): # TO DO : target size and name of the features x = T.tensor4('features') y = T.imatrix('targets') assert len(input_shape) == 3, "input_shape must be a 3d tensor" num_channels = input_shape[0] image_size = tuple(input_shape[1:]) print image_size print num_channels prediction = output_dim # CONVOLUTION output_conv = x output_dim = num_channels*np.prod(image_size) conv_layers = [] assert len(L_dim_conv_layers) == len(L_filter_size) if L_filter_step is None: L_filter_step = [None] * len(L_dim_conv_layers) assert len(L_dim_conv_layers) == len(L_pool_size) if L_pool_step is None: L_pool_step = [None] * len(L_dim_conv_layers) assert len(L_dim_conv_layers) == len(L_pool_step) assert len(L_dim_conv_layers) == len(L_activation_conv) if L_border_mode is None: L_border_mode = ["valid"] * len(L_dim_conv_layers) assert len(L_dim_conv_layers) == len(L_border_mode) assert len(L_dim_conv_layers) == len(L_endo_dropout_conv_layers) assert len(L_dim_conv_layers) == len(L_exo_dropout_conv_layers) # regarding the batch dropout : the dropout is applied on the filter # which is equivalent to the output dimension # you have to look at the dropout_rate of the next layer # that is why we need to have the first dropout value of L_exo_dropout_full_layers # the first value has to be 0.0 in this context, and we'll # assume that it is, but let's have an assert assert L_exo_dropout_conv_layers[0] == 0.0, "L_exo_dropout_conv_layers[0] has to be 0.0 in this context. There are ways to make it work, of course, but we don't support this with this scripts." 
    # here modification of L_exo_dropout_conv_layers
    L_exo_dropout_conv_layers = L_exo_dropout_conv_layers[1:] + [L_exo_dropout_full_layers[0]]

    if len(L_dim_conv_layers):
        for (num_filters, filter_size, filter_step, pool_size, pool_step,
             activation_str, border_mode, dropout, index) in zip(L_dim_conv_layers,
                                                                 L_filter_size,
                                                                 L_filter_step,
                                                                 L_pool_size,
                                                                 L_pool_step,
                                                                 L_activation_conv,
                                                                 L_border_mode,
                                                                 L_exo_dropout_conv_layers,
                                                                 xrange(len(L_dim_conv_layers))):

            # convert filter_size and pool_size into tuples
            filter_size = tuple(filter_size)
            if filter_step is None:
                filter_step = (1, 1)
            else:
                filter_step = tuple(filter_step)
            if pool_size is None:
                pool_size = (0, 0)
            else:
                pool_size = tuple(pool_size)

            # TO DO : leaky relu
            if activation_str.lower() == 'rectifier':
                activation = Rectifier().apply
            elif activation_str.lower() == 'tanh':
                activation = Tanh().apply
            elif activation_str.lower() in ['sigmoid', 'logistic']:
                activation = Logistic().apply
            elif activation_str.lower() in ['id', 'identity']:
                activation = Identity().apply
            else:
                raise Exception("unknown activation function : %s" % activation_str)

            assert 0.0 <= dropout < 1.0
            num_filters = num_filters - int(num_filters * dropout)

            print "border_mode : %s" % border_mode

            # filter_step
            # http://blocks.readthedocs.org/en/latest/api/bricks.html#module-blocks.bricks.conv
            kwargs = {}
            if filter_step is None or filter_step == (1, 1):
                pass
            else:
                # there's a bit of a mix of names because `Convolutional` takes
                # a "step" argument, but `ConvolutionalActivation` takes a "conv_step" argument
                kwargs['conv_step'] = filter_step

            if pool_size[0] == 0 and pool_size[1] == 0:
                layer_conv = ConvolutionalActivation(activation=activation,
                                                     filter_size=filter_size,
                                                     num_filters=num_filters,
                                                     border_mode=border_mode,
                                                     name="layer_%d" % index,
                                                     **kwargs)
            else:
                if pool_step is None:
                    pass
                else:
                    kwargs['pooling_step'] = tuple(pool_step)

                layer_conv = ConvolutionalLayer(activation=activation,
                                                filter_size=filter_size,
                                                num_filters=num_filters,
                                                border_mode=border_mode,
                                                pooling_size=pool_size,
                                                name="layer_%d" % index,
                                                **kwargs)

            conv_layers.append(layer_conv)

        convnet = ConvolutionalSequence(conv_layers, num_channels=num_channels,
                                        image_size=image_size,
                                        weights_init=Uniform(width=0.1),
                                        biases_init=Constant(0.0),
                                        name="conv_section")
        convnet.push_allocation_config()
        convnet.initialize()
        output_dim = np.prod(convnet.get_dim('output'))
        output_conv = convnet.apply(output_conv)
        output_conv = Flattener().apply(output_conv)

    # FULLY CONNECTED
    output_mlp = output_conv
    full_layers = []
    assert len(L_dim_full_layers) == len(L_activation_full)
    assert len(L_dim_full_layers) + 1 == len(L_endo_dropout_full_layers)
    assert len(L_dim_full_layers) + 1 == len(L_exo_dropout_full_layers)

    # regarding the batch dropout : the dropout is applied on the filter
    # which is equivalent to the output dimension
    # you have to look at the dropout_rate of the next layer
    # that is why we throw away the first value of L_exo_dropout_full_layers
    L_exo_dropout_full_layers = L_exo_dropout_full_layers[1:]

    pre_dim = output_dim
    print "When constructing the model, the output_dim of the conv section is %d." % output_dim

    if len(L_dim_full_layers):
        for (dim, activation_str, dropout, index) in zip(L_dim_full_layers,
                                                         L_activation_full,
                                                         L_exo_dropout_full_layers,
                                                         range(len(L_dim_conv_layers),
                                                               len(L_dim_conv_layers) + len(L_dim_full_layers))):
            # TO DO : leaky relu
            if activation_str.lower() == 'rectifier':
                activation = Rectifier().apply
            elif activation_str.lower() == 'tanh':
                activation = Tanh().apply
            elif activation_str.lower() in ['sigmoid', 'logistic']:
                activation = Logistic().apply
            elif activation_str.lower() in ['id', 'identity']:
                activation = Identity().apply
            else:
                raise Exception("unknown activation function : %s" % activation_str)

            assert 0.0 <= dropout < 1.0
            dim = dim - int(dim * dropout)
            print "When constructing the fully-connected section, we apply dropout %f to add an MLP going from pre_dim %d to dim %d." % (dropout, pre_dim, dim)

            layer_full = MLP(activations=[activation], dims=[pre_dim, dim],
                             weights_init=Uniform(width=0.1),
                             biases_init=Constant(0.0),
                             name="layer_%d" % index)
            layer_full.initialize()
            full_layers.append(layer_full)
            pre_dim = dim

        for layer in full_layers:
            output_mlp = layer.apply(output_mlp)

        output_dim = L_dim_full_layers[-1] - int(L_dim_full_layers[-1] * L_exo_dropout_full_layers[-1])

    # COST FUNCTION
    output_layer = Linear(output_dim, prediction,
                          weights_init=Uniform(width=0.1),
                          biases_init=Constant(0.0),
                          name="layer_" + str(len(L_dim_conv_layers) + len(L_dim_full_layers)))
    output_layer.initialize()
    full_layers.append(output_layer)

    y_pred = output_layer.apply(output_mlp)
    y_hat = Softmax().apply(y_pred)
    # SOFTMAX and log likelihood
    y_pred = Softmax().apply(y_pred)

    # be careful. one version expects the output of a softmax; the other expects just the
    # output of the network
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_pred)
    # cost = Softmax().categorical_cross_entropy(y.flatten(), y_pred)
    cost.name = "cost"

    # Misclassification
    error_rate_brick = MisclassificationRate()
    error_rate = error_rate_brick.apply(y.flatten(), y_hat)
    error_rate.name = "error_rate"

    # put names
    D_params, D_kind = build_params(x, T.matrix(), conv_layers, full_layers)

    # test computation graph
    cg = ComputationGraph(cost)

    # DROPOUT
    L_endo_dropout = L_endo_dropout_conv_layers + L_endo_dropout_full_layers

    cg_dropout = cg
    inputs = VariableFilter(roles=[INPUT])(cg.variables)

    for (index, drop_rate) in enumerate(L_endo_dropout):
        for input_ in inputs:
            m = re.match(r"layer_(\d+)_apply.*", input_.name)
            if m and index == int(m.group(1)):
                if drop_rate < 0.0001:
                    print "Skipped applying dropout on %s because the dropout rate was under 0.0001." % input_.name
                    break
                else:
                    cg_dropout = apply_dropout(cg, [input_], drop_rate)
                    print "Applied dropout %f on %s." % (drop_rate, input_.name)
                    break

    cg = cg_dropout

    return (cg, error_rate, cost, D_params, D_kind)
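# Hypothetical call to build_submodel (the values below are illustrative, not taken from any
# experiment in this document), included only to make the parallel-list convention explicit:
# every L_* list describing the convolutional section has one entry per conv layer, and the
# dropout lists for the fully-connected section carry one extra leading entry for that
# section's input.
cg, error_rate, cost, D_params, D_kind = build_submodel(
    input_shape=(3, 32, 32),                  # channels, height, width
    output_dim=10,                            # number of classes
    L_dim_conv_layers=[32, 64],               # filters per conv layer
    L_filter_size=[(5, 5), (5, 5)],
    L_pool_size=[(2, 2), (2, 2)],
    L_activation_conv=['rectifier', 'rectifier'],
    L_dim_full_layers=[256],
    L_activation_full=['rectifier'],
    L_exo_dropout_conv_layers=[0.0, 0.25],    # first entry must be 0.0
    L_exo_dropout_full_layers=[0.25, 0.5],    # one more entry than L_dim_full_layers
    L_endo_dropout_conv_layers=[0.0, 0.0],
    L_endo_dropout_full_layers=[0.5, 0.5])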
def main(save_to, num_epochs): mlp = MLP([Tanh(), Softmax()], [784, 100, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') #attention ---> patch_shape = (16, 16) image_shape = (784, 100) import numpy import theano.tensor as T n_spatial_dims = 2 cropper = SoftRectangularCropper(n_spatial_dims=n_spatial_dims, patch_shape=patch_shape, image_shape=image_shape, kernel=Gaussian()) batch_size = 10 scales = 1.3**numpy.arange(-7, 6) n_patches = len(scales) locations = (numpy.ones( (n_patches, batch_size, 2)) * image_shape / 2).astype(numpy.float32) scales = numpy.tile(scales[:, numpy.newaxis, numpy.newaxis], (1, batch_size, 2)).astype(numpy.float32) Tpatches = T.stack(*[ cropper.apply(x, T.constant(location), T.constant(scale))[0] for location, scale in zip(locations, scales) ]) patches = theano.function([x], Tpatches)(batch['features']) import ipdb as pdb pdb.set_trace() probs = mlp.apply(tensor.flatten(patches, outdim=2)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cg = ComputationGraph([cost]) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + .00005 * (W1**2).sum() + .00005 * (W2**2).sum() cost.name = 'final_cost' mnist_train = MNIST(("train", )) mnist_test = MNIST(("test", )) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], Flatten(DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features', )), prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] if BLOCKS_EXTRAS_AVAILABLE: extensions.append( Plot('MNIST example', channels=[[ 'test_final_cost', 'test_misclassificationrate_apply_error_rate' ], ['train_total_gradient_norm']])) main_loop = MainLoop(algorithm, Flatten(DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features', )), model=Model(cost), extensions=extensions) main_loop.run()
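# A small sketch of the L2 weight-decay idea used above, written against every WEIGHT
# variable collected from the computation graph instead of unpacking W1 and W2 by hand.
# The bricks and sizes are illustrative and the 5e-5 coefficient simply mirrors the one above.
from theano import tensor
from blocks.bricks import MLP, Tanh, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import IsotropicGaussian, Constant
from blocks.roles import WEIGHT

x = tensor.matrix('features')
y = tensor.lmatrix('targets')
mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cost = CategoricalCrossEntropy().apply(y.flatten(), mlp.apply(x))
cg = ComputationGraph([cost])
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cost = cost + 5e-5 * sum((W ** 2).sum() for W in weights)
cost.name = 'final_cost'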
statistics_list=[(M1,S1,a1), (M2,S2,a2), (M3,S3,a3), (M4,S4,a4), (M5,S5,a5)] # initialize_variables # for variable (M,S) in variables: # compute M and S in the whole data. if normalization == 'bn2': for m,s,var in statistics_list: var.tag.aggregation_scheme = MeanAndVariance(var, var.shape[0], axis = 0) init_mn, init_var = DatasetEvaluator([var]).evaluate(stream_train)[var.name] m.set_value(init_mn.astype(floatX)) s.set_value(sqrt(init_var).astype(floatX)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) cost.name = 'cost' error_rate = MisclassificationRate().apply(y.flatten(), probs) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) parameters = cg.parameters # add gradient descent to M,S if normalization == 'bn2': for m,s,var in statistics_list: parameters.extend([m,s]) algorithm = GradientDescent( cost=cost, parameters=parameters, step_rule=Adam(0.01)) #update the M and S with batch statistics alpha = 0.1
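# The fragment above stops right after setting alpha = 0.1; presumably the shared mean M and
# standard deviation S are then tracked with an exponential moving average of the batch
# statistics. A minimal numpy sketch of that update rule, under that assumption (the names
# mirror the fragment but the code is only illustrative):
import numpy as np

def ema_update(m_value, s_value, batch, alpha=0.1):
    """Blend the running mean/std with the statistics of the current batch."""
    batch_mean = batch.mean(axis=0)
    batch_std = batch.std(axis=0)
    new_m = (1.0 - alpha) * m_value + alpha * batch_mean
    new_s = (1.0 - alpha) * s_value + alpha * batch_std
    return new_m, new_s

# m.set_value(new_m.astype(floatX)) and s.set_value(new_s.astype(floatX)) would then push the
# blended statistics back into the shared variables.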
def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False): import os from collections import OrderedDict from fuel.datasets import IndexableDataset from blocks.model import Model from blocks.bricks import Linear, Softmax from blocks.bricks.conv import MaxPooling from blocks.initialization import Uniform from deepthought.bricks.cost import HingeLoss import numpy as np import theano from theano import tensor assert model_prefix is not None fold_weights_filename = '{}_weights.npy'.format(model_prefix) # convert Y to one-hot encoding n_classes = len(set(Y)) Y = np.eye(n_classes, dtype=int)[Y] features = tensor.matrix('features', dtype=theano.config.floatX) targets = tensor.lmatrix('targets') input_ = features dim = X.shape[-1] # optional additional layers if self.pipeline_factory is not None: # need to re-shape flattened input to restore bc01 format input_shape = (input_.shape[0],) + hyper_params['classifier_input_shape'] # tuple, uses actual batch size input_ = input_.reshape(input_shape) pipeline = self.pipeline_factory.build_pipeline(input_shape, hyper_params) input_ = pipeline.apply(input_) input_ = input_.flatten(ndim=2) # this is very hacky, but there seems to be no elegant way to obtain a value for dim dummy_fn = theano.function(inputs=[features], outputs=input_) dummy_out = dummy_fn(X[:1]) dim = dummy_out.shape[-1] if hyper_params['classifier_pool_width'] > 1: # FIXME: this is probably broken! # c = hyper_params['num_components'] # input_ = input_.reshape((input_.shape[0], c, input_.shape[-1] // c, 1)) # restore bc01 # need to re-shape flattened input to restore bc01 format input_shape = hyper_params['classifier_pool_input_shape'] # tuple input_ = input_.reshape(input_shape) pool = MaxPooling(name='pool', input_dim=input_shape[1:], # (c, X.shape[-1] // c, 1), pooling_size=(hyper_params['classifier_pool_width'], 1), step=(hyper_params['classifier_pool_stride'], 1)) input_ = pool.apply(input_) input_ = input_.reshape((input_.shape[0], tensor.prod(input_.shape[1:]))) dim = np.prod(pool.get_dim('output')) linear = Linear(name='linear', input_dim=dim, output_dim=n_classes, weights_init=Uniform(mean=0, std=0.01), use_bias=False) linear.initialize() softmax = Softmax('softmax') probs = softmax.apply(linear.apply(input_)) prediction = tensor.argmax(probs, axis=1) model = Model(probs) # classifier with raw probability outputs predict = theano.function([features], prediction) # ready-to-use predict function if os.path.isfile(fold_weights_filename): # load filter weights from existing file fold_weights = np.load(fold_weights_filename) print 'loaded filter weights from', fold_weights_filename else: # train model from blocks.bricks.cost import MisclassificationRate from blocks.filter import VariableFilter from blocks.graph import ComputationGraph from blocks.roles import WEIGHT from blocks.bricks import Softmax from blocks.model import Model from blocks.algorithms import GradientDescent, Adam from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.extensions.predicates import OnLogRecord from fuel.streams import DataStream from fuel.schemes import SequentialScheme, ShuffledScheme from blocks.monitoring import aggregation from blocks.main_loop import MainLoop from blocks.extensions.training import TrackTheBest from deepthought.extensions.parameters import BestParams # from deepthought.datasets.selection import DatasetMetaDB init_param_values = 
model.get_parameter_values() cost = HingeLoss().apply(targets, probs) # Note: this requires just the class labels, not in a one-hot encoding error_rate = MisclassificationRate().apply(targets.argmax(axis=1), probs) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) # L1 regularization if hyper_params['classifier_l1wdecay'] > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + hyper_params['classifier_l1wdecay'] * sum([abs(W).sum() for W in weights]) cost.name = 'cost' # iterate over trial folds fold_weights = [] fold_errors = [] # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold): # # train_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['train']) # valid_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['valid']) # # metadb = DatasetMetaDB(meta, train_selectors.keys()) # # # get selected trial IDs # train_idx = metadb.select(train_selectors) # valid_idx = metadb.select(valid_selectors) for train_idx, valid_idx in idx_folds: # print train_idx # print valid_idx trainset = IndexableDataset(indexables=OrderedDict( [('features', X[train_idx]), ('targets', Y[train_idx])])) validset = IndexableDataset(indexables=OrderedDict( [('features', X[valid_idx]), ('targets', Y[valid_idx])])) model.set_parameter_values(init_param_values) best_params = BestParams() best_params.add_condition(['after_epoch'], predicate=OnLogRecord('error_rate_valid_best_so_far')) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Adam()) extensions = [Timing(), FinishAfter(after_n_epochs=hyper_params['classifier_max_epochs']), DataStreamMonitoring( [cost, error_rate], DataStream.default_stream( validset, iteration_scheme=SequentialScheme( validset.num_examples, hyper_params['classifier_batch_size'])), suffix="valid"), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], suffix="train", after_epoch=True), TrackTheBest('error_rate_valid'), best_params # after TrackTheBest! ] if verbose: extensions.append(Printing()) # optional extensions.append(ProgressBar()) main_loop = MainLoop( algorithm, DataStream.default_stream( trainset, iteration_scheme=ShuffledScheme(trainset.num_examples, hyper_params['classifier_batch_size'])), model=model, extensions=extensions) main_loop.run() fold_weights.append(best_params.values['/linear.W']) fold_errors.append(main_loop.status['best_error_rate_valid']) # break # FIXME fold_errors = np.asarray(fold_errors).squeeze() print 'simple NN fold classification errors:', fold_errors fold_weights = np.asarray(fold_weights) # store filter weights for later analysis np.save(fold_weights_filename, fold_weights) weights = fold_weights.mean(axis=0) linear.parameters[0].set_value(weights) return model, predict
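# Quick worked example of the one-hot round trip used in `train` above: np.eye(n)[Y] turns
# integer labels into one-hot rows, and argmax(axis=1) recovers the labels, which is what
# MisclassificationRate is fed. Purely illustrative.
import numpy as np

Y = np.array([0, 2, 1, 2])
one_hot = np.eye(3, dtype=int)[Y]
# array([[1, 0, 0],
#        [0, 0, 1],
#        [0, 1, 0],
#        [0, 0, 1]])
assert (one_hot.argmax(axis=1) == Y).all()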
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') x_int = x.astype(dtype='int32').T train_dataset = TextFile('inspirational.txt') train_dataset.indexables[0] = numpy.array(sorted( train_dataset.indexables[0], key=len )) n_voc = len(train_dataset.dict.keys()) init_probs = numpy.array( [sum(filter(lambda idx:idx == w, [s[0] for s in train_dataset.indexables[ train_dataset.sources.index('features')]] )) for w in xrange(n_voc)], dtype=theano.config.floatX ) init_probs = init_probs / init_probs.sum() n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = SimpleRecurrent( dim=n_h, activation=Tanh(), weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=n_voc, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = (linear_embedding.apply(x_int[:-1]) * tensor.shape_padright(m.T[1:])) rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:]) probs = softmax( sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0] ) idx_mask = m.T[1:].nonzero() cost = CategoricalCrossEntropy().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) cost.name = 'cost' misclassification = MisclassificationRate().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=Adam() ) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=train_dataset.num_examples, batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) batch_size = 10 length = 30 trng = MRG_RandomStreams(18032015) u = trng.uniform(size=(length, batch_size, n_voc)) gumbel_noise = -tensor.log(-tensor.log(u)) init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) + gumbel_noise[0]).argmax(axis=-1) init_states = rnn.initial_state('states', batch_size) def sampling_step(g_noise, states, samples_step): embedding_step = linear_embedding.apply(samples_step) next_states = rnn.apply(inputs=embedding_step, states=states, iterate=False) probs_step = softmax(score_layer.apply(next_states)) next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1) return next_states, next_samples [_, samples], _ = theano.scan( fn=sampling_step, sequences=[gumbel_noise[1:]], outputs_info=[init_states, init_samples] ) sampling = theano.function([], samples.owner.inputs[0].T) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification']], titles=['Costs'])) extensions.append(PlotManager('Language modelling example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) extensions.append(PrintSamples(sampler=sampling, voc=train_dataset.inv_dict)) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
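# The sampling loop above relies on the Gumbel-max trick: adding independent Gumbel(0, 1)
# noise (-log(-log(U)) with U uniform) to the log-probabilities and taking the argmax draws an
# exact sample from the categorical distribution. A small numpy check of that fact
# (illustrative only):
import numpy as np

rng = np.random.RandomState(18032015)
probs = np.array([0.2, 0.5, 0.3])
draws = 100000
u = rng.uniform(size=(draws, len(probs)))
gumbel = -np.log(-np.log(u))
samples = (np.log(probs) + gumbel).argmax(axis=1)
print np.bincount(samples) / float(draws)  # close to [0.2, 0.5, 0.3]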
def build_and_run(label, config): ############## CREATE THE NETWORK ############### #Define the parameters num_epochs, num_batches, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation = config['num_epochs'], config['num_batches'], config['num_channels'], config['image_shape'], config['filter_size'], config['num_filter'], config['pooling_sizes'], config['mlp_hiddens'], config['output_size'], config['batch_size'], config['activation'], config['mlp_activation'] # print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation) lambda_l1 = 0.000025 lambda_l2 = 0.000025 print("Building model") #Create the symbolics variable x = T.tensor4('image_features') y = T.lmatrix('targets') #Get the parameters conv_parameters = zip(filter_size, num_filter) #Create the convolutions layers conv_layers = list(interleave([(Convolutional( filter_size=filter_size, num_filters=num_filter, name='conv_{}'.format(i)) for i, (filter_size, num_filter) in enumerate(conv_parameters)), (activation), (MaxPooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))])) # (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))])) #Create the sequence conv_sequence = ConvolutionalSequence(conv_layers, num_channels, image_size=image_shape, weights_init=Uniform(width=0.2), biases_init=Constant(0.)) #Initialize the convnet conv_sequence.initialize() #Add the MLP top_mlp_dims = [np.prod(conv_sequence.get_dim('output'))] + mlp_hiddens + [output_size] out = Flattener().apply(conv_sequence.apply(x)) mlp = MLP(mlp_activation, top_mlp_dims, weights_init=Uniform(0, 0.2), biases_init=Constant(0.)) #Initialisze the MLP mlp.initialize() #Get the output predict = mlp.apply(out) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') ########### REGULARIZATION ################## cg = ComputationGraph([cost]) weights = VariableFilter(roles=[WEIGHT])(cg.variables) biases = VariableFilter(roles=[BIAS])(cg.variables) # # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer l2_penalty = T.sum([lambda_l2 * (W ** 2).sum() for i,W in enumerate(weights+biases)]) # Gradually increase penalty for layer # # #l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases]) # # #l2_penalty = l2_penalty_weights + l2_penalty_bias l2_penalty.name = 'l2_penalty' l1_penalty = T.sum([lambda_l1*T.abs_(z).sum() for z in weights+biases]) # l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer # l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases]) # l1_penalty = l1_penalty_biases + l1_penalty_weights l1_penalty.name = 'l1_penalty' costreg = cost + l2_penalty + l1_penalty costreg.name = 'costreg' ########### DEFINE THE ALGORITHM ############# # algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum()) algorithm = GradientDescent(cost=costreg, parameters=cg.parameters, step_rule=Adam()) ########### GET THE DATA ##################### 
    istest = 'test' in config.keys()
    train_stream, valid_stream, test_stream = get_stream(batch_size, image_shape, test=istest)

    ########### INITIALIZING EXTENSIONS ##########
    checkpoint = Checkpoint('models/best_' + label + '.tar')
    checkpoint.add_condition(['after_epoch'],
                             predicate=OnLogRecord('valid_error_rate_best_so_far'))
    # Adding a live plot with the bokeh server
    plot = Plot(label,
                channels=[['train_error_rate', 'valid_error_rate'],
                          ['valid_cost', 'valid_error_rate2']],
                          # ['train_costreg', 'train_grad_norm'],
                          # ['train_costreg', 'train_total_gradient_norm', 'train_l2_penalty', 'train_l1_penalty']],
                server_url="http://hades.calculquebec.ca:5042")

    grad_norm = aggregation.mean(algorithm.total_gradient_norm)
    grad_norm.name = 'grad_norm'

    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
                  DataStreamMonitoring([cost, error_rate, error_rate2], valid_stream, prefix="valid"),
                  TrainingDataMonitoring([costreg, error_rate, error_rate2, grad_norm, l2_penalty, l1_penalty],
                                         prefix="train", after_epoch=True),
                  plot,
                  ProgressBar(),
                  Printing(),
                  TrackTheBest('valid_error_rate', min),  # Keep best
                  checkpoint,  # Save best
                  FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4)]  # Early-stopping

    model = Model(cost)
    main_loop = MainLoop(algorithm, data_stream=train_stream, model=model, extensions=extensions)
    main_loop.run()
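# The early-stopping wiring above hinges on a log-record naming convention that is easy to
# miss: TrackTheBest('valid_error_rate') emits a 'valid_error_rate_best_so_far' record, and
# both the checkpoint condition and FinishIfNoImprovementAfter listen for that same record.
# A stripped-down sketch of just that trio; the path and patience are illustrative, and the
# import locations are the usual Blocks ones.
from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.extensions.training import TrackTheBest

checkpoint = Checkpoint('models/best_model.tar')
checkpoint.add_condition(['after_epoch'],
                         predicate=OnLogRecord('valid_error_rate_best_so_far'))
extensions = [TrackTheBest('valid_error_rate', choose_best=min),
              checkpoint,
              FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4)]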
if mode is "CPU_test": data_train_stream = create_data(DogsVsCats(('train',), subset=slice(0, 100))) data_valid_stream = create_data(DogsVsCats(('train',), subset=slice(100, 110))) if mode is "GPU_run": data_train_stream = create_data(DogsVsCats(('train',), subset=slice(0, 22500))) data_valid_stream = create_data(DogsVsCats(('train',), subset=slice(22500, 25000))) if mode is "data_server": data_train_stream = ServerDataStream(('image_features','targets'), False, port=5560) data_valid_stream = ServerDataStream(('image_features','targets'), False, port=5561) ### Setting up the model probs = top_mlp.apply(conv_out) cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), probs) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ### Gradient Descent algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learning_rate)) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate, error_rate2], data_valid_stream, prefix="valid"), TrainingDataMonitoring( [cost, error_rate,
def train(train_set, test_set, l2_weight=1e-18):
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    n_classifiers = 3
    n_classes = 2

    l1 = Linear(
        name='l1',
        input_dim=2,
        output_dim=10,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
    )
    l1.initialize()
    h1 = Logistic().apply(l1.apply(x))

    # the second layer needs its own brick name; reusing 'l1' would clash with the layer above
    l2 = Linear(
        name='l2',
        input_dim=l1.output_dim,
        output_dim=n_classes * n_classifiers,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
    )
    l2.initialize()
    h2 = l2.apply(h1)

    y_hat = MultiTargetSoftmax().apply(h2, n_classes, n_classifiers)

    cost = MultiTargetCategoricalCrossEntropy().apply(y, y_hat)
    error = MisclassificationRate().apply(y, y_hat)
    error.name = 'misclassification_rate'

    cg = ComputationGraph(cost)
    for w in VariableFilter(roles=[WEIGHT])(cg.variables):
        cost += l2_weight * (w ** 2).sum()
    cost.name = 'cost_with_regularization'

    # print('W1', W1.get_value())
    # print('W2', W2.get_value())

    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=RMSProp()
    )

    data_stream_train = Flatten(
        DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples, batch_size=80)
        )
    )
    data_stream_test = Flatten(
        DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme(test_set.num_examples, batch_size=1)
        )
    )

    monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=data_stream_test,
        prefix="test"
    )

    main_loop = MainLoop(
        data_stream=data_stream_train,
        algorithm=algorithm,
        extensions=[
            monitor,
            FinishAfter(after_n_epochs=100),
            Printing(),
            # ProgressBar()
        ]
    )
    main_loop.run()

    return x, y_hat
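# MultiTargetSoftmax and MultiTargetCategoricalCrossEntropy are custom bricks, so this is only
# a guess at their behaviour: a single linear output of size n_classes * n_classifiers is
# sliced into n_classifiers independent softmax problems. A plain Theano sketch of that
# reshape-and-normalize step, under that assumption:
import theano
import theano.tensor as T

def multi_target_softmax(activations, n_classes, n_classifiers):
    """Softmax over the last axis of a (batch, n_classifiers, n_classes) reshape."""
    reshaped = activations.reshape((activations.shape[0], n_classifiers, n_classes))
    e = T.exp(reshaped - reshaped.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

acts = T.matrix('acts')  # shape: (batch, n_classifiers * n_classes)
probs = multi_target_softmax(acts, n_classes=2, n_classifiers=3)
f = theano.function([acts], probs)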
def main(save_to, num_epochs, flag, ksize): batch_size = 128 dim = 100 n_steps = 20 i2h1 = MLP([Identity()], [784, dim], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001)) h2o1 = MLP([Rectifier(), Softmax()], [dim, dim, 10], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001)) rec1 = SimpleRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal()) i2h1.initialize() h2o1.initialize() rec1.initialize() x = tensor.tensor3('features') y = tensor.lmatrix('targets') preproc = i2h1.apply(x) h1 = rec1.apply(preproc) probs = tensor.flatten(h2o1.apply(h1[-1],), outdim=2) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) error_rate = MisclassificationRate().apply(y.flatten(), probs) cost.name = 'final_cost' error_rate.name = 'error_rate' cg = ComputationGraph([cost, error_rate]) mnist_train = MNIST("train", subset=slice(0, 50000)) mnist_valid = MNIST("train", subset=slice(50000, 60000)) mnist_test = MNIST("test") trainstream = Mapping(Flatten(DataStream(mnist_train, iteration_scheme=SequentialScheme(50000, batch_size))), _meanize(n_steps, flag, ksize)) validstream = Mapping(Flatten(DataStream(mnist_valid, iteration_scheme=SequentialScheme(10000, batch_size))), _meanize(n_steps, flag, ksize)) teststream = Mapping(Flatten(DataStream(mnist_test, iteration_scheme=SequentialScheme(10000, batch_size))), _meanize(n_steps, flag, ksize)) algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=CompositeRule([Adam(), StepClipping(100)])) main_loop = MainLoop( algorithm, trainstream, extensions=[Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring( [cost, error_rate], validstream, prefix="test"), DataStreamMonitoringAndSaving( [cost, error_rate], teststream, [i2h1, h2o1, rec1], 'best_'+save_to+'.pkl', cost_name=error_rate.name, after_epoch=True, prefix='valid' ), TrainingDataMonitoring( [cost, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), # Plot( # save_to, # channels=[ # ['test_final_cost', # 'test_misclassificationrate_apply_error_rate'], # ['train_total_gradient_norm']]), Printing()]) main_loop.run()
def main(): print("Build the network") input_of_image = tensor.matrix('features') input_to_hidden = Linear(name='input_to_hidden', input_dim=784, output_dim=100) h = Tanh().apply(input_to_hidden.apply(input_of_image)) hidden_to_output = Linear(name='hidden_to_output', input_dim=100, output_dim=10) output_hat = Softmax().apply(hidden_to_output.apply(h)) output = tensor.lmatrix('targets') cost = CategoricalCrossEntropy().apply(output.flatten(), output_hat) correct_rate = 1 - MisclassificationRate().apply(output.flatten(), output_hat) correct_rate.name = 'correct_rate' print(type(correct_rate)) cost.name = 'cost' cg = ComputationGraph(cost) # Initialize the parameters input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian( 0.01) input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0) input_to_hidden.initialize() hidden_to_output.initialize() # Train print("Prepare the data.") mnist_train = MNIST("train") mnist_test = MNIST("test") ## Carve the data into lots of batches. data_stream_train = DataStream(mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, batch_size=256)) ## Set the algorithm for the training. algorithm = GradientDescent(cost=cost, params=cg.parameters, step_rule=CompositeRule( [Scale(0.9), StepSwitcher(0.05, 0.1)])) ## Add a monitor extension for the training. data_stream_test = DataStream( mnist_test, iteration_scheme=SequentialScheme(mnist_test.num_examples, batch_size=1024)) test_monitor = DataStreamMonitoring(variables=[cost, correct_rate], data_stream=data_stream_test, prefix="test", after_every_epoch=True) train_monitor = TrainingDataMonitoring( variables=[cost, correct_rate, algorithm.total_step_norm], prefix='train', after_every_batch=True) ## Add a plot monitor. plot = Plot(document='new', channels=[['train_correct_rate', 'test_correct_rate']], start_server=True, after_every_batch=True) print("Start training") main_loop = MainLoop(algorithm=algorithm, data_stream=data_stream_train, extensions=[ plot, test_monitor, train_monitor, FinishAfter(after_n_epochs=20), Printing() ]) main_loop.run()