def setup_model(configs): tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5) # shape: T x B x C x X x Y input_ = tensor5("features") tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3) locs = tensor3("locs") # shape: B x Classes target = T.ivector("targets") model = LSTMAttention(configs, weights_init=Glorot(), biases_init=Constant(0)) model.initialize() (h, c, location, scale, alpha, patch, downn_sampled_input, conved_part_1, conved_part_2, pre_lstm) = model.apply( input_, locs ) model.location = location model.scale = scale model.alpha = location model.patch = patch classifier = MLP( [Rectifier(), Softmax()], configs["classifier_dims"], weights_init=Glorot(), biases_init=Constant(0) ) classifier.initialize() probabilities = classifier.apply(h[-1]) cost = CategoricalCrossEntropy().apply(target, probabilities) cost.name = "CE" error_rate = MisclassificationRate().apply(target, probabilities) error_rate.name = "ER" model.cost = cost model.error_rate = error_rate model.probabilities = probabilities if configs["load_pretrained"]: blocks_model = Model(model.cost) all_params = blocks_model.parameters with open("VGG_CNN_params.npz") as f: loaded = np.load(f) all_conv_params = loaded.keys() for param in all_params: if param.name in loaded.keys(): assert param.get_value().shape == loaded[param.name].shape param.set_value(loaded[param.name]) all_conv_params.pop(all_conv_params.index(param.name)) print "the following parameters did not match: " + str(all_conv_params) if configs["test_model"]: print "TESTING THE MODEL: CHECK THE INPUT SIZE!" cg = ComputationGraph(model.cost) f = theano.function(cg.inputs, [model.cost], on_unused_input="ignore", allow_input_downcast=True) data = configs["get_streams"](configs["batch_size"])[0].get_epoch_iterator().next() f(data[1], data[0], data[2]) print "Test passed! ;)" model.monitorings = [cost, error_rate] return model
def setup_model(configs): tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5) # shape: T x B x C x X x Y input_ = tensor5('features') # shape: B x Classes target = T.lmatrix('targets') model = LSTMAttention( configs, weights_init=Glorot(), biases_init=Constant(0)) model.initialize() (h, c, location, scale, patch, downn_sampled_input, conved_part_1, conved_part_2, pre_lstm) = model.apply(input_) classifier = MLP( [Rectifier(), Logistic()], configs['classifier_dims'], weights_init=Glorot(), biases_init=Constant(0)) classifier.initialize() probabilities = classifier.apply(h[-1]) cost = BinaryCrossEntropy().apply(target, probabilities) cost.name = 'CE' error_rate = MisclassificationRate().apply(target, probabilities) error_rate.name = 'ER' model.cost = cost if configs['load_pretrained']: blocks_model = Model(model.cost) all_params = blocks_model.parameters with open('VGG_CNN_params.npz') as f: loaded = np.load(f) all_conv_params = loaded.keys() for param in all_params: if param.name in loaded.keys(): assert param.get_value().shape == loaded[param.name].shape param.set_value(loaded[param.name]) all_conv_params.pop(all_conv_params.index(param.name)) print "the following parameters did not match: " + str(all_conv_params) if configs['test_model']: cg = ComputationGraph(model.cost) f = theano.function(cg.inputs, [model.cost], on_unused_input='ignore', allow_input_downcast=True) data = np.random.randn(10, 40, 3, 224, 224) targs = np.random.randn(40, 101) f(data, targs) print "Test passed! ;)" model.monitorings = [cost, error_rate] return model
def pretrain(model, hyper_params, full_hdf5, full_meta, train_selectors,
             valid_selectors=None):
    """Pre-train a siamese encoder with a triplet hinge loss.

    Generic: works with any network structure that exposes its prediction
    as ``model.outputs[0]``. Builds triplet datasets for training (and
    optionally validation) and delegates the actual training loop to
    ``GenericNNEncoderExperiment.run_pretrain``.
    """
    from theano import tensor
    from blocks.bricks.cost import MisclassificationRate
    from deepthought.datasets.triplet import TripletsIndexDataset
    from deepthought.bricks.cost import HingeLoss

    target_source = hyper_params['pretrain_target_source']
    grouping = hyper_params['group_attribute']

    train_data = TripletsIndexDataset(
        full_hdf5, full_meta, train_selectors,
        targets_source=target_source,
        group_attribute=grouping)

    valid_data = None
    if valid_selectors is not None:
        # Optionally draw extension triplets from the training selectors.
        ext = (train_selectors
               if hyper_params['use_ext_dataset_for_validation'] else None)
        valid_data = TripletsIndexDataset(
            full_hdf5, full_meta, valid_selectors,
            ext_selectors=ext,
            targets_source=target_source,
            group_attribute=grouping)

    # One-hot encoded targets; has to match the sources the dataset defines.
    y = tensor.lmatrix('targets')
    probs = model.outputs[0]
    cost = HingeLoss().apply(y, probs)

    # MisclassificationRate wants plain class labels, not a one-hot encoding.
    error_rate = MisclassificationRate().apply(y.argmax(axis=1), probs)
    error_rate.name = 'error_rate'

    return GenericNNEncoderExperiment.run_pretrain(
        model, hyper_params, cost, train_data, valid_data, [error_rate])
def main(name, epochs, batch_size, learning_rate, window_size, conv_sizes,
         num_filters, fc_dim, enc_dim, dec_dim, step, num_digits, num_classes,
         oldmodel, live_plotting):
    """Build and train an EDRAM attention model on cluttered MNIST.

    Constructs the conv/BN/LSTM bricks, wires them into an EDRAM brick,
    sets up batch-normalized training and population-statistics updates,
    monitoring, checkpointing, and runs the Blocks main loop. Optionally
    warm-starts all parameters and BN population statistics from a pickled
    old model.

    Improvement over the original: the old-model restore section (26
    near-identical set_value(get_value()) lines) is refactored into a
    data-driven loop; the BN update-parameter list is built the same way.
    """
    channels, img_height, img_width = 1, 100, 100
    rnninits = {
        'weights_init': Uniform(width=0.02),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    rec_inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    convinits = {
        'weights_init': Uniform(width=.2),
        'biases_init': Constant(0.),
    }
    n_iter = step * num_digits
    # assumes conv_sizes has exactly two entries — TODO confirm with callers
    filter_size1, filter_size2 = zip(conv_sizes, conv_sizes)[:]
    w_height, w_width = window_size.split(',')
    w_height = int(w_height)
    w_width = int(w_width)

    subdir = time.strftime("%Y-%m-%d") + "-" + name
    if not os.path.exists(subdir):
        os.makedirs(subdir)
    lines = ["\n Running experiment",
             " subdirectory: %s" % subdir,
             " learning rate: %g" % learning_rate,
             " attention size: %s" % window_size,
             " n_iterations: %d" % n_iter,
             " encoder dimension: %d" % enc_dim,
             " decoder dimension: %d" % dec_dim,
             " batch size: %d" % batch_size,
             " epochs: %d" % epochs,
             ]
    for line in lines:
        print(line)
    print()

    # ------------------------ glimpse network bricks ------------------------
    rectifier = Rectifier()
    conv1 = Convolutional(filter_size=filter_size2,
                          num_filters=int(num_filters / 2),
                          num_channels=channels,
                          image_size=(w_height, w_width),
                          border_mode='half', name='conv1', **convinits)
    conv1_bn = SpatialBatchNormalization(input_dim=(64, 26, 26),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv1_bn')
    conv2 = Convolutional(filter_size=filter_size2,
                          num_channels=int(num_filters / 2),
                          num_filters=int(num_filters / 2),
                          image_size=(26, 26), name='conv2', **convinits)
    conv2_bn = SpatialBatchNormalization(input_dim=(64, 24, 24),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv2_bn')
    max_pooling = MaxPooling(pooling_size=(2, 2), step=(2, 2))
    conv3 = Convolutional(filter_size=filter_size2,
                          num_filters=num_filters,
                          num_channels=int(num_filters / 2),
                          image_size=(12, 12),
                          border_mode='half', name='conv3', **convinits)
    conv3_bn = SpatialBatchNormalization(input_dim=(128, 12, 12),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv3_bn')
    conv4 = Convolutional(filter_size=filter_size2,
                          num_filters=num_filters,
                          num_channels=num_filters,
                          image_size=(12, 12),
                          border_mode='half', name='conv4', **convinits)
    conv4_bn = SpatialBatchNormalization(input_dim=(128, 12, 12),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv4_bn')
    # Max Pooling
    conv5 = Convolutional(filter_size=filter_size2,
                          num_filters=160,
                          num_channels=num_filters,
                          image_size=(6, 6),
                          border_mode='half', name='conv5', **convinits)
    conv5_bn = SpatialBatchNormalization(input_dim=(160, 6, 6),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv5_bn')
    conv6 = Convolutional(filter_size=filter_size2,
                          num_filters=192,
                          num_channels=160,
                          image_size=(6, 6), name='conv6', **convinits)
    conv6_bn = SpatialBatchNormalization(input_dim=(192, 4, 4),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv6_bn')

    # ------------------------ fully-connected bricks ------------------------
    conv_mlp = MLP(activations=[Identity()], dims=[3072, fc_dim],
                   name="MLP_conv", **inits)
    conv_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False,
                                     n_iter=n_iter, name='conv_mlp_bn')
    loc_mlp = MLP(activations=[Identity()], dims=[6, fc_dim],
                  name="MLP_loc", **inits)
    loc_mlp_bn = BatchNormalization(input_dim=fc_dim, conserve_memory=False,
                                    n_iter=n_iter, name='loc_mlp_bn')
    encoder_mlp = MLP([Identity()], [fc_dim, 4 * enc_dim],
                      name="MLP_enc", **rec_inits)
    decoder_mlp = MLP([Identity()], [enc_dim, 4 * dec_dim],
                      name="MLP_dec", **rec_inits)
    encoder_rnn = LSTM(activation=Tanh(), dim=enc_dim,
                       name="RNN_enc", **rnninits)

    # Small conv net used to initialize the recurrent state from the coarse view.
    conv_init = ConvolutionalSequence(
        [Convolutional(filter_size=filter_size1,
                       num_filters=int(num_filters / 8),
                       name='conv1_init'),
         SpatialBatchNormalization(conserve_memory=False,
                                   name='conv1_bn_init'),
         Convolutional(filter_size=filter_size2,
                       num_filters=int(num_filters / 8),
                       name='conv2_init'),
         SpatialBatchNormalization(conserve_memory=False,
                                   name='conv2_bn_init'),
         Convolutional(filter_size=filter_size2,
                       num_filters=int(num_filters / 4),
                       name='conv3_init'),
         SpatialBatchNormalization(conserve_memory=False,
                                   name='conv3_bn_init'),
         ], image_size=(12, 12), num_channels=channels,
        name='conv_seq_init', **convinits)

    decoder_rnn = LSTM(activation=Tanh(), dim=dec_dim,
                       name="RNN_dec", **rnninits)
    # Emission MLP starts at the identity affine transform (biases spell the
    # 2x3 identity matrix), so the first glimpse covers the whole image.
    emit_mlp = MLP(activations=[Tanh()], dims=[dec_dim, 6], name='emit_mlp',
                   weights_init=Constant(0.),
                   biases_init=Constant((1., 0., 0., 0., 1., 0.)))

    classification_mlp1 = MLP(activations=[Identity()],
                              dims=[enc_dim, fc_dim],
                              name='MPL_class1', **inits)
    classification_mlp1_bn = BatchNormalization(input_dim=fc_dim,
                                                conserve_memory=False,
                                                n_iter=n_iter,
                                                name='classification_mlp1_bn')
    classification_mlp2 = MLP(activations=[Identity()],
                              dims=[fc_dim, fc_dim],
                              name='MPL_class2', **inits)
    classification_mlp2_bn = BatchNormalization(input_dim=fc_dim,
                                                conserve_memory=False,
                                                n_iter=n_iter,
                                                name='classification_mlp2_bn')
    classification_mlp3 = MLP(activations=[Softmax()],
                              dims=[fc_dim, num_classes],
                              name='MPL_class3', **inits)

    edram = EDRAM(channels=channels, out_height=w_height, out_width=w_width,
                  n_iter=n_iter, num_classes=num_classes, rectifier=rectifier,
                  conv1=conv1, conv1_bn=conv1_bn, conv2=conv2,
                  conv2_bn=conv2_bn, max_pooling=max_pooling, conv3=conv3,
                  conv3_bn=conv3_bn, conv4=conv4, conv4_bn=conv4_bn,
                  conv5=conv5, conv5_bn=conv5_bn, conv6=conv6,
                  conv6_bn=conv6_bn, conv_mlp=conv_mlp,
                  conv_mlp_bn=conv_mlp_bn, loc_mlp=loc_mlp,
                  loc_mlp_bn=loc_mlp_bn, conv_init=conv_init,
                  encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn,
                  decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn,
                  classification_mlp1=classification_mlp1,
                  classification_mlp1_bn=classification_mlp1_bn,
                  classification_mlp2=classification_mlp2,
                  classification_mlp2_bn=classification_mlp2_bn,
                  classification_mlp3=classification_mlp3,
                  emit_mlp=emit_mlp)
    edram.initialize()

    # ------------------------------------------------------------------------
    x = T.ftensor4('features')
    x_coarse = T.ftensor4('features_coarse')
    y = T.ivector('labels')
    wr = T.fmatrix('locations')

    # Training graph with batch statistics; calculate_train also returns the
    # per-brick batch means/stdevs used below for population updates.
    with batch_normalization(edram):
        bn_p, bn_l, m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, \
            m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, \
            m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, \
            m_cl2_bn, s_cl2_bn = edram.calculate_train(x, x_coarse)

    def compute_cost(p, wr, y, l):
        # Weighted squared error on the 6 affine location params + per-step CE.
        cost_where = T.dot(T.sqr(wr - l), [1, 0.5, 1, 0.5, 1, 1])
        cost_y = T.stack(
            [T.nnet.categorical_crossentropy(T.maximum(p[i, :], 1e-7), y)
             for i in range(0, n_iter)])
        return cost_where, cost_y

    cost_where, cost_y = compute_cost(bn_p, wr, y, bn_l)
    bn_cost = cost_y + cost_where
    bn_cost = bn_cost.sum(axis=0)
    bn_cost = bn_cost.mean()
    bn_cost.name = 'cost'
    bn_error_rate = MisclassificationRate().apply(y, bn_p[-1])
    bn_error_rate.name = 'error_rate'

    # ------------------------------------------------------------
    bn_cg = ComputationGraph([bn_cost, bn_error_rate])

    # Prepare algorithm
    algorithm = GradientDescent(
        cost=bn_cg.outputs[0],
        on_unused_sources='ignore',
        parameters=bn_cg.parameters,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            StepClipping(10.),
            Adam(learning_rate)
        ])
    )

    # Population-statistics updates for every BN brick, paired with the batch
    # statistics returned by calculate_train (order must match bn_bricks).
    pop_updates = get_batch_normalization_updates(bn_cg)
    bn_bricks = [conv1_bn, conv2_bn, conv3_bn, conv4_bn, conv5_bn, conv6_bn,
                 conv_mlp_bn, loc_mlp_bn,
                 classification_mlp1_bn, classification_mlp2_bn]
    update_params = []
    for brick in bn_bricks:
        update_params.extend([brick.population_mean, brick.population_stdev])
    update_values = [m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn,
                     m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn,
                     m_c_bn, s_c_bn, m_l_bn, s_l_bn,
                     m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn]
    pop_updates.extend(zip(update_params, update_values))

    # Exponential moving average of population statistics.
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]
    algorithm.add_updates(extra_updates)

    # ------------------------------------------------------------------------
    # Setup monitors: inference graph using population statistics.
    p, l = edram.calculate_test(x, x_coarse)
    cost_where, cost_y = compute_cost(p, wr, y, l)
    cost = cost_y + cost_where
    cost = cost.sum(axis=0)
    cost = cost.mean()
    cost.name = 'cost'
    error_rate = MisclassificationRate().apply(y, p[-1])
    error_rate.name = 'error_rate'
    monitors = [cost, error_rate]

    plotting_extensions = []
    # Live plotting...
    if live_plotting:
        plot_channels = [
            ['train_cost', 'test_cost'],
            ['train_error_rate', 'test_error_rate'],
        ]
        plotting_extensions = [
            Plot(subdir, channels=plot_channels,
                 server_url='http://155.69.150.60:80/')
        ]

    # ------------------------------------------------------------
    mnist_cluttered_train = MNISTCluttered(
        which_sets=['train'], sources=('features', 'locations', 'labels'))
    mnist_cluttered_test = MNISTCluttered(
        which_sets=['test'], sources=('features', 'locations', 'labels'))

    main_loop = MainLoop(
        model=Model([bn_cost]),
        data_stream=DataStream.default_stream(
            mnist_cluttered_train,
            iteration_scheme=ShuffledScheme(
                mnist_cluttered_train.num_examples, batch_size)),
        algorithm=algorithm,
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=epochs),
                    DataStreamMonitoring(
                        monitors,
                        DataStream.default_stream(
                            mnist_cluttered_train,
                            iteration_scheme=SequentialScheme(
                                mnist_cluttered_train.num_examples,
                                batch_size)),
                        prefix='train'),
                    DataStreamMonitoring(
                        monitors,
                        DataStream.default_stream(
                            mnist_cluttered_test,
                            iteration_scheme=SequentialScheme(
                                mnist_cluttered_test.num_examples,
                                batch_size)),
                        prefix="test"),
                    PartsOnlyCheckpoint("{}/{}".format(subdir, name),
                                        before_training=False,
                                        after_epoch=True,
                                        save_separately=['log', ]),
                    TrackTheBest('test_error_rate', 'best_test_error_rate'),
                    # NOTE(review): 'BestCheckpount' looks like a typo of
                    # 'BestCheckpoint' but may be the project's actual class
                    # name — confirm against the extension module.
                    BestCheckpount("{}/{}".format(subdir, name),
                                   'best_test_error_rate',
                                   save_separately=['model', ]),
                    Printing(),
                    ProgressBar(),
                    PrintingTo("\n".join(lines),
                               "{}/{}_log.txt".format(subdir, name)),
                    ] + plotting_extensions)

    if oldmodel is not None:
        # Warm start: copy all parameters plus every BN brick's population
        # statistics from the pickled old model (refactored from 26 repeated
        # set_value(get_value()) lines into one loop).
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(
                oldmodel.get_parameter_values())
            new_top = main_loop.model.get_top_bricks()[0]
            old_top = oldmodel.get_top_bricks()[0]
            bn_names = ['conv1_bn', 'conv2_bn', 'conv3_bn', 'conv4_bn',
                        'conv5_bn', 'conv6_bn', 'loc_mlp_bn', 'conv_mlp_bn',
                        'classification_mlp1_bn', 'classification_mlp2_bn']
            bn_pairs = [(getattr(new_top, nm), getattr(old_top, nm))
                        for nm in bn_names]
            # conv_init's BN layers sit at indices 1, 3, 5 of the sequence.
            bn_pairs += [(new_top.conv_init.layers[i],
                          old_top.conv_init.layers[i]) for i in (1, 3, 5)]
            for new_bn, old_bn in bn_pairs:
                new_bn.population_mean.set_value(
                    old_bn.population_mean.get_value())
                new_bn.population_stdev.set_value(
                    old_bn.population_stdev.get_value())
        del oldmodel

    main_loop.run()
prototype=input_mlp, ) parallel_nets.initialize() l_h, r_h = parallel_nets.apply(l_x=l_x, r_x=r_x) # Concatenate the inputs from the two hidden subnets into a single variable # for input into the next layer. merge = tensor.concatenate([l_h, r_h], axis=1) y_hat = output_mlp.apply(merge) # Define a cost function to optimize, and a classification error rate: # Also apply the outputs from the net, and corresponding targets: cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat) error = MisclassificationRate().apply(y.flatten(), y_hat) error.name = 'error' # Need to define the computation graph: graph = ComputationGraph(cost) # This returns a list of weight vectors for each layer W = VariableFilter(roles=[WEIGHT])(graph.variables) # Add some regularization to this model: lam = 0.001 cost += lam * l2_norm(W) cost.name = 'entropy' # This is the model without dropout, but with l2 reg. model = Model(cost)
def train_net(net, train_stream, test_stream, L1=None, L2=None,
              early_stopping=False, finish=None, dropout=False, jobid=None,
              update=None, duration=None, **ignored):
    """Train `net` with categorical cross-entropy plus optional L1/L2/dropout.

    Parameters
    ----------
    net : brick with .apply / .initialize
        The network to train (4D image input, softmax-style output).
    train_stream, test_stream : fuel streams
        Streams providing 'image_features' and 'targets'.
    L1, L2 : float or None
        Regularization strengths; skipped when falsy.
    early_stopping : bool
        Stop when the tracked test error stops improving.
    finish : int or None
        Hard cap on the number of epochs.
    dropout : bool
        Apply 0.5 dropout to the weight variables.
    jobid : str or None
        Prefix for checkpoint filenames (falls back to the PID).
    update : str or None
        'rmsprop' selects RMSProp; otherwise plain Scale(0.1).
    duration : float or None
        Wall-clock time limit in seconds.
    """
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')
    y_hat = net.apply(x)

    # Cost
    cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_before.name = "cost_without_regularization"

    # Error — taken from brodesf
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = "Misclassification rate"

    # Regularization
    cg = ComputationGraph(cost_before)
    WS = VariableFilter(roles=[WEIGHT])(cg.variables)
    if dropout:
        print("Dropout")
        cg = apply_dropout(cg, WS, 0.5)
        # FIX: use the dropout-transformed cost. Previously the dropped
        # graph's output was discarded and training used the original
        # cost, so dropout=True had no effect.
        cost_before = cg.outputs[0]
    if L1:
        print("L1 with lambda ", L1)
        L1_reg = L1 * sum([abs(W).sum() for W in WS])
        L1_reg.name = "L1 regularization"
        cost_before += L1_reg
    if L2:
        print("L2 with lambda ", L2)
        L2_reg = L2 * sum([(W ** 2).sum() for W in WS])
        L2_reg.name = "L2 regularization"
        cost_before += L2_reg
    cost = cost_before
    cost.name = 'cost_with_regularization'

    # Initialization
    print("Initilization")
    net.initialize()

    # Algorithm
    step_rule = Scale(learning_rate=0.1)
    if update is not None:
        if update == "rmsprop":
            print("Using RMSProp")
            step_rule = RMSProp()
    remove_not_finite = RemoveNotFinite(0.9)
    step_rule = CompositeRule([step_rule, remove_not_finite])
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=step_rule)

    print("Extensions")
    extensions = []

    # Monitoring
    monitor = DataStreamMonitoring(variables=[cost, error],
                                   data_stream=test_stream, prefix="test")
    extensions.append(monitor)

    def filename(suffix=""):
        # Unique checkpoint path: job id (or PID) + timestamp + suffix.
        prefix = jobid if jobid else str(os.getpid())
        ctime = str(time.time())
        return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip"

    # Serialization
    #serialization = Checkpoint(filename())
    #extensions.append(serialization)
    notification = "test_" + error.name
    track = TrackTheBest(notification)
    best_notification = track.notification_name
    checkpointbest = SaveBest(best_notification, filename("best"))
    extensions.extend([track, checkpointbest])

    if early_stopping:
        print("Early stopping")
        stopper = FinishIfNoImprovementAfterPlus(best_notification)
        extensions.append(stopper)

    # Other extensions
    if finish is not None:
        print("Force finish ", finish)
        extensions.append(FinishAfter(after_n_epochs=finish))
    if duration is not None:
        print("Stop after ", duration, " seconds")
        extensions.append(FinishAfterTime(duration))
    extensions.extend([Timing(), Printing()])

    # Main loop
    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         extensions=extensions)
    print("Main loop start")
    main_loop.run()
def train_net(net, train_stream, test_stream, L1=None, L2=None,
              early_stopping=False, finish=None, dropout=False, jobid=None,
              update=None, duration=None, **ignored):
    """Train `net` with categorical cross-entropy plus optional L1/L2/dropout.

    `train_stream`/`test_stream` must provide 'image_features' (4D) and
    'targets'. `finish` caps epochs; `duration` caps wall-clock seconds;
    `update='rmsprop'` switches the step rule from Scale(0.1) to RMSProp.
    Extra keyword arguments are ignored.
    """
    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')
    y_hat = net.apply(x)
    #Cost
    cost_before = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    cost_before.name = "cost_without_regularization"
    #Error
    #Taken from brodesf
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = "Misclassification rate"
    #Regularization
    cg = ComputationGraph(cost_before)
    WS = VariableFilter(roles=[WEIGHT])(cg.variables)
    if dropout:
        print("Dropout")
        # NOTE(review): the dropout-transformed graph's output cost is never
        # used below — training optimizes the original `cost_before`, so this
        # branch appears to have no effect. Confirm intent before relying on it.
        cg = apply_dropout(cg, WS, 0.5)
    if L1:
        print("L1 with lambda ", L1)
        L1_reg = L1 * sum([abs(W).sum() for W in WS])
        L1_reg.name = "L1 regularization"
        cost_before += L1_reg
    if L2:
        print("L2 with lambda ", L2)
        L2_reg = L2 * sum([(W**2).sum() for W in WS])
        L2_reg.name = "L2 regularization"
        cost_before += L2_reg
    cost = cost_before
    cost.name = 'cost_with_regularization'
    #Initialization
    print("Initilization")
    net.initialize()
    #Algorithm
    step_rule = Scale(learning_rate=0.1)
    if update is not None:
        if update == "rmsprop":
            print("Using RMSProp")
            step_rule = RMSProp()
    # Guard against NaN/Inf steps regardless of the chosen step rule.
    remove_not_finite = RemoveNotFinite(0.9)
    step_rule = CompositeRule([step_rule, remove_not_finite])
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=step_rule)
    print("Extensions")
    extensions = []
    #Monitoring
    monitor = DataStreamMonitoring(variables=[cost, error],
                                   data_stream=test_stream, prefix="test")
    extensions.append(monitor)

    def filename(suffix=""):
        # Unique checkpoint path: job id (or PID) + timestamp + suffix.
        prefix = jobid if jobid else str(os.getpid())
        ctime = str(time.time())
        return "checkpoints/" + prefix + "_" + ctime + "_" + suffix + ".zip"

    #Serialization
    #serialization = Checkpoint(filename())
    #extensions.append(serialization)
    notification = "test_" + error.name
    track = TrackTheBest(notification)
    best_notification = track.notification_name
    checkpointbest = SaveBest(best_notification, filename("best"))
    extensions.extend([track, checkpointbest])
    if early_stopping:
        print("Early stopping")
        stopper = FinishIfNoImprovementAfterPlus(best_notification)
        extensions.append(stopper)
    #Other extensions
    if finish != None:
        print("Force finish ", finish)
        extensions.append(FinishAfter(after_n_epochs=finish))
    if duration != None:
        print("Stop after ", duration, " seconds")
        extensions.append(FinishAfterTime(duration))
    extensions.extend([Timing(), Printing()])
    #Main loop
    main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                         extensions=extensions)
    print("Main loop start")
    main_loop.run()
def main(job_id, params):
    """Train a joint left/right network from two pre-trained subnets.

    Reads hyperparameters from './params', optionally overrides some with
    Spearmint-supplied `params`, loads pre-trained left/right nets, stacks a
    joint MLP on their concatenated hidden outputs, trains with dropout and
    L2 regularization, and returns a Spearmint loss (validation error plus
    the train/validation gap).
    """
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    # NOTE(review): ConfigParser.get's third positional argument is `raw`,
    # not a default value — these numeric literals disable interpolation
    # rather than providing fallbacks. Confirm options exist in ./params.
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): reads 'valid_batch', not 'test_batch' — possibly a
    # copy-paste slip; confirm intended test batch size.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 16))
    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    # Solver: AdaGrad or RMSProp, both clipped at max_norm.
    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])

    # Hard-coded paths to the pickled pre-trained left/right nets.
    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    right_dim = 10519
    left_dim = 11427

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers:
    # Inputs -> hidden_1 -> hidden 2
    # Prefix child brick names so the two subnets don't collide.
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name
    # Alternating Linear / activation bricks of each pre-trained net.
    ll1 = lnet.children[0]
    lr1 = lnet.children[1]
    ll2 = lnet.children[2]
    lr2 = lnet.children[3]
    rl1 = rnet.children[0]
    rr1 = rnet.children[1]
    rl2 = rnet.children[2]
    rr2 = rnet.children[3]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> Logistic output
    output_mlp = MLP(activations=[
        Rectifier(name='h3'),
        Rectifier(name='h4'),
        Softmax(name='output'),
    ],
        dims=[
            input_dim,
            hidden_units,
            hidden_units,
            2,
        ],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(std=W_sd, mean=W_mu))
    output_mlp.initialize()

    # # Concatenate the inputs from the two hidden subnets into a single variable
    # # for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    #
    y_hat = output_mlp.apply(merge)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])
    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs
                      if input.name.startswith('linear_')]
    # Lighter dropout (0.2) on the first linear input, dropout_ratio on the rest.
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # If no fine-tuning of l-r models is wanted, find the params for only
    # the joint layers:
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        # NOTE(review): VariableFilter is applied to the graph object here,
        # not `cost_graph.variables` as elsewhere — confirm this works with
        # the Blocks version in use.
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning Algorithm:
    algo = GradientDescent(
        step_rule=solver_type,
        params=params_to_update,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples, batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([dropout_cost,
                                               aggregation.mean(error),
                                               aggregation.mean(
                                                   algo.total_gradient_norm)],
                                              after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples, batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples, batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot('AdniNet_LeftRight',
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
                    )

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            FinishIfNoImprovementAfter(notification_name='validation_error',
                                       epochs=1),
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()

    # Spearmint loss: validation error plus the train/validation gap.
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
# Finish initializing the MLP defined earlier, add the final 200 -> 10
# linear layer, and build the softmax cross-entropy loss / misclassification
# rate for both train and test graphs.
mlp.biases_init = Constant(0.0)
mlp.initialize()
lin = Linear(200, 10, use_bias=True)
lin.weights_init = Uniform(0.0, 0.01)
lin.biases_init = Constant(0.0)
lin.initialize()
# NOTE(review): train_out and test_out are built from identical expressions
# here — presumably a train-only transform (e.g. dropout) is applied to one
# of the graphs later; confirm downstream.
train_out = lin.apply(mlp.apply(flat_x))
test_out = lin.apply(mlp.apply(flat_x))
sm = Softmax(name='softmax')
# Training loss and error rate.
loss = sm.categorical_cross_entropy(flat_y, train_out).mean()
loss.name = 'nll'
misclass = MisclassificationRate().apply(flat_y, train_out)
misclass.name = 'misclass'
# Test-graph counterparts (same names, separate variables).
test_loss = sm.categorical_cross_entropy(flat_y, test_out).mean()
test_loss.name = 'nll'
test_misclass = MisclassificationRate().apply(flat_y, test_out)
test_misclass.name = 'misclass'
model = Model(loss)

######################
# Data
######################
import numpy
#from mnist import MNIST
from fuel.datasets.mnist import MNIST
from fuel.transformers import ScaleAndShift, ForceFloatX
def main(job_id, params, config_file='params.ec'):
    """Train an MLP on ADNI left/right-hemisphere features and return a
    Spearmint loss (validation error plus the train/validation gap).

    :param job_id: Spearmint job identifier (unused in the body).
    :param params: dict of Spearmint-proposed hyperparameters; overrides
        the config-file values when truthy.
    :param config_file: name of the config file under ./configs/.
    :return: float Spearmint loss ``ve + |te - ve|``.
    """
    config = ConfigParser.ConfigParser()
    config.readfp(open('./configs/{}'.format(config_file)))
    pr = pprint.PrettyPrinter(indent=4)
    pr.pprint(config)
    # NOTE(review): in Python 2, ConfigParser.get()'s third positional
    # argument is `raw`, not a default value — these calls rely on every
    # option being present in the file; confirm the intent.
    net_name = config.get('hyperparams', 'net_name', 'adni')
    struct_name = net_name.split('_')[0]
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): test_batch reads the 'valid_batch' option — possibly a
    # copy-paste slip, possibly intentional reuse; confirm.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(config.get('hyperparams',
                                           'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')
    # `input_dims` is a module-level table mapping structure name ->
    # per-side input dimensionality dict.
    input_dim = input_dims[struct_name]
    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]
    # Step rule: chosen optimizer composed with gradient norm clipping.
    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    data_file = config.get('hyperparams', 'data_file')
    # 'b' (both sides) concatenates left+right feature sources; otherwise
    # only the requested side's features are loaded.
    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        train = H5PYDataset(data_file, which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file, which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file, which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))
    y = tensor.lmatrix('targets')
    # Define a feed-forward net with an input, two hidden layers, and a softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
                dims=[
                    input_dim[side],
                    hidden_units,
                    hidden_units,
                    2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))
    # Don't forget to initialize params:
    model.initialize()
    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)
    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'
    # This is the model: before applying dropout
    model = Model(cost)
    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])
    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)
    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'
    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])
    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs
                      if input.name.startswith('linear_')]
    # First linear input gets the (usually smaller) input dropout rate,
    # the rest get the hidden-layer rate.
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'
    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(
        step_rule=solver_type,
        params=dropout_graph.parameters,
        cost=dropout_cost)
    # algo.step_rule.learning_rate.name = 'learning_rate'
    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))
    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)
    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))
    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)
    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))
    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)
    plotting = Plot('{}_{}'.format(net_name, side),
                    channels=[
                        ['dropout_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)
    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}/{}/{}'.format(struct_name, side,
                                                       stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)
    # Home-brewed class for early stopping when we detect we have started to overfit:
    # And by that I mean if the means of the val error and training error over the
    # previous 'epochs' is greater than the 'threshold', we are overfitting.
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.05,
                                        epochs=5,
                                        burn_in=100)
    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(
        data_stream=training_stream,
        model=model,
        algorithm=algo,
        extensions=[
            validation_monitor,
            training_monitor,
            plotting,
            FinishAfter(after_n_epochs=max_epoch),
            early_stopper,
            Printing(),
            ProgressBar(),
            checkpoint,
            test_monitor,
        ])
    main_loop.run()
    # Spearmint loss: validation error penalized by the train/val gap.
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Train a small MLP on the three-arm spiral toy dataset.

    Parameters
    ----------
    num_epochs : int, optional
        How many epochs to train for.
    batch_normalized : bool, optional
        If `True`, train on a batch-normalized copy of the graph while
        monitoring the original inference-mode graph.
    alpha : float, optional
        Exponential moving-average weight for the population statistics
        (each new batch statistic gets weight `alpha`).
    """
    # Choose the brick class; BatchNormalizedMLP accepts an extra flag
    # that trades a little memory for speed.
    if batch_normalized:
        net_cls = BatchNormalizedMLP
        net_kwargs = {'conserve_memory': False}
    else:
        net_cls = MLP
        net_kwargs = {}
    net = net_cls([Logistic(), Logistic(), Logistic(), Softmax()],
                  [2, 5, 5, 5, 3],
                  weights_init=IsotropicGaussian(0.2),
                  biases_init=Constant(0.),
                  **net_kwargs)
    net.initialize()

    # 3-arm spiral data: first 8000 examples train, last 2000 test.
    spiral_data = Spiral(num_examples=10000, classes=3,
                         sources=['features', 'label'],
                         noise=0.05)
    stream_train = DataStream(
        spiral_data,
        iteration_scheme=ShuffledScheme(examples=8000, batch_size=20))
    stream_test = DataStream(
        spiral_data,
        iteration_scheme=SequentialScheme(
            examples=list(range(8000, 10000)), batch_size=2000))

    # Inference-mode graph (BatchNormalization bricks default to using
    # population statistics here).
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = net.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    # Training-mode graph: swap in minibatch statistics and collect the
    # running-average updates for the population parameters.
    if not batch_normalized:
        cg = original_cg
        extra_updates = []
    else:
        cg = apply_batch_normalization(original_cg)
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    # Monitor the ORIGINAL cost/misclass so reported numbers come from the
    # inference-mode graph, not the training-mode one.
    loop = MainLoop(
        algorithm=algorithm,
        data_stream=stream_train,
        extensions=[
            DataStreamMonitoring([cost, misclass], stream_train,
                                 prefix='train'),
            DataStreamMonitoring([cost, misclass], stream_test,
                                 prefix='test'),
            Printing(),
            FinishAfter(after_n_epochs=num_epochs)
        ])
    loop.run()
    return loop
from blocks.bricks.cost import MisclassificationRate
# --- Two-layer (sigmoid -> softmax) classifier with L2 regularization. ---
x = tensor.matrix('features')
y = tensor.lmatrix('targets')
# Hidden layer: 126 input features -> 50 units, constant init.
lin1 = Linear(name='lin1', input_dim=126, output_dim=50,
              weights_init=Constant(0.005), biases_init=Constant(0))
act1_sigmoid = Logistic().apply(lin1.apply(x))
# Output layer: 50 -> 2 classes.
lin2 = Linear(name='lin2', input_dim=50, output_dim=2,
              weights_init=Constant(0.001), biases_init=Constant(0))
act2_softmax = Softmax().apply(lin2.apply(act1_sigmoid))
lin1.initialize()
lin2.initialize()
# Targets arrive one-hot (lmatrix); argmax recovers the class index for
# the misclassification metric.
missclass = MisclassificationRate().apply(y.argmax(axis=1), act2_softmax)
missclass.name = 'missclassification'
cost = CategoricalCrossEntropy().apply(y, act2_softmax)
comp_graph = ComputationGraph([cost])
# Two weight matrices (one per Linear brick); add L2 penalty on both.
W1, W2 = VariableFilter(roles=[WEIGHT])(comp_graph.variables)
cost = cost + 0.005 * (W1**2).sum() + 0.005 * (W2**2).sum()
cost.name = 'cost'
from blocks.algorithms import GradientDescent, Scale
from blocks.extensions import FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import DataStreamMonitoring
from fuel.transformers import Flatten
from fuel.streams import DataStream
def train(self, X, Y, idx_folds, hyper_params, model_prefix, verbose=False):
    """Train (or load) a linear softmax classifier over cross-validation
    folds and average the learned weights across folds.

    :param X: feature array; last axis is the feature dimension.
    :param Y: integer class labels (converted to one-hot below).
    :param idx_folds: iterable of (train_idx, valid_idx) index pairs.
    :param hyper_params: dict of classifier/pipeline hyperparameters.
    :param model_prefix: required prefix for the cached weights file.
    :param verbose: if True, add Printing/ProgressBar extensions.
    :return: (model, predict) — the Blocks Model and a compiled
        features -> argmax-class theano function.
    """
    import os
    from collections import OrderedDict
    from fuel.datasets import IndexableDataset
    from blocks.model import Model
    from blocks.bricks import Linear, Softmax
    from blocks.bricks.conv import MaxPooling
    from blocks.initialization import Uniform
    from deepthought.bricks.cost import HingeLoss
    import numpy as np
    import theano
    from theano import tensor

    assert model_prefix is not None
    fold_weights_filename = '{}_weights.npy'.format(model_prefix)

    # convert Y to one-hot encoding
    n_classes = len(set(Y))
    Y = np.eye(n_classes, dtype=int)[Y]

    features = tensor.matrix('features', dtype=theano.config.floatX)
    targets = tensor.lmatrix('targets')

    input_ = features
    dim = X.shape[-1]

    # optional additional layers
    if self.pipeline_factory is not None:
        # need to re-shape flattened input to restore bc01 format
        input_shape = (input_.shape[0],) + hyper_params['classifier_input_shape']  # tuple, uses actual batch size
        input_ = input_.reshape(input_shape)

        pipeline = self.pipeline_factory.build_pipeline(input_shape, hyper_params)
        input_ = pipeline.apply(input_)

        # flatten back to 2-D for the linear layer
        input_ = input_.flatten(ndim=2)

        # this is very hacky, but there seems to be no elegant way to obtain a value for dim
        dummy_fn = theano.function(inputs=[features], outputs=input_)
        dummy_out = dummy_fn(X[:1])
        dim = dummy_out.shape[-1]

    if hyper_params['classifier_pool_width'] > 1:
        # FIXME: this is probably broken!
        # c = hyper_params['num_components']
        # input_ = input_.reshape((input_.shape[0], c, input_.shape[-1] // c, 1))  # restore bc01
        # need to re-shape flattened input to restore bc01 format
        input_shape = hyper_params['classifier_pool_input_shape']  # tuple
        input_ = input_.reshape(input_shape)
        pool = MaxPooling(name='pool',
                          input_dim=input_shape[1:],  # (c, X.shape[-1] // c, 1),
                          pooling_size=(hyper_params['classifier_pool_width'], 1),
                          step=(hyper_params['classifier_pool_stride'], 1))
        input_ = pool.apply(input_)
        input_ = input_.reshape((input_.shape[0],
                                 tensor.prod(input_.shape[1:])))
        dim = np.prod(pool.get_dim('output'))

    # Bias-free linear scorer + softmax; weights are what get averaged
    # across folds at the end.
    linear = Linear(name='linear', input_dim=dim, output_dim=n_classes,
                    weights_init=Uniform(mean=0, std=0.01), use_bias=False)
    linear.initialize()
    softmax = Softmax('softmax')
    probs = softmax.apply(linear.apply(input_))
    prediction = tensor.argmax(probs, axis=1)

    model = Model(probs)  # classifier with raw probability outputs
    predict = theano.function([features], prediction)  # ready-to-use predict function

    if os.path.isfile(fold_weights_filename):
        # load filter weights from existing file
        fold_weights = np.load(fold_weights_filename)
        print 'loaded filter weights from', fold_weights_filename
    else:
        # train model
        from blocks.bricks.cost import MisclassificationRate
        from blocks.filter import VariableFilter
        from blocks.graph import ComputationGraph
        from blocks.roles import WEIGHT
        from blocks.bricks import Softmax
        from blocks.model import Model
        from blocks.algorithms import GradientDescent, Adam
        from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar
        from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring
        from blocks.extensions.predicates import OnLogRecord
        from fuel.streams import DataStream
        from fuel.schemes import SequentialScheme, ShuffledScheme
        from blocks.monitoring import aggregation
        from blocks.main_loop import MainLoop
        from blocks.extensions.training import TrackTheBest
        from deepthought.extensions.parameters import BestParams
        # from deepthought.datasets.selection import DatasetMetaDB

        # Snapshot the fresh initialization so every fold starts identical.
        init_param_values = model.get_parameter_values()

        cost = HingeLoss().apply(targets, probs)
        # Note: this requires just the class labels, not in a one-hot encoding
        error_rate = MisclassificationRate().apply(targets.argmax(axis=1), probs)
        error_rate.name = 'error_rate'

        cg = ComputationGraph([cost])

        # L1 regularization
        if hyper_params['classifier_l1wdecay'] > 0:
            weights = VariableFilter(roles=[WEIGHT])(cg.variables)
            cost = cost + hyper_params['classifier_l1wdecay'] * sum(
                [abs(W).sum() for W in weights])

        cost.name = 'cost'

        # iterate over trial folds
        fold_weights = []
        fold_errors = []
        # for ifi, ifold in fold_generator.get_inner_cv_folds(outer_fold):
        #
        #     train_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['train'])
        #     valid_selectors = fold_generator.get_fold_selectors(outer_fold=outer_fold, inner_fold=ifold['valid'])
        #
        #     metadb = DatasetMetaDB(meta, train_selectors.keys())
        #
        #     # get selected trial IDs
        #     train_idx = metadb.select(train_selectors)
        #     valid_idx = metadb.select(valid_selectors)
        for train_idx, valid_idx in idx_folds:
            # print train_idx
            # print valid_idx
            trainset = IndexableDataset(indexables=OrderedDict(
                [('features', X[train_idx]), ('targets', Y[train_idx])]))
            validset = IndexableDataset(indexables=OrderedDict(
                [('features', X[valid_idx]), ('targets', Y[valid_idx])]))

            # Reset to the shared initialization before each fold.
            model.set_parameter_values(init_param_values)

            best_params = BestParams()
            best_params.add_condition(['after_epoch'],
                                      predicate=OnLogRecord('error_rate_valid_best_so_far'))

            algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                        step_rule=Adam())

            extensions = [Timing(),
                          FinishAfter(after_n_epochs=hyper_params['classifier_max_epochs']),
                          DataStreamMonitoring(
                              [cost, error_rate],
                              DataStream.default_stream(
                                  validset,
                                  iteration_scheme=SequentialScheme(
                                      validset.num_examples,
                                      hyper_params['classifier_batch_size'])),
                              suffix="valid"),
                          TrainingDataMonitoring(
                              [cost, error_rate,
                               aggregation.mean(algorithm.total_gradient_norm)],
                              suffix="train",
                              after_epoch=True),
                          TrackTheBest('error_rate_valid'),
                          best_params  # after TrackTheBest!
                          ]

            if verbose:
                extensions.append(Printing())  # optional
                extensions.append(ProgressBar())

            main_loop = MainLoop(
                algorithm,
                DataStream.default_stream(
                    trainset,
                    iteration_scheme=ShuffledScheme(
                        trainset.num_examples,
                        hyper_params['classifier_batch_size'])),
                model=model,
                extensions=extensions)

            main_loop.run()

            # Keep the best-on-validation linear weights for this fold.
            fold_weights.append(best_params.values['/linear.W'])
            fold_errors.append(main_loop.status['best_error_rate_valid'])

            # break # FIXME

        fold_errors = np.asarray(fold_errors).squeeze()
        print 'simple NN fold classification errors:', fold_errors

        fold_weights = np.asarray(fold_weights)

        # store filter weights for later analysis
        np.save(fold_weights_filename, fold_weights)

    # Final classifier uses the fold-averaged weight matrix.
    weights = fold_weights.mean(axis=0)
    linear.parameters[0].set_value(weights)

    return model, predict
def train(train_set, test_set, l2_weight=1e-18):
    """Train a two-layer multi-target classifier with RMSProp.

    :param train_set: Fuel dataset providing 'features'/'targets'.
    :param test_set: Fuel dataset used for monitoring.
    :param l2_weight: L2 penalty applied to every weight matrix.
    :return: (x, y_hat) — input symbol and predicted-probability symbol,
        for compiling prediction functions downstream.
    """
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    n_classifiers = 3
    n_classes = 2
    l1 = Linear(
        name='l1',
        input_dim=2,
        output_dim=10,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
    )
    l1.initialize()
    h1 = Logistic().apply(l1.apply(x))
    # BUG FIX: this brick was also named 'l1', duplicating the first
    # layer's name (duplicate sibling brick names collide in Blocks'
    # parameter paths and monitoring channels).
    l2 = Linear(
        name='l2',
        input_dim=l1.output_dim,
        output_dim=n_classes * n_classifiers,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
    )
    l2.initialize()
    # Keep the brick and its output under separate names; the original
    # rebound `l2` to its own output, shadowing the brick.
    h2 = l2.apply(h1)
    # One softmax per target: n_classifiers independent n_classes heads.
    y_hat = MultiTargetSoftmax().apply(h2, n_classes, n_classifiers)
    cost = MultiTargetCategoricalCrossEntropy().apply(y, y_hat)
    error = MisclassificationRate().apply(y, y_hat)
    error.name = 'misclassification_rate'
    cg = ComputationGraph(cost)
    # L2 weight decay over every weight matrix in the graph.
    for w in VariableFilter(roles=[WEIGHT])(cg.variables):
        cost += l2_weight * (w ** 2).sum()
    cost.name = 'cost_with_regularization'
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=RMSProp()
    )
    data_stream_train = Flatten(
        DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples,
                                            batch_size=80)
        )
    )
    data_stream_test = Flatten(
        DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme(test_set.num_examples,
                                              batch_size=1)
        )
    )
    monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=data_stream_test,
        prefix="test"
    )
    main_loop = MainLoop(
        data_stream=data_stream_train,
        algorithm=algorithm,
        extensions=[
            monitor,
            FinishAfter(after_n_epochs=100),
            Printing(),
        ]
    )
    main_loop.run()
    return x, y_hat
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.
    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.
    alpha : float, optional
        Weight to apply to a new sample when calculating running
        averages for population statistics (1 - alpha weight is given
        to the existing average).
    """
    if batch_normalized:
        # Add an extra keyword argument that only BatchNormalizedMLP takes,
        # in order to speed things up at the cost of a bit of extra memory.
        mlp_class = BatchNormalizedMLP
        extra_kwargs = {'conserve_memory': False}
    else:
        mlp_class = MLP
        extra_kwargs = {}
    mlp = mlp_class([Logistic(), Logistic(), Logistic(), Softmax()],
                    [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.), **extra_kwargs)
    mlp.initialize()

    # Generate a dataset with 3 spiral arms, using 8000 examples for
    # training and 2000 for testing.
    dataset = Spiral(num_examples=10000, classes=3,
                     sources=['features', 'label'],
                     noise=0.05)
    train_stream = DataStream(dataset,
                              iteration_scheme=ShuffledScheme(examples=8000,
                                                              batch_size=20))
    test_stream = DataStream(dataset,
                             iteration_scheme=SequentialScheme(
                                 examples=list(range(8000, 10000)),
                                 batch_size=2000))

    # Build a cost graph; this contains BatchNormalization bricks that will
    # by default run in inference mode.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = mlp.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    original_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        # Replace inference-mode batch norm with minibatch statistics for
        # the training graph.
        cg = apply_batch_normalization(original_cg)
        # Add updates for population parameters: exponential moving
        # average with weight `alpha` on the fresh minibatch statistic.
        pop_updates = get_batch_normalization_updates(cg)
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
    else:
        cg = original_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train_stream,
                         # Use the original cost and misclass variables so
                         # that we monitor the (original) inference-mode graph.
                         extensions=[DataStreamMonitoring([cost, misclass],
                                                          train_stream,
                                                          prefix='train'),
                                     DataStreamMonitoring([cost, misclass],
                                                          test_stream,
                                                          prefix='test'),
                                     Printing(),
                                     FinishAfter(after_n_epochs=num_epochs)])
    main_loop.run()
    return main_loop
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    """Train a linear scorer on MNIST under one of several loss functions
    and dump a JSON summary of train/test metrics to stdout.

    :param save_to: checkpoint path.
    :param cost_name: one of 'lr', 'mse', 'perceptron', 'minmin',
        'minmin_cut', 'minmin2', 'direct', 'svm'.
    :param learning_rate: SGD learning rate; defaults to 1e-4 when None.
    :param momentum: momentum coefficient; defaults to 0.0 when None.
    :param num_epochs: number of training epochs.
    :raises ValueError: if `cost_name` is not recognized.
    """
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)
    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    # One-hot target scores: 1 at the ground-truth class, 0 elsewhere.
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
    score_diff = scores - target_scores
    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff ** 2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0,
                               score_diff[indices,
                                          scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]),
                            y.flatten()]) ** 2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]),
                             scores.argmax(axis=1)]) ** 2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (- scores[indices,
                         (scores + epsilon * target_scores).argmax(axis=1)]
                + scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)]
                - scores[indices, y.flatten()]).mean()
    else:
        # BUG FIX: the original raised with `cost`, a name that is never
        # assigned on this branch (NameError would mask the real error);
        # report the offending cost *name*.
        raise ValueError("Unknown cost " + cost_name)
    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'
    cg = ComputationGraph([cost])
    cost.name = 'cost'
    mnist_train = MNIST(("train",))
    mnist_test = MNIST(("test",))
    # Identity comparison for None (the original used `== None`).
    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate,
                    momentum=momentum)
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=rule)
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  DataStreamMonitoring(
                      [cost, error_rate],
                      Flatten(
                          DataStream.default_stream(
                              mnist_test,
                              iteration_scheme=SequentialScheme(
                                  mnist_test.num_examples, 500)),
                          which_sources=('features',)),
                      prefix="test"),
                  # CallbackExtension(
                  #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
                  #    after_epoch=True),
                  TrainingDataMonitoring(
                      [cost, error_rate,
                       aggregation.mean(algorithm.total_gradient_norm),
                       rule.learning_rate],
                      prefix="train",
                      after_epoch=True),
                  Checkpoint(save_to),
                  Printing()]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(Plot(
            'MNIST example',
            channels=[
                ['test_cost',
                 'test_error_rate'],
                ['train_total_gradient_norm']]))

    main_loop = MainLoop(
        algorithm,
        Flatten(
            DataStream.default_stream(
                mnist_train,
                iteration_scheme=SequentialScheme(
                    mnist_train.num_examples, 50)),
            which_sources=('features',)),
        model=Model(cost),
        extensions=extensions)

    main_loop.run()

    # Summarize the run as JSON on stdout (consumed by sweep scripts).
    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {'cost': cost_name,
           'learning_rate': learning_rate,
           'momentum': momentum,
           'train_cost': df.train_cost.iloc[-1],
           'test_cost': df.test_cost.iloc[-1],
           'best_test_cost': df.test_cost.min(),
           'train_error': df.train_error_rate.iloc[-1],
           'test_error': df.test_error_rate.iloc[-1],
           'best_test_error': df.test_error_rate.min()}
    res = {k: float(v) if isinstance(v, numpy.ndarray) else v
           for k, v in res.items()}
    json.dump(res, sys.stdout)
    sys.stdout.flush()
def main(save_to, cost_name, learning_rate, momentum, num_epochs):
    """Train a linear scorer on MNIST under one of several loss functions
    and dump a JSON summary of train/test metrics to stdout.

    :param save_to: checkpoint path.
    :param cost_name: one of 'lr', 'mse', 'perceptron', 'minmin',
        'minmin_cut', 'minmin2', 'direct', 'svm'.
    :param learning_rate: SGD learning rate; defaults to 1e-4 when None.
    :param momentum: momentum coefficient; defaults to 0.0 when None.
    :param num_epochs: number of training epochs.
    :raises ValueError: if `cost_name` is not recognized.
    """
    mlp = MLP([None], [784, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    scores = mlp.apply(x)
    batch_size = y.shape[0]
    indices = tensor.arange(y.shape[0])
    # One-hot target scores: 1 at the ground-truth class, 0 elsewhere.
    target_scores = tensor.set_subtensor(
        tensor.zeros((batch_size, 10))[indices, y.flatten()], 1)
    score_diff = scores - target_scores
    # Logistic Regression
    if cost_name == 'lr':
        cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean()
    # MSE
    elif cost_name == 'mse':
        cost = (score_diff**2).mean()
    # Perceptron
    elif cost_name == 'perceptron':
        cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean()
    # TLE
    elif cost_name == 'minmin':
        cost = abs(score_diff[indices, y.flatten()]).mean()
        cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean()
    # TLEcut
    elif cost_name == 'minmin_cut':
        # Score of the groundtruth should be greater or equal than its target score
        cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean()
        # Score of the prediction should be less or equal than its actual score
        cost += tensor.maximum(0, score_diff[indices,
                                             scores.argmax(axis=1)]).mean()
    # TLE2
    elif cost_name == 'minmin2':
        cost = ((score_diff[tensor.arange(y.shape[0]),
                            y.flatten()])**2).mean()
        cost += ((score_diff[tensor.arange(y.shape[0]),
                             scores.argmax(axis=1)])**2).mean()
    # Direct loss minimization
    elif cost_name == 'direct':
        epsilon = 0.1
        cost = (-scores[indices,
                        (scores + epsilon * target_scores).argmax(axis=1)]
                + scores[indices, scores.argmax(axis=1)]).mean()
        cost /= epsilon
    elif cost_name == 'svm':
        cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)]
                - scores[indices, y.flatten()]).mean()
    else:
        # BUG FIX: the original raised with `cost`, a name never assigned
        # on this branch (NameError would mask the real error); report the
        # offending cost *name* instead.
        raise ValueError("Unknown cost " + cost_name)
    error_rate = MisclassificationRate().apply(y.flatten(), scores)
    error_rate.name = 'error_rate'
    cg = ComputationGraph([cost])
    cost.name = 'cost'
    mnist_train = MNIST(("train", ))
    mnist_test = MNIST(("test", ))
    # Identity comparison for None (the original used `== None`).
    if learning_rate is None:
        learning_rate = 0.0001
    if momentum is None:
        momentum = 0.0
    rule = Momentum(learning_rate=learning_rate, momentum=momentum)
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=rule)
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        DataStreamMonitoring([cost, error_rate],
                             Flatten(DataStream.default_stream(
                                 mnist_test,
                                 iteration_scheme=SequentialScheme(
                                     mnist_test.num_examples, 500)),
                                     which_sources=('features', )),
                             prefix="test"),
        # CallbackExtension(
        #    lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9),
        #    after_epoch=True),
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm),
            rule.learning_rate
        ],
                               prefix="train",
                               after_epoch=True),
        Checkpoint(save_to),
        Printing()
    ]

    if BLOCKS_EXTRAS_AVAILABLE:
        extensions.append(
            Plot('MNIST example',
                 channels=[['test_cost', 'test_error_rate'],
                           ['train_total_gradient_norm']]))

    main_loop = MainLoop(algorithm,
                         Flatten(DataStream.default_stream(
                             mnist_train,
                             iteration_scheme=SequentialScheme(
                                 mnist_train.num_examples, 50)),
                                 which_sources=('features', )),
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()

    # Summarize the run as JSON on stdout (consumed by sweep scripts).
    df = pandas.DataFrame.from_dict(main_loop.log, orient='index')
    res = {
        'cost': cost_name,
        'learning_rate': learning_rate,
        'momentum': momentum,
        'train_cost': df.train_cost.iloc[-1],
        'test_cost': df.test_cost.iloc[-1],
        'best_test_cost': df.test_cost.min(),
        'train_error': df.train_error_rate.iloc[-1],
        'test_error': df.test_error_rate.iloc[-1],
        'best_test_error': df.test_error_rate.min()
    }
    res = {
        k: float(v) if isinstance(v, numpy.ndarray) else v
        for k, v in res.items()
    }
    json.dump(res, sys.stdout)
    sys.stdout.flush()
# initialize_variables # for variable (M,S) in variables: # compute M and S in the whole data. if normalization == 'bn2': for m,s,var in statistics_list: var.tag.aggregation_scheme = MeanAndVariance(var, var.shape[0], axis = 0) init_mn, init_var = DatasetEvaluator([var]).evaluate(stream_train)[var.name] m.set_value(init_mn.astype(floatX)) s.set_value(sqrt(init_var).astype(floatX)) cost = CategoricalCrossEntropy().apply(y.flatten(), probs) cost.name = 'cost' error_rate = MisclassificationRate().apply(y.flatten(), probs) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) parameters = cg.parameters # add gradient descent to M,S if normalization == 'bn2': for m,s,var in statistics_list: parameters.extend([m,s]) algorithm = GradientDescent( cost=cost, parameters=parameters, step_rule=Adam(0.01)) #update the M and S with batch statistics alpha = 0.1 updates = []
def setup_model(configs): tensor5 = theano.tensor.TensorType(config.floatX, (False,) * 5) # shape: T x B x C x X x Y input_ = tensor5('features') tensor3 = theano.tensor.TensorType(config.floatX, (False,) * 3) locs = tensor3('locs') # shape: B x Classes target = T.ivector('targets') model = LSTMAttention( configs, weights_init=Glorot(), biases_init=Constant(0)) model.initialize() (h, c, location, scale, alpha, patch, downn_sampled_input, conved_part_1, conved_part_2, pre_lstm) = model.apply(input_, locs) model.location = location model.scale = scale model.alpha = location model.patch = patch classifier = MLP( [Rectifier(), Softmax()], configs['classifier_dims'], weights_init=Glorot(), biases_init=Constant(0)) classifier.initialize() probabilities = classifier.apply(h[-1]) cost = CategoricalCrossEntropy().apply(target, probabilities) cost.name = 'CE' error_rate = MisclassificationRate().apply(target, probabilities) error_rate.name = 'ER' model.cost = cost model.error_rate = error_rate model.probabilities = probabilities if configs['load_pretrained']: blocks_model = Model(model.cost) all_params = blocks_model.parameters with open('VGG_CNN_params.npz') as f: loaded = np.load(f) all_conv_params = loaded.keys() for param in all_params: if param.name in loaded.keys(): assert param.get_value().shape == loaded[param.name].shape param.set_value(loaded[param.name]) all_conv_params.pop(all_conv_params.index(param.name)) print "the following parameters did not match: " + str(all_conv_params) if configs['test_model']: print "TESTING THE MODEL: CHECK THE INPUT SIZE!" cg = ComputationGraph(model.cost) f = theano.function(cg.inputs, [model.cost], on_unused_input='ignore', allow_input_downcast=True) data = configs['get_streams'](configs[ 'batch_size'])[0].get_epoch_iterator().next() f(data[1], data[0], data[2]) print "Test passed! ;)" model.monitorings = [cost, error_rate] return model
def main(num_epochs=100):
    """Train a word-level recurrent language model on 'inspirational.txt'
    and periodically print Gumbel-max samples from the model.

    :param num_epochs: number of training epochs.
    """
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    # Transpose to time-major int32 token indices (T x B).
    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    # Sort sentences by length so padded batches waste less computation.
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))
    n_voc = len(train_dataset.dict.keys())
    # Empirical distribution of sentence-initial tokens, used to seed the
    # sampler below.
    init_probs = numpy.array(
        [sum(filter(lambda idx:idx == w,
                    [s[0] for s in train_dataset.indexables[
                        train_dataset.sources.index('features')]]
                    )) for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()
    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    # NOTE(review): these LSTM biases are computed but never used with the
    # SimpleRecurrent below — looks like a leftover from an LSTM variant.
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = SimpleRecurrent(
        dim=n_h,
        activation=Tanh(),
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()
    # Embed all tokens but the last; mask out padding positions.
    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0]
    )
    # Next-token prediction: compare positions 1..T against outputs for
    # 0..T-1, restricted to non-padding positions.
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'
    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=Adam()
    )
    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )
    model = Model(cost)
    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))
    # --- Sampling graph: draw fixed-length sequences via the Gumbel-max
    # trick (argmax of log-probs plus Gumbel noise == categorical sample).
    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
    # First token sampled from the empirical sentence-start distribution.
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)

    def sampling_step(g_noise, states, samples_step):
        # One step: embed previous sample, advance the RNN, draw the next
        # token with fresh Gumbel noise.
        embedding_step = linear_embedding.apply(samples_step)
        next_states = rnn.apply(inputs=embedding_step,
                                states=states,
                                iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)
        return next_states, next_samples

    [_, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_samples]
    )
    # NOTE(review): `samples.owner.inputs[0]` digs into the scan graph to
    # grab the full sample sequence — fragile across Theano versions;
    # confirm it still yields the intended (B x T) output.
    sampling = theano.function([], samples.owner.inputs[0].T)
    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))
    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))
    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def main(job_id, params):
    """Train a single-side ('l'/'r') or both-sides ('b') ADNI feed-forward net.

    Reads hyperparameters from the './params' config file, optionally
    overriding a subset with Spearmint-proposed `params`, builds a
    two-hidden-layer Rectifier/Softmax MLP with input + hidden dropout and
    L2 weight decay, trains it with overfitting-based early stopping, and
    returns a scalar loss for Spearmint to minimize.

    :param job_id: Spearmint job identifier (unused in the body).
    :param params: dict of Spearmint proposals (values are length-1 arrays),
        or falsy to use config-file values only.
    :return: validation error + |'error' channel - validation error| taken
        from the last epoch's log row.
    """
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    # NOTE(review): ConfigParser.get()'s third positional argument is `raw`,
    # not a default value -- these numeric "defaults" are silently treated as
    # a truthy `raw` flag. Verify every option actually exists in ./params.
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): reads 'valid_batch' again -- presumably a copy-paste slip
    # for 'test_batch'; confirm against the config file before changing.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(
        config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    # Spearmint optimization parameters: override the config values when a
    # proposal dict is passed in (each value arrives as a length-1 array).
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    # Gradient step rule: AdaGrad or RMSProp, both followed by norm clipping.
    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    # Input width per side; 'b' (both) concatenates left + right features.
    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}
    data_file = config.get('hyperparams', 'data_file')
    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        # Single-side run: restrict the HDF5 sources to that side only.
        train = H5PYDataset(data_file, which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file, which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file, which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))
    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a
    # softmax output:
    model = MLP(activations=[
        Rectifier(name='h1'),
        Rectifier(name='h2'),
        Softmax(name='output'),
    ],
        dims=[
            input_dim[side],
            hidden_units,
            hidden_units,
            2],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs: first linear input gets the input dropout
    # ratio, the remaining hidden-layer inputs get the hidden ratio.
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning Algorithm (notice: we use the dropout cost for learning):
    algo = GradientDescent(
        step_rule=solver_type,
        params=dropout_graph.parameters,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ], after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    # Test error is evaluated once, after training finishes.
    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[
                        ['dropout_entropy', 'validation_entropy'],
                        ['error', 'validation_error'],
                    ],
                    after_batch=False)

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect we have started
    # to overfit
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             early_stopper,
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    # Spearmint minimizes: validation error plus the gap to the 'error'
    # channel (presumably the training error -- confirm) from the last epoch.
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
from blocks.bricks import Linear, Logistic, Softmax # In[10]: hidden_layer_size = 100 input_to_hidden = Linear(name='input_to_hidden', input_dim=117, output_dim=hidden_layer_size) h = Logistic().apply(input_to_hidden.apply(x)) hidden_to_output = Linear(name='hidden_to_output', input_dim=hidden_layer_size, output_dim=2) y_hat = Softmax().apply(hidden_to_output.apply(h)) y = tensor.lmatrix('targets') from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate cost = CategoricalCrossEntropy().apply(y, y_hat) error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat) error_rate.name = "error_rate" # >>> from blocks.roles import WEIGHT from blocks.graph import ComputationGraph # >>> from blocks.filter import VariableFilter cg = ComputationGraph(cost) # >>> W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) # >>> cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum() # >>> cost.name = 'cost_with_regularization' cost.name = 'cost_simple_xentropy' from blocks.initialization import IsotropicGaussian, Constant input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(0.01) input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0) input_to_hidden.initialize() hidden_to_output.initialize()
def main(job_id, params):
    """Train the joint left+right ADNI net on top of two pre-trained subnets.

    Loads pre-trained left and right feed-forward nets from checkpoint
    files, stacks a fresh Rectifier/Rectifier/Softmax MLP on their
    concatenated hidden representations, and trains with dropout and L2
    weight decay -- optionally fine-tuning the pre-trained layers. Returns
    a scalar loss for Spearmint hyperparameter search.

    :param job_id: Spearmint job identifier (unused in the body).
    :param params: dict of Spearmint proposals (values are length-1 arrays),
        or falsy to use config-file values only.
    :return: validation error + |'error' channel - validation error| taken
        from the last epoch's log row.
    """
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    # NOTE(review): ConfigParser.get()'s third positional argument is `raw`,
    # not a default -- these literals act only as a truthy `raw` flag.
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # NOTE(review): reads 'valid_batch' -- presumably meant 'test_batch'.
    test_batch = int(config.get('hyperparams', 'valid_batch', 512))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 16))
    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    # NOTE(review): b_sd/b_mu are read but never used below -- the bias init
    # of output_mlp uses W_sd/W_mu; confirm whether that is intended.
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    fine_tune = config.getboolean('hyperparams', 'fine_tune')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    # Gradient step rule: AdaGrad or RMSProp, both followed by norm clipping.
    if 'adagrad' in solver:
        solver_type = CompositeRule([
            AdaGrad(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])
    else:
        solver_type = CompositeRule([
            RMSProp(learning_rate=base_lr),
            VariableClipping(threshold=max_norm)
        ])

    # Checkpoints of the pre-trained single-side nets.
    rn_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/rnet/2015-06-25-18:13'
    ln_file = '/projects/francisco/repositories/NI-ML/models/deepnets/blocks/ff/models/lnet/2015-06-29-11:45'

    # NOTE(review): right_dim/left_dim are unused below.
    right_dim = 10519
    left_dim = 11427

    train = H5PYDataset(data_file, which_set='train')
    valid = H5PYDataset(data_file, which_set='valid')
    test = H5PYDataset(data_file, which_set='test')

    l_x = tensor.matrix('l_features')
    r_x = tensor.matrix('r_features')
    y = tensor.lmatrix('targets')

    # The top brick of each serialized model is the side's MLP.
    lnet = load(ln_file).model.get_top_bricks()[0]
    rnet = load(rn_file).model.get_top_bricks()[0]

    # Pre-trained layers:
    # Inputs -> hidden_1 -> hidden 2
    # Prefix child-brick names with their side so the two subnets' names
    # don't collide in the merged graph.
    for side, net in zip(['l', 'r'], [lnet, rnet]):
        for child in net.children:
            child.name = side + '_' + child.name

    # Children are assumed to be [linear1, activation1, linear2, activation2]
    # per side (inferred from the apply chain below -- TODO confirm).
    ll1 = lnet.children[0]
    lr1 = lnet.children[1]
    ll2 = lnet.children[2]
    lr2 = lnet.children[3]

    rl1 = rnet.children[0]
    rr1 = rnet.children[1]
    rl2 = rnet.children[2]
    rr2 = rnet.children[3]

    l_h = lr2.apply(ll2.apply(lr1.apply(ll1.apply(l_x))))
    r_h = rr2.apply(rl2.apply(rr1.apply(rl1.apply(r_x))))

    input_dim = ll2.output_dim + rl2.output_dim

    # hidden_2 -> hidden_3 -> hidden_4 -> Softmax output
    output_mlp = MLP(activations=[
        Rectifier(name='h3'),
        Rectifier(name='h4'),
        Softmax(name='output'),
    ],
        dims=[
            input_dim,
            hidden_units,
            hidden_units,
            2,
        ],
        weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
        biases_init=IsotropicGaussian(std=W_sd, mean=W_mu))
    output_mlp.initialize()

    # # Concatenate the inputs from the two hidden subnets into a single variable
    # # for input into the next layer.
    merge = tensor.concatenate([l_h, r_h], axis=1)
    #
    y_hat = output_mlp.apply(merge)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model: before applying dropout
    model = Model(cost)

    # Need to define the computation graph for the cost func:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # computational graph with l2 reg
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [
        input for input in inputs if input.name.startswith('linear_')
    ]
    # NOTE(review): input dropout is hard-coded at 0.2 here, unlike the
    # single-side script which reads input_dropout_ratio from the config.
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]], 0.2)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # If no fine-tuning of l-r models is wanted, find the params for only
    # the joint layers:
    if fine_tune:
        params_to_update = dropout_graph.parameters
    else:
        # NOTE(review): this filters `cost_graph` directly rather than
        # `cost_graph.variables` -- confirm VariableFilter accepts a
        # ComputationGraph argument here.
        params_to_update = VariableFilter(
            [PARAMETER], bricks=output_mlp.children)(cost_graph)

    # Learning Algorithm:
    algo = GradientDescent(
        step_rule=solver_type,
        params=params_to_update,
        cost=dropout_cost)

    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(
                train.num_examples,
                batch_size=train_batch)))

    training_monitor = TrainingDataMonitoring([
        dropout_cost,
        aggregation.mean(error),
        aggregation.mean(algo.total_gradient_norm)
    ], after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(
                valid.num_examples,
                batch_size=valid_batch)))

    validation_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=validation_stream,
        prefix='validation',
        after_epoch=True)

    # Test error is evaluated once, after training finishes.
    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(
                test.num_examples,
                batch_size=test_batch)))

    test_monitor = DataStreamMonitoring(
        variables=[error],
        data_stream=test_stream,
        prefix='test',
        after_training=True)

    plotting = Plot(
        'AdniNet_LeftRight',
        channels=[
            ['dropout_entropy'],
            ['error', 'validation_error'],
        ],
    )

    # Checkpoint class used to save model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}'.format(stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # The main loop will train the network and output reports, etc
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[
                             validation_monitor,
                             training_monitor,
                             plotting,
                             FinishAfter(after_n_epochs=max_epoch),
                             # NOTE(review): Blocks' FinishIfNoImprovementAfter
                             # expects a TrackTheBest-style notification record;
                             # 'validation_error' is a raw monitoring channel --
                             # verify this actually triggers as intended.
                             FinishIfNoImprovementAfter(
                                 notification_name='validation_error',
                                 epochs=1),
                             Printing(),
                             ProgressBar(),
                             checkpoint,
                             test_monitor,
                         ])
    main_loop.run()

    # Spearmint minimizes: validation error plus the gap to the 'error'
    # channel (presumably the training error -- confirm) from the last epoch.
    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
def train_paired_dnn(train_x, train_y, dev_x, dev_y, test_x, test_y): train_y = train_y.flatten().astype(int) dev_y = dev_y.flatten().astype(int) test_y = test_y.flatten().astype(int) batch_size = 256 n_train, in_dim = train_x.shape n_dev = dev_x.shape[0] n_test = test_x.shape[0] hid_dims = 2 * np.array([512, 512, 512, 512]) out_dim = 1 ds_train = make_ds(train_x, train_y, batch_size, n_train, SequentialScheme) ds_dev = make_ds(dev_x, dev_y, batch_size, n_dev, SequentialScheme) ds_test = make_ds(test_x, test_y, batch_size, n_test, SequentialScheme) mlp = MLP( activations=[Rectifier(), Rectifier(), Rectifier(), Rectifier(), Logistic()], dims=[in_dim, hid_dims[0], hid_dims[1], hid_dims[2], hid_dims[3], out_dim], weights_init=Uniform(mean=0, width=1/32), biases_init=Constant(0) ) mlp.initialize() x = tensor.matrix('features') y = tensor.matrix('targets', dtype='int64') y_hat = mlp.apply(x) model = Model(y_hat) cost = MyBinaryCrossEntropy().apply(y, y_hat) cost.name = 'cost' misrate = MisclassificationRate().apply(y.flatten(), y_hat) misrate.name = 'misclassfication' cg = ComputationGraph([cost, misrate]) drop_vars = VariableFilter( roles=[INPUT], bricks=mlp.linear_transformations[1:] )(cg.variables) cg_dropout = apply_dropout(cg, drop_vars, 0.2) cost_dropout, error_rate_dropout = cg_dropout.outputs learning_rate = 0.0015 momentum = 0.9 step_rule = CompositeRule([ Momentum(learning_rate=learning_rate, momentum=momentum), AdaGrad(learning_rate=learning_rate) ]) algorithm = GradientDescent(cost=cost_dropout, parameters=cg.parameters, step_rule=step_rule) monitor_train = TrainingDataMonitoring( variables=[cost_dropout, error_rate_dropout, aggregation.mean(algorithm.total_gradient_norm)], after_epoch=True, prefix="train" ) monitor_dev = DataStreamMonitoring( # variables=[cost_dropout, error_rate_dropout], variables=[cost, misrate], data_stream=ds_dev, prefix="dev" ) monitor_test = DataStreamMonitoring( # variables=[cost_dropout, error_rate_dropout], variables=[cost, 
misrate], data_stream=ds_test, prefix="test" ) track_str = 'train_{0}'.format(cost_dropout.name) track_best_str = '{0}_best_so_far'.format(track_str) print track_str, track_best_str n_epochs = 2 print 'n_epochs:', n_epochs main_loop = MainLoop( model=model, data_stream=ds_train, algorithm=algorithm, extensions=[Timing(), monitor_train, monitor_dev, monitor_test, TrackTheBest(track_str), Checkpoint("best_model.pkl", use_cpickle = True ).add_condition(['after_epoch'], predicate=OnLogRecord(track_best_str)), FinishAfter(after_n_epochs=n_epochs), # FinishIfNoImprovementAfter(track_best_str, epochs=n_epochs), Printing()] ) main_loop.run() acc([x], y_hat, train_x, train_y, 'train') acc([x], y_hat, dev_x, dev_y, 'dev') acc([x], y_hat, test_x, test_y, 'test')
# Hand-built 117 -> 100 -> 2 MLP: logistic hidden layer, softmax output.
# NOTE(review): `x` is defined earlier in this script (not in this chunk).
hidden_layer_size = 100
input_to_hidden = Linear(name='input_to_hidden', input_dim=117,
                         output_dim=hidden_layer_size)
h = Logistic().apply(input_to_hidden.apply(x))
hidden_to_output = Linear(name='hidden_to_output',
                          input_dim=hidden_layer_size, output_dim=2)
y_hat = Softmax().apply(hidden_to_output.apply(h))
y = tensor.lmatrix('targets')
from blocks.bricks.cost import CategoricalCrossEntropy, MisclassificationRate
cost = CategoricalCrossEntropy().apply(y, y_hat)
error_rate = MisclassificationRate().apply(y.argmax(axis=1), y_hat)
error_rate.name = "error_rate"
# >>> from blocks.roles import WEIGHT
from blocks.graph import ComputationGraph
# >>> from blocks.filter import VariableFilter
cg = ComputationGraph(cost)
# >>> W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
# >>> cost = cost + 0.005 * (W1 ** 2).sum() + 0.005 * (W2 ** 2).sum()
# >>> cost.name = 'cost_with_regularization'
cost.name = 'cost_simple_xentropy'
from blocks.initialization import IsotropicGaussian, Constant
input_to_hidden.weights_init = hidden_to_output.weights_init = IsotropicGaussian(
    0.01)
input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0)
input_to_hidden.initialize()
# BUG FIX: the output layer was never initialized -- the near-identical
# cell earlier in this file initializes both bricks. Without this call
# hidden_to_output's parameters never receive their init values.
hidden_to_output.initialize()
def main(save_to, num_epochs, flag, ksize):
    """Train a SimpleRecurrent net on MNIST sequences produced by `_meanize`.

    Images pass through a linear (Identity) projection into a 100-unit
    tanh SimpleRecurrent layer; the final hidden state is classified into
    the 10 digit classes by a Rectifier+Softmax MLP.

    :param save_to: basename for the best-model pickle ('best_<save_to>.pkl').
    :param num_epochs: number of training epochs before stopping.
    :param flag: forwarded to `_meanize` (defined elsewhere in this file).
    :param ksize: forwarded to `_meanize` (defined elsewhere in this file).
    """
    batch_size = 128
    dim = 100      # recurrent state size
    n_steps = 20   # sequence length passed to _meanize

    # Linear (Identity-activation) input projection 784 -> dim.
    i2h1 = MLP([Identity()], [784, dim],
               biases_init=Constant(0.),
               weights_init=IsotropicGaussian(.001))
    # Readout: dim -> dim rectified -> 10-way softmax.
    h2o1 = MLP([Rectifier(), Softmax()], [dim, dim, 10],
               biases_init=Constant(0.),
               weights_init=IsotropicGaussian(.001))
    rec1 = SimpleRecurrent(dim=dim, activation=Tanh(),
                           weights_init=Orthogonal())
    i2h1.initialize()
    h2o1.initialize()
    rec1.initialize()

    # assumes x is (time, batch, features) as produced by _meanize -- TODO confirm
    x = tensor.tensor3('features')
    y = tensor.lmatrix('targets')
    preproc = i2h1.apply(x)
    h1 = rec1.apply(preproc)
    # Classify from the hidden state at the last time step.
    probs = tensor.flatten(h2o1.apply(h1[-1],), outdim=2)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)
    cost.name = 'final_cost'
    error_rate.name = 'error_rate'
    cg = ComputationGraph([cost, error_rate])

    # 50k/10k train/valid split of the MNIST training set, plus the test set.
    mnist_train = MNIST("train", subset=slice(0, 50000))
    mnist_valid = MNIST("train", subset=slice(50000, 60000))
    mnist_test = MNIST("test")
    trainstream = Mapping(Flatten(DataStream(mnist_train,
                          iteration_scheme=SequentialScheme(50000,
                                                            batch_size))),
                          _meanize(n_steps, flag, ksize))
    validstream = Mapping(Flatten(DataStream(mnist_valid,
                                             iteration_scheme=SequentialScheme(
                                                 10000,
                                                 batch_size))),
                          _meanize(n_steps, flag, ksize))
    teststream = Mapping(Flatten(DataStream(mnist_test,
                                            iteration_scheme=SequentialScheme(
                                                10000,
                                                batch_size))),
                         _meanize(n_steps, flag, ksize))

    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([Adam(), StepClipping(100)]))
    # NOTE(review): the monitor over `validstream` is prefixed "test" while
    # the saving monitor over `teststream` is prefixed 'valid' -- the
    # stream/prefix pairing looks swapped; confirm which split is meant to
    # drive best-model selection before changing.
    main_loop = MainLoop(
        algorithm,
        trainstream,
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=num_epochs),
                    DataStreamMonitoring(
                        [cost, error_rate],
                        validstream,
                        prefix="test"),
                    DataStreamMonitoringAndSaving(
                        [cost, error_rate],
                        teststream,
                        [i2h1, h2o1, rec1],
                        'best_'+save_to+'.pkl',
                        cost_name=error_rate.name,
                        after_epoch=True,
                        prefix='valid'
                    ),
                    TrainingDataMonitoring(
                        [cost,
                         aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    # Plot(
                    #     save_to,
                    #     channels=[
                    #         ['test_final_cost',
                    #          'test_misclassificationrate_apply_error_rate'],
                    #         ['train_total_gradient_norm']]),
                    Printing()])
    main_loop.run()
def train(train_set, test_set):
    """Fit a tiny 2-3-2 MLP classifier and monitor it on the test stream.

    Builds a logistic hidden layer followed by a softmax output, adds a
    small L2 penalty on both weight matrices, and runs RMSProp gradient
    descent for 100 epochs, printing test-set cost and error each epoch.

    :param train_set: dataset with `num_examples`, streamed in shuffled
        minibatches of 4 for training.
    :param test_set: dataset streamed sequentially, one example at a time,
        for monitoring.
    """
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')

    # Input -> hidden projection (2 -> 3), squashed by a logistic unit.
    layer_in = Linear(
        name='input_to_hidden',
        input_dim=2,
        output_dim=3,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
    )
    layer_in.initialize()
    hidden = Logistic().apply(layer_in.apply(x))

    # Hidden -> output projection (3 -> 2), softmaxed into class probabilities.
    layer_out = Linear(
        name='hidden_to_output',
        input_dim=layer_in.output_dim,
        output_dim=2,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0)
    )
    layer_out.initialize()
    y_hat = Softmax().apply(layer_out.apply(hidden))

    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'misclassification_rate'

    # Pull both weight matrices out of the graph and add a tiny L2 penalty.
    graph = ComputationGraph(cost)
    w_hidden, w_output = VariableFilter(roles=[WEIGHT])(graph.variables)
    cost = cost + 1e-8 * (w_hidden ** 2).sum() + 1e-8 * (w_output ** 2).sum()
    cost.name = 'cost_with_regularization'

    print('W1', w_hidden.get_value())
    print('W2', w_output.get_value())

    algorithm = GradientDescent(
        cost=cost,
        parameters=graph.parameters,
        step_rule=RMSProp()
    )

    # Shuffled minibatches of 4 for training; sequential singletons for
    # evaluation.
    stream_train = Flatten(
        DataStream.default_stream(
            train_set,
            iteration_scheme=ShuffledScheme(train_set.num_examples,
                                            batch_size=4)
        )
    )
    stream_test = Flatten(
        DataStream.default_stream(
            test_set,
            iteration_scheme=SequentialScheme(test_set.num_examples,
                                              batch_size=1)
        )
    )

    test_monitor = DataStreamMonitoring(
        variables=[cost, error],
        data_stream=stream_test,
        prefix="test"
    )

    MainLoop(
        data_stream=stream_train,
        algorithm=algorithm,
        extensions=[
            test_monitor,
            FinishAfter(after_n_epochs=100),
            Printing(),
            # ProgressBar()
        ]
    ).run()