def run(discriminative_regularization=True):
    """Train a VAE on CelebA, optionally with discriminative regularization.

    Builds the training/monitoring streams and computation graphs, trains
    with Adam for 75 epochs, and checkpoints every 5 epochs to a file whose
    name records whether discriminative regularization was used.

    Parameters
    ----------
    discriminative_regularization : bool
        Forwarded to ``create_training_computation_graphs``; also selects
        the checkpoint filename.
    """
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    # Only the first three streams are needed here.
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    # EMA: population stat p <- decay * minibatch stat m + (1 - decay) * p.
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    # Train only the encoder/decoder bricks (plus the variance parameters);
    # any discriminator bricks in the graph are deliberately excluded.
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    # bn_cg.outputs[0] is the training cost (the batch-normalized graph).
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring: index 0 uses the batch-norm graph (train),
    # index 1 the population-statistics graph (valid).
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        # Graph outputs are assumed ordered (cost, KL, reconstruction);
        # NOTE(review): confirm against create_training_computation_graphs.
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False, before_first_epoch=False,
        every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    """Build (but do not run) a Blocks MainLoop training an RNN-RBM.

    Parameters
    ----------
    train : data stream used for training.
    rnnrbm : model brick providing ``cost(examples, mask, k)``.
    epochs : int, training length in epochs.
    test : optional data stream for test-set monitoring.
    bokeh : bool, whether to attach a live ``Plot`` extension.
    load_path : unused here — NOTE(review): accepted but never read.

    Returns
    -------
    MainLoop ready for ``.run()``.

    NOTE(review): ``x``, ``x_mask``, ``float32`` and ``rnnrbm_cdk`` are not
    defined in this function — presumably module-level globals (the symbolic
    input variables, a float32 cast helper, and a schedule dict mapping
    iteration -> CD-k). Verify they exist at module scope.
    """
    # Shared variables so the CD-k steps and learning rate can be
    # rescheduled mid-training by the SharedVariableModifier extensions.
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)
    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'
    model = Model(cost)
    cg = ComputationGraph([cost])
    # Clipping/NaN-removal both before and after Adam to keep steps finite.
    step_rule = CompositeRule(
        [RemoveNotFinite(), StepClipping(30.0), Adam(learning_rate=lr),
         StepClipping(6.0), RemoveNotFinite()])  # Scale(0.01)
    # The Gibbs sample is treated as a constant w.r.t. differentiation,
    # as required by the contrastive-divergence gradient.
    gradients = dict(equizip(cg.parameters,
                             T.grad(cost, cg.parameters,
                                    consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients,
                                cost=cost, params=cg.parameters)
    algorithm.add_updates(cg.updates)
    extensions = [
        # Bump CD-k according to the (module-level) rnnrbm_cdk schedule.
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        # Decay the learning rate by 0.78 every 1000 iterations.
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [cost, error_rate, mistake_rate, ],
            # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)], #+ params,
            prefix="train", after_epoch=False, every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()]
    if test is not None:
        # Monitor on the test stream with the same recurrent-state updates.
        extensions.append(DataStreamMonitoring(
            [cost, error_rate, mistake_rate],
            data_stream=test,
            updates=cg.updates,
            prefix="test", after_epoch=False, every_n_batches=40))
    if bokeh:
        extensions.append(Plot(
            'Training RNN-RBM',
            channels=[
                ['train_error on note as a whole',
                 'train_single error within note',
                 'test_error on note as a whole',
                 'test_single error within note'],
                ['train_final_cost'],
                # ['train_total_gradient_norm'],
            ]))
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions
                         )
    return main_loop
def test_gradient_descent_finds_inputs_additional_updates():
    """Inputs referenced only by post-construction updates must still be
    discovered by ``GradientDescent.initialize``."""
    weights = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    counter = shared_floatx(1)
    increment = tensor.scalar('m')
    # Provide the gradient mapping directly; no cost graph required.
    gradient_map = OrderedDict()
    gradient_map[weights] = weights + 1
    algorithm = GradientDescent(gradients=gradient_map)
    # The extra update introduces a brand-new input variable.
    algorithm.add_updates([(counter, counter + increment)])
    algorithm.initialize()
    assert increment in algorithm.inputs
# NOTE(review): byte-for-byte duplicate of the preceding
# test_gradient_descent_finds_inputs_additional_updates. At import time this
# definition silently shadows the first, so the test body only runs once.
# Presumably a copy/paste artifact — consider deleting one of the two copies.
def test_gradient_descent_finds_inputs_additional_updates():
    """``GradientDescent.initialize`` must include ``m`` among the algorithm
    inputs even though ``m`` appears only in an update added afterwards."""
    W = shared_floatx(numpy.array([[1, 2], [3, 4]]))
    n = shared_floatx(1)
    m = tensor.scalar('m')
    # Gradients supplied directly, so no cost graph is needed.
    algorithm = GradientDescent(gradients=OrderedDict([(W, W + 1)]))
    algorithm.add_updates([(n, n + m)])
    algorithm.initialize()
    assert m in algorithm.inputs
def create_main_loop(dataset, nvis, nhid, num_epochs, debug_level=0,
                     lrate=1e-3):
    """Build a Blocks MainLoop training a FivEM brick on ``dataset``.

    Parameters
    ----------
    dataset : Fuel dataset exposing ``num_examples`` and a "features" source.
    nvis, nhid : int, visible and hidden layer sizes of the FivEM brick.
    num_epochs : int, training length.
    debug_level : int, forwarded to the brick's ``debug`` flag.
    lrate : float, momentum-SGD learning rate.

    Returns
    -------
    MainLoop ready for ``.run()``.
    """
    # NOTE(review): `seed` is assigned but never used in this function.
    seed = 188229
    n_inference_steps = 6
    num_examples = dataset.num_examples
    # Full-batch training: one batch = the whole dataset.
    batch_size = num_examples
    train_loop_stream = Flatten(
        DataStream.default_stream(
            dataset=dataset,
            iteration_scheme=SequentialScheme(dataset.num_examples, batch_size)
            # Repeat(
            #     , n_inference_steps)
            # ShuffledScheme(dataset.num_examples, batch_size), n_inference_steps))
        ),
        which_sources=("features",),
    )

    model_brick = FivEM(
        nvis=nvis,
        nhid=nhid,
        epsilon=0.01,
        batch_size=batch_size,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0),
        noise_scaling=1,
        debug=debug_level,
        lateral_x=False,
        lateral_h=False,
        n_inference_steps=n_inference_steps,
    )
    model_brick.initialize()

    x = tensor.matrix("features")
    cost = model_brick.cost(x)
    computation_graph = ComputationGraph([cost])
    model = Model(cost)

    # step_rule = Adam(learning_rate=2e-5, beta1=0.1, beta2=0.001, epsilon=1e-8,
    #                  decay_factor=(1 - 1e-8))
    step_rule = Momentum(learning_rate=lrate, momentum=0.95)
    # step_rule = AdaDelta()
    # step_rule = RMSProp(learning_rate=0.01)
    # step_rule = AdaGrad(learning_rate=1e-4)
    # NOTE(review): `params=` is the old Blocks kwarg; newer versions expect
    # `parameters=` — keep consistent with the installed Blocks version.
    algorithm = GradientDescent(cost=cost,
                                params=computation_graph.parameters,
                                step_rule=step_rule)
    # Carry over graph updates (e.g. RNG / scan state) into training.
    algorithm.add_updates(computation_graph.updates)

    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs),
        TrainingDataMonitoring([cost] + computation_graph.auxiliary_variables,
                               after_batch=False, after_epoch=True),
        # every_n_epochs=1),
        Printing(after_epoch=True, after_batch=False),
        # every_n_epochs=1,
        # Checkpoint(path="./fivem.zip",every_n_epochs=10,after_training=True)
    ]
    main_loop = MainLoop(model=model, data_stream=train_loop_stream,
                         algorithm=algorithm, extensions=extensions)
    return main_loop
# NOTE(review): another `run` (taking discriminative_regularization) appears
# earlier in this file; if both really live in one module this definition
# shadows it — presumably these are separate scripts concatenated together.
def run():
    """Train a CelebA attribute classifier with batch norm + dropout,
    monitoring train cost and validation cost/accuracy, checkpointing
    every 5 epochs for 50 epochs total."""
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=True)
    main_loop_stream = streams[0]
    train_monitor_stream = streams[1]
    valid_monitor_stream = streams[2]

    cg, bn_dropout_cg = create_training_computation_graphs()

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    pop_updates = get_batch_normalization_updates(bn_dropout_cg)
    decay_rate = 0.05
    # EMA: population stat p <- decay * minibatch stat m + (1 - decay) * p.
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring: training uses the batch-norm/dropout graph...
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring(
        [cost], train_monitor_stream, prefix="train",
        before_first_epoch=False, after_epoch=False, after_training=True,
        updates=extra_updates)

    # ...while validation uses the inference graph (population statistics),
    # which also exposes an accuracy output.
    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    monitored_quantities = [cost, accuracy]
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities, valid_monitor_stream, prefix="valid",
        before_first_epoch=False, after_epoch=False, every_n_epochs=5)

    # Prepare checkpoint
    checkpoint = Checkpoint(
        'celeba_classifier.zip', every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=50), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    """Train a (dictionary-augmented) language model end to end.

    Wires up data streams, the cost graph, Adam + optional gradient
    clipping, validation/best-model tracking, checkpointing (full main
    loop or lightweight state, per ``config['fast_checkpoint']``), an
    optional Fuel data server, and runs the main loop to completion.

    Parameters
    ----------
    new_training_job : bool — if False, resume from a saved checkpoint.
    config : dict-like of hyperparameters (abbreviated to ``c`` below).
    save_path : directory for checkpoints, logs, and TF summaries.
    params : optional path to a parameter file to initialize from.
    fast_start : bool — skip expensive before/after-training events.
    fuel_server : bool — serve training data via a Fuel ServerDataStream.
    seed : optional int — seeds both Fuel and Blocks RNGs.
    """
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # full main loop can be saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # or only state (log + params) which can be useful not to pickle embeddings
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    # Symbolic inputs: int64 (batch, length, ?) words and a float mask.
    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        # Attach a tiny real batch as test values for shape debugging.
        test_value_data = next(
            data.get_stream('train', batch_size=4,
                            max_length=5).get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.extend([num_definitions])

    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        # Pretrained embeddings: neither train nor checkpoint them.
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    if c['cache_size'] != 0:
        # The cache is updated via explicit graph updates, not gradients.
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    logger.info("Cost parameters" + "\n" + pprint.pformat([
        " ".join(
            (key, str(parameters[key].get_value().shape),
             'trained' if parameters[key] in trained_parameters else 'frozen'))
        for key in sorted(parameters.keys())
    ], width=120))

    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if c['cache_size'] != 0:
        # Apply the embedding-cache updates produced by lm.apply.
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS])

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid").set_conditions(
                                          before_first_epoch=not fast_start,
                                          on_resumption=True,
                                          every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min).set_conditions(
                                      on_resumption=True,
                                      after_epoch=True,
                                      every_n_batches=c['mon_freq_valid'])

    # don't save them the entire main loop to avoid pickling everything
    if c['fast_checkpoint']:
        load = (LoadNoUnpickling(state_path,
                                 load_iteration_state=True,
                                 load_log=True).set_conditions(
                                     before_training=not new_training_job))
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(state_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                state_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)
    else:
        load = (Load(main_loop_path,
                     load_iteration_state=True,
                     load_log=True).set_conditions(
                         before_training=not new_training_job))
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

        checkpoint = Checkpoint(main_loop_path,
                                before_training=not fast_start,
                                every_n_batches=c['save_freq_batches'],
                                after_training=not fast_start,
                                **cp_args)

        if c['checkpoint_every_n_batches']:
            intermediate_cp = IntermediateCheckpoint(
                main_loop_path,
                every_n_batches=c['checkpoint_every_n_batches'],
                after_training=False,
                **cp_args)

    # Also snapshot to best_model.tar whenever a new best perplexity is seen.
    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    extensions = [
        load,
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq_train'])
    ]
    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))
    extensions.extend([
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']),
        validation, track_the_best, checkpoint
    ])
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.extend([
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True),
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']),
        # Stop early if validation perplexity stops improving.
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']),
        FinishAfter(after_n_batches=c['n_batches'])
    ])

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)

    main_loop.run()
if normalization == 'bn2': for m,s,var in statistics_list: parameters.extend([m,s]) algorithm = GradientDescent( cost=cost, parameters=parameters, step_rule=Adam(0.01)) #update the M and S with batch statistics alpha = 0.1 updates = [] if normalization == 'bn2': for m,s,var in statistics_list: updates.append((m, cast(alpha*m + (1-alpha)*var.mean(axis=0), floatX))) updates.append((s, cast(alpha*s + (1-alpha)*var.std(axis=0) , floatX))) algorithm.add_updates(updates) # Since this line wont work with the extension to include parameters # in the gradient descent. Here's an extension that will do the job. from blocks.extensions import SimpleExtension from theano import function class UpdateExtraParams(SimpleExtension): """Adjusts shared variable parameter using some function. """ def __init__(self, input1, input2, updates, **kwargs): kwargs.setdefault("after_batch", True) super(UpdateExtraParams, self).__init__(**kwargs) self.updates = updates self.func = function([input1, input2],[], updates = updates,
def main():
    """Parse CLI options, build the training graph, and run (or resume,
    evaluate, or dump hidden states from) the main loop.

    Command-line flags select the optimizer, initialization, model size,
    and whether to resume from / evaluate a saved main loop.
    """
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation", choices=list(activations.keys()),
                        default="tanh")
    # FIX: `choices` was passed the raw string "sgdmomentum adam rmsprop",
    # which makes argparse do a *substring* test (so e.g. "sgd" or even "m"
    # was accepted, then fell through the if/elif below leaving `optimizer`
    # undefined). Split into a list, matching --initialization above.
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop".split(),
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    if args.continue_from:
        # Resume a pickled main loop and run it; nothing else to build.
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    step_rule = CompositeRule([StepClipping(1.0), optimizer])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor: per-parameter step norms plus totals, every batch.
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(TrainingDataMonitoring(
        step_channels, prefix="iteration", after_batch=True))

    # parameter monitor
    # NOTE(review): data_stream=None relies on DataStreamMonitoring
    # tolerating a null stream for parameter-only channels — verify.
    extensions.append(DataStreamMonitoring(
        [param.norm(2).copy(name="parameter.norm:%s" % name)
         for name, param in model.get_parameter_dict().items()],
        data_stream=None, after_epoch=True))

    validation_interval = 500
    # performance monitor on train/valid/test for both graphs.
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor"
                           % (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(DataStreamMonitoring(
                channels,
                prefix="%s_%s" % (which_set, situation),
                every_n_batches=validation_interval,
                data_stream=get_stream(which_set=which_set,
                                       batch_size=args.batch_size,
                                       num_examples=10000,
                                       length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        # FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)])
    if not args.cluster:
        extensions.append(ProgressBar())
    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log")])
    main_loop = MainLoop(
        data_stream=get_stream(which_set="train", batch_size=args.batch_size,
                               length=args.length, augment=True),
        algorithm=algorithm, extensions=extensions, model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width, weight_noise,
          z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout,
          batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type,
          num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay,
          **kwargs):
    """Run the word-level PTB zoneout experiment end to end (Python 2).

    Loads penntree data, builds zoneout-masked streams, stacks
    LSTM/GRU/SRNN layers with carried-over initial states, attaches a
    norm-stabilizer penalty and optional weight noise, then trains with
    the chosen step rule while monitoring train/dev(/test) cost, bits per
    character, and perplexity.

    Notable parameters: `algorithm` selects the optimizer by name;
    `penalty` ('cells'/'hids') selects the norm-stabilizer target;
    `test_cost` enables test-set monitoring; `testing` truncates the data
    for a quick smoke run; `kwargs` may carry 'load_path' to warm-start
    parameters from an .npz file.
    """
    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    # NOTE(review): `onehot` is defined but never called in this function.
    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input
            array plus an extra dimension, containing the 'one-hot'-encoded
            labels. """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        # Tiny subsets for a quick smoke test.
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        # One shared zoneout probability for states and cells.
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)')
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

    # rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################
    def prep_dataset(dataset):
        # Trim so the data reshapes evenly into (time, batch, seq_len),
        # then attach zoneout-mask sources via SampleDropsNPWord.
        dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len *
                                                            batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells,
                                   drop_prob_igates, layer_size, num_layers,
                                   False, stoch_depth, share_mask,
                                   gaussian_drop, alphabetsize)
        stream.sources = ('data', ) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream, )

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################
    # One real batch, used only to set Theano test values below.
    data = train_stream.get_epoch_iterator(as_dict=True).next()
    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    # Input projection size depends on the gate count (4x LSTM, 3x GRU, 1x SRNN).
    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size,
                     weights_init=weights_init,
                     activation=Tanh(),
                     model_type=6,
                     name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size,
                    weights_init=weights_init,
                    activation=Tanh(),
                    name='rnn%d' % l) for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size,
                                weights_init=weights_init,
                                activation=Rectifier(),
                                name='rnn%d' % l) for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize,
                        name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  #in_to_hid.apply(x)

    # init_updates carries each layer's last state into the next batch
    # (the shared *_init variables are updated to states[-1]).
    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError
        layer_input = states

    # Predict token t from states up to t-1 (shifted by prepending the init).
    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    # NOTE(review): `shape_` is computed but never used.
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################
    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)),
                                           y_hat).copy('cost')
    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        # L2 magnitude, clamped away from zero for a stable sqrt gradient.
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis),
                      numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            # Penalize step-to-step changes in cell-state magnitude.
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name
                for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1])**2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy(
        'cost_train')  #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    # `algorithm` arrives as a string and is rebound to the GradientDescent
    # object below; `learning_rate` is likewise rebound to the step rule's
    # shared variable so LearningRateSchedule can mutate it.
    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        # Condition number via ratio of extreme singular values.
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    # NOTE(review): `p.get_value().shape == 2` compares a tuple with an int
    # and is always False, so both whysplode_* lists stay empty; presumably
    # `len(p.get_value().shape) == 2` was intended.
    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name,
                             "x".join(map(str, p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if p.get_value().shape == 2:
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name,
                             "x".join(map(str, p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms
    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    # Dev monitoring uses cloned initial-state variables so evaluation does
    # not disturb the training recurrent state.
    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp],
                                       data_stream=valid_stream,
                                       prefix="dev",
                                       updates=dev_init_updates)

    # noone does this
    if 'load_path' in kwargs:
        # Warm-start: copy matching-shape arrays from an .npz archive.
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])
    if test_cost:
        # Same cloned-state trick for the test stream.
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream,
            prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """

        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """

        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
def train(algorithm, learning_rate, clipping, momentum,
          layer_size, epochs, test_cost, experiment_path,
          initialization, init_width, weight_noise, z_prob,
          z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout,
          batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type,
          num_layers, norm_cost_coeff, penalty, testing, seq_len,
          decrease_lr_after_epoch, lr_decay,
          **kwargs):
    """Train a word-level RNN language model on Penn Treebank with zoneout.

    Builds a stacked DropLSTM/DropGRU/DropSimpleRecurrent language model,
    sets up the Blocks training algorithm (with optional norm-stabilizer
    penalty and weight noise), attaches train/dev/(test) monitors and
    supporting extensions, and runs the main loop.  Side effects: reads
    'penntree_char_and_word.npz' from the working directory, writes a log
    file and parameter snapshots under `experiment_path`, and prints
    progress to stdout.
    """
    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    # NOTE(review): `onehot` is defined here but never called in this
    # function body — presumably kept for debugging; confirm before removal.
    def onehot(x, numclasses=None):
        """ Convert integer encoding for class-labels (starting with 0 !)
            to one-hot encoding.
            The output is an array whose shape is the shape of the input array
            plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    # Shrink the splits for quick smoke tests.
    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    # With share_mask a single zoneout probability is used for both cells
    # and states; otherwise the two are configured independently.
    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)')
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

#    rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        # Truncate so the data reshapes evenly into (n_batches, batch, seq_len),
        # then wrap in a stream that also yields pre-sampled zoneout masks.
        dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2))

        stream = DataStream(IndexableDataset(indexables=OrderedDict([
            ('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(
            stream, z_prob_states, z_prob_cells, drop_prob_igates,
            layer_size, num_layers, False, stoch_depth, share_mask,
            gaussian_drop, alphabetsize)
        stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream,)

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    # One batch used only to attach Theano test values below.
    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################

    print '.. building model'

    x = T.tensor3('data')
    y = x  # language modelling: the target is the (shifted) input itself
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    # One input projection + one recurrent brick per layer; the projection
    # width depends on the gate count of the cell type (4x for LSTM, 3x GRU).
    if rnn_type.lower() == 'lstm':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*4, name='in_to_hid%d'%l,
                             weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6,
                                     name='rnn%d'%l, ogates_zoneout=ogates_zoneout) for l in range(num_layers)]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*3, name='in_to_hid%d'%l,
                             weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(),
                                    name='rnn%d'%l) for l in range(num_layers)]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d'%l,
                             weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)]
        recurrent_layers = [DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(),
                                                name='rnn%d'%l) for l in range(num_layers)]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out',
                        weights_init=weights_init, biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x #in_to_hid.apply(x)
    # init_updates carries state over batch boundaries: each epoch step
    # copies the last timestep's state into the shared initial state.
    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            # Each layer consumes its own slice of the per-layer zoneout masks.
            states, cells = layer.apply(rnn_embedding,
                                        zoneouts_states[:, :, l * layer_size : (l + 1) * layer_size],
                                        zoneouts_cells[:, :, l * layer_size : (l + 1) * layer_size],
                                        zoneouts_igates[:, :, l * layer_size : (l + 1) * layer_size],
                                        states_init,
                                        cells_init)
            init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError

        layer_input = states

    # Predict token t from the state after token t-1 (prepend the carried-in
    # initial state, drop the final one).
    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(
        y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost')
    # NOTE(review): bits-per-character variable is named 'bpr' — looks like a
    # typo for 'bpc'; changing it would rename the monitoring channel.
    bpc = (cost/np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')
    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        # Norm along `axis`, floored at the dtype's tiny value so the sqrt
        # gradient stays finite.
        return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    # Penalize changes in hidden/cell state magnitude across timesteps
    # (norm-stabilizer regularization).
    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states'%l in [o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states'%l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1))

    norm_cost.name = 'norm_cost'
    #cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy('cost_train') #should this be cost_train.outputs[0]? no.

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    # Select the optimizer; `learning_rate` is rebound to the optimizer's
    # shared variable so the LR schedule below can mutate it in place.
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError
    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    # Debugging channels for diagnosing exploding parameters.
    def cond_number(x):
        # Condition number = ratio of largest to smallest singular value.
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x*x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        # NOTE(review): `.shape == 2` compares a tuple to an int and is always
        # False, so these debug channels are never populated; the intended
        # test is probably `len(p.get_value().shape) == 2`.  Harmless as-is
        # since whysplode_* is commented out of observed_vars below.
        if p.get_value().shape == 2:
            whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        # NOTE(review): same always-False comparison as above.
        if p.get_value().shape == 2:
            whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape)))))

    observed_vars = [cost_train, cost, bpc, perp, learning_rate,
                     aggregation.mean(algorithm.total_gradient_norm).copy("gradient_norm_mean")] # + whysplode_rms

    # Also monitor every parameter's L2 norm and its gradient's L2 norm.
    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm"))

    train_monitor = TrainingDataMonitoring(
        variables=observed_vars,
        prefix="train", after_epoch=True
    )

    # Dev monitoring uses its own cloned initial-state shared variables so it
    # does not clobber the training recurrence state.
    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(
        variables=[dev_cost, dev_bpc, dev_perp],
        data_stream=valid_stream, prefix="dev",
        updates=dev_init_updates
    )

    # noone does this
    # Optionally warm-start parameters from an .npz archive, matching by
    # name (with the leading '/<brick>' path component stripped) and shape.
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend([FinishAfter(after_n_epochs=epochs),
                       train_monitor, dev_monitor])
    if test_cost:
        # Same cloned-initial-state trick as the dev monitor, for the test set.
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream, prefix="test",
            updates=test_init_updates
        )
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(SaveParams('dev_cost', model, experiment_path,
                                 every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """ rolls the cell and state activations between epochs
            so that first batch gets correct initial activations """
        def __init__(self, shvars):
            self.shvars = shvars
        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))
    extensions.append(RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                                     (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """ Lets you set a number to divide learning rate by each epoch + when to start doing that """
        def __init__(self):
            self.epoch_number = 0
        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value()/lr_decay)
    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError('Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)
    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
def construct_main_loop(name, task_name, batch_size, max_epochs,
                        patience_epochs, learning_rate,
                        hyperparameters, **kwargs):
    """Assemble and return a Blocks MainLoop for the given task.

    Builds the computation graphs via `construct_graphs`, configures an
    Adam-based GradientDescent algorithm with step clipping, attaches
    monitoring/early-stopping/checkpoint extensions, and prints a table of
    parameter sizes.  `hyperparameters` is mutated in place (adds
    "n_channels").  Returns the (not yet run) MainLoop.
    """
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam
    # NOTE(review): StepClipping appears twice in the rule chain (before and
    # after Adam) with different thresholds — presumably intentional (clip raw
    # gradients, then clip the Adam step), but worth confirming.
    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([StepClipping(1e1),
                                 Adam(learning_rate=learning_rate),
                                 StepClipping(1e2)]),
        on_unused_sources="warn")
    algorithm.add_updates(updates["train"])

    extensions.extend(construct_monitors(
        algorithm=algorithm, task=task, model=model, graphs=graphs,
        outputs=outputs, **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo
    # Early stopping on validation error rate, periodic checkpointing, and
    # console/file logging.
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate", epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False, every_n_epochs=5,
                   before_training=True, use_cpickle=True),
        ProgressBar(),
        Timing(), Printing(), PrintingTo(name + "_log")])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    # note blocks will crash and burn because it cannot deal with an
    # already-initialized Algorithm, so this should be enabled only for
    # debugging
    if False:
        with open("graph", "w") as graphfile:
            algorithm.initialize()
            theano.printing.debugprint(algorithm._function, file=graphfile)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate((key, "x".join(map(str, value.get_value().shape)), value.get_value().size)
                   for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
# param_nans = 0 # for i, p in enumerate(params): # # cost += reg * abs(p).sum() # cost += reg * (p ** 2).sum() # param_nans += T.isnan(p).sum() # name = params[i].name # params[i] = params[i].mean() # params[i].name = name + str(i) cost.name = "final_cost" # param_nans.name = 'params_nans' ## Training algorithm # algorithm.initialize() algorithm.add_updates(cg.updates) extensions = [ FinishAfter(after_n_epochs=5000), DataStreamMonitoring([cost, error_rate, mistake_rate], data_stream=test, updates=cg.updates, prefix="test"), TrainingDataMonitoring( [ cost, # hidden_states, debug_val, param_nans, aggregation.mean(algorithm.total_gradient_norm) ], #+ params, prefix="train", after_epoch=True), Timing(),
def train(cli_params):
    """Train a ladder network and return its validation log as a DataFrame.

    Prepares the save directory and file logging, loads parameters and data,
    builds the ladder model, trains it with Adam while also tracking batch
    normalization statistics, and finally returns the MainLoop log as a
    pandas DataFrame (or None if the run was interrupted mid-epoch).
    `cli_params` is mutated in place (adds 'save_dir').
    """
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    # NOTE(review): `short_prints` is built but unused below (the
    # ShortPrinting extension that consumed it is commented out).
    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_error": OrderedDict([
            ('VE_C_class', ladder.costs.class_clean),
            ('VE_E', ladder.error.clean),
            ('VE_C_de', ladder.costs.denois.values()),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=Model(ladder.costs.total),
        extensions=[
            #FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            #ApproxTestMonitoring(
            #    [ladder.costs.class_clean, ladder.error.clean]
            #    + ladder.costs.denois.values(),
            #    make_datastream(data.valid, data.valid_ind,
            #                    p.valid_batch_size, whiten=whiten, cnorm=cnorm,
            #                    scheme=ShuffledScheme),
            #    prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size,
                                n_labeled=len(data.valid_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_error", every_n_epochs=1),
            TrainingDataMonitoring([
                ladder.costs.total, ladder.costs.class_corr,
                training_algorithm.total_gradient_norm
            ] + ladder.costs.denois.values(),
                prefix="train", after_epoch=True),
            #SaveParams(None, all_params, p.save_dir, after_epoch=True),
            #SaveExpParams(p, p.save_dir, before_training=True),
            #SaveLog(p.save_dir, after_training=True),
            #ShortPrinting(short_prints),
            #ProgressBar(),
            StopAfterNoImprovementValidation('valid_error_error_rate_clean', 10,
                                             all_params, p, p.save_dir),
            #LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
            #        after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = DataFrame.from_dict(main_loop.log, orient='index')
    col = 'valid_error_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
def run():
    """Train a recurrent regression model with RMSProp.

    Reads configuration (hdf5_file, batch_size, learning rates, paths, ...)
    from module-level globals, builds the network via `nn_fprop`, and runs a
    Blocks MainLoop with test/train monitoring, checkpointing, and optional
    learning-rate decay.
    """
    # Load Model
    net_size = 256  #Hard-code instead of loading model (takes too long to set up network)
    #net = vaegan.VAEGAN()
    #network_saver = saver.NetworkSaver('vaegan/models/', net=net)
    #network_saver.load()

    # DATA
    train_stream = get_stream(hdf5_file, 'train', batch_size)  #TODO jonathan ?
    test_stream = get_stream(hdf5_file, 'test', batch_size)  #TODO jonathan ?

    # MODEL
    x = T.TensorType('floatX', [False] * 3)('features')
    y = T.tensor3('targets', dtype='floatX')
    # Mutable train/inference switch shared with the bricks built below.
    train_flag = [theano.shared(0)]
    # Reorder from (batch, time, ...) to (time, batch, ...) for the RNN.
    x = x.swapaxes(0, 1)
    y = y.swapaxes(0, 1)

    # More Config
    out_size = len(output_columns) - 1  # code_mode=RL-MDN
    latent_size = net_size
    in_size = latent_size + len(input_columns)

    # NN fprop
    y_hat, cost, cells = nn_fprop(x, y, in_size, out_size, hidden_size,
                                  num_recurrent_layers, train_flag)

    # COST
    cg = ComputationGraph(cost)
    extra_updates = []

    # RMS Prop training optimizer
    step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate),
                  StepClipping(step_clipping)]
    parameters_to_update = cg.parameters
    algorithm = GradientDescent(cost=cg.outputs[0],
                                parameters=parameters_to_update,
                                step_rule=CompositeRule(step_rules))
    # NOTE(review): extra_updates is always empty here, so this call is a
    # no-op as written.
    algorithm.add_updates(extra_updates)  # TODO jonathan what is this, is this needed?

    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    monitored_vars = [cost, step_rules[0].learning_rate, gradient_norm, step_norm]

    test_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                        before_first_epoch=True,
                                        data_stream=test_stream, prefix="test")
    train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                           after_epoch=True,
                                           before_first_epoch=True,
                                           prefix='train')

    set_train_flag = SetTrainFlag(after_epoch=True, before_epoch=True, flag=train_flag)

    # plot = Plot('Plotting example', channels=[['cost']], after_batch=True, open_browser=True)
    extensions = [set_train_flag, test_monitor, train_monitor, Timing(),
                  Printing(after_epoch=True),
                  FinishAfter(after_n_epochs=nepochs),
                  saveload.Load(load_path),
                  saveload.Checkpoint(last_path, every_n_epochs=10000),
                  ] + track_best('test_cost', save_path) #+ track_best('train_cost', last_path)

    # Multiply the learning rate by learning_rate_decay every
    # lr_decay_every_n_epochs epochs (skipped for the no-op factors 0 and 1).
    if learning_rate_decay not in (0, 1):
        extensions.append(SharedVariableModifier(
            step_rules[0].learning_rate,
            lambda n, lr: np.cast[theano.config.floatX](learning_rate_decay * lr),
            after_epoch=False, every_n_epochs=lr_decay_every_n_epochs,
            after_batch=False))

    print 'number of parameters in the model: ' + str(T.sum([p.size for p in cg.parameters]).eval())
    # Finally build the main loop and train the model
    mainLoop = MainLoop(data_stream=train_stream, algorithm=algorithm,
                        model=Model(cost), extensions=extensions)
    mainLoop.run()
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    """Train a DRAW model on binarized MNIST.

    Parameters
    ----------
    nvis : int
        Number of visible (input) units.
    nhid : int
        Number of latent units.
    encoding_lstm_dim : int
        Dimensionality of the encoder LSTM.
    decoding_lstm_dim : int
        Dimensionality of the decoder LSTM.
    T : int, optional
        Number of DRAW glimpse/timestep iterations.  Defaults to 1.
    """
    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm, biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    # Override the pushed-down init for the LSTMs only.
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost: negative ELBO, i.e. an upper bound on the NLL.
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams: batch 100 for training, 500 for monitoring.
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)))
    train_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)))
    mnist_valid = BinarizedMNIST('valid')
    valid_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)))
    mnist_test = BinarizedMNIST('test')
    test_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)))

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    # Monitor the mean of every auxiliary variable whose name ends in 'term'
    # (e.g. the KL and reconstruction terms).
    monitoring_channels = dict([
        ('avg_' + channel.tag.name, channel.mean()) for channel in
        VariableFilter(name='.*term$')(computation_graph.auxiliary_variables)])
    for name, channel in monitoring_channels.items():
        channel.name = name
    monitored_quantities = monitoring_channels.values() + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(monitored_quantities, train_monitor_stream,
                                 prefix="train",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities, valid_monitor_stream,
                                 prefix="valid",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities, test_monitor_stream,
                                 prefix="test",
                                 updates=computation_graph.updates),
            ProgressBar(),
            Printing()])
    main_loop.run()
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.
    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.
    alpha : float, optional
        Weight to apply to a new sample when calculating running averages
        for population statistics (1 - alpha weight is given to the
        existing average).
    """
    # Pick the MLP flavour.  BatchNormalizedMLP takes one extra keyword
    # argument that trades a bit of extra memory for speed.
    if batch_normalized:
        net_cls, net_kwargs = BatchNormalizedMLP, {'conserve_memory': False}
    else:
        net_cls, net_kwargs = MLP, {}
    network = net_cls([Logistic(), Logistic(), Logistic(), Softmax()],
                      [2, 5, 5, 5, 3],
                      weights_init=IsotropicGaussian(0.2),
                      biases_init=Constant(0.), **net_kwargs)
    network.initialize()

    # Three-armed spiral dataset: the first 8000 examples are used for
    # training, the remaining 2000 for testing.
    spiral = Spiral(num_examples=10000, classes=3,
                    sources=['features', 'label'], noise=0.05)
    train_stream = DataStream(
        spiral,
        iteration_scheme=ShuffledScheme(examples=8000, batch_size=20))
    test_stream = DataStream(
        spiral,
        iteration_scheme=SequentialScheme(examples=list(range(8000, 10000)),
                                          batch_size=2000))

    # Build a cost graph; any BatchNormalization bricks it contains run in
    # inference mode by default.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = network.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long
    inference_cg = ComputationGraph([cost, misclass])

    if batch_normalized:
        # Rewrite the graph to use minibatch statistics, and schedule
        # exponential-moving-average updates for the population statistics.
        cg = apply_batch_normalization(inference_cg)
        extra_updates = [(pop, mb * alpha + pop * (1 - alpha))
                         for pop, mb in get_batch_normalization_updates(cg)]
    else:
        cg = inference_cg
        extra_updates = []

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    # Monitor the original cost/misclass variables so that the monitored
    # graph is the inference-mode one.
    main_loop = MainLoop(
        algorithm=algorithm,
        data_stream=train_stream,
        extensions=[
            DataStreamMonitoring([cost, misclass], train_stream,
                                 prefix='train'),
            DataStreamMonitoring([cost, misclass], test_stream,
                                 prefix='test'),
            Printing(),
            FinishAfter(after_n_epochs=num_epochs)])
    main_loop.run()
    return main_loop
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs,
                        patience_epochs, learning_rate,
                        gradient_limiter, hyperparameters, **kwargs):
    """Assemble and return a Blocks MainLoop for an attention-patch task.

    Converts the theta-noise hyperparameters into shared variables with
    exponential decay, builds the graphs via `construct_graphs`, configures
    GradientDescent (Adam plus a clipping or compressing gradient limiter),
    attaches monitoring/early-stopping/checkpoint extensions, and prints a
    table of parameter sizes.  `hyperparameters` is mutated in place.
    Raises ValueError for an unknown `gradient_limiter`.
    """
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(util.ExponentialDecay(
            hyperparameters[key],
            hyperparameters["%s_decay" % key],
            after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    # Select how gradients are tamed before the Adam step.
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError()
    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter, Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(construct_monitors(
        algorithm=algorithm, task=task, model=model, graphs=graphs,
        outputs=outputs, updates=updates, **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    # Early stopping on validation error rate, periodic checkpointing, and
    # console/file logging plus gradient-graph dumping.
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate", epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name+"_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False, every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name+"_log"),
        DumpGraph(name+"_grad_graph")])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate((key, "x".join(map(str, value.get_value().shape)), value.get_value().size)
                   for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
def train_model(cost, cross_entropy, updates,
                train_stream, valid_stream, args, gate_values=None):
    """Train an RNN language model with optional weight noise and
    visualization extensions, then run the main loop to completion.

    Parameters
    ----------
    cost : theano variable
        Training cost to minimize.
    cross_entropy : theano variable
        Validation metric monitored alongside the cost.
    updates : list of (shared, expr) pairs
        Recurrent-state updates; also given to the monitors and reset
        periodically by ``ResetStates``.
    args : namespace
        Parsed command-line options (paths, frequencies, model type...).
    gate_values : optional
        Gate activations to visualize when ``args.visualize_gates`` is set.
    """
    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE
    weight_noise = args.weight_noise
    if weight_noise > 0:
        # Perturb weights in the graph and rebuild the cost from the
        # noisy graph so gradients flow through the perturbation.
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
        cost.name = "cost_with_weight_noise"
        cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # NOTE(review): this codebase uses the older Blocks keyword `params`
    # here, while other functions in the file use `parameters`.
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                params=cg.parameters)
    algorithm.add_updates(updates)

    # extensions to be added
    extensions = []

    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    # Pre-softmax output variable used for text generation.
    outputs = [
        variable for variable in cg.variables if variable.name == "presoft"
    ]

    if args.generate:
        extensions.append(TextGenerationExtension(
            outputs=outputs,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=args.monitoring_freq,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode))

    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq,
                               after_epoch=True),
        DataStreamMonitoring([cost, cross_entropy],
                             valid_stream, args.mini_batch_size_valid,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=not (args.visualize_gates),
                             every_n_batches=args.monitoring_freq),
        # Periodically reset the recurrent state shared variables.
        ResetStates([v for v, _ in updates], every_n_batches=100),
        ProgressBar()
    ])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        else:
            raise Exception('Directory already exists')

    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience, args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())
    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            extensions.append(VisualizeGateLSTM(gate_values, updates,
                                                args.dataset,
                                                ploting_path=None))
        elif args.rnn_type == "soft":
            extensions.append(VisualizeGateSoft(gate_values, updates,
                                                args.dataset,
                                                ploting_path=None))
        else:
            # Only lstm/soft RNNs have gate visualizers.
            assert (False)

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
def main():
    """Entry point: parse CLI options, build the training graphs and run
    (or resume / evaluate / dump hidden states of) the main loop.

    Fix: ``--optimizer`` previously passed ``choices`` as a single string,
    so argparse's membership test (``value in choices``) accepted any
    substring such as ``"sgd"`` or ``"adam r"``; an invalid-but-accepted
    value then fell through the if/elif chain below and crashed with a
    NameError on ``optimizer``. ``choices`` is now a list, matching the
    ``--initialization`` option above.
    """
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation", choices=list(activations.keys()),
                        default="tanh")
    # was: choices="sgdmomentum adam rmsprop" (substring matching bug)
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop".split(),
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    # Resuming: deserialize the whole main loop and just keep running it.
    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate,
                            decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate,
                             momentum=0.99)
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])
    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor: per-parameter step norms plus totals and the raw
    # training outputs, logged every batch.
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels,
                               prefix="iteration", after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ], data_stream=None, after_epoch=True))

    validation_interval = 500
    # performance monitor on train/valid/test for each graph flavour
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor"
                           % (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(
                    channels,
                    prefix="%s_%s" % (which_set, situation),
                    every_n_batches=validation_interval,
                    data_stream=get_stream(which_set=which_set,
                                           batch_size=args.batch_size,
                                           num_examples=10000,
                                           length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip", on_interrupt=False,
                   every_n_epochs=1, use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])
    if not args.cluster:
        extensions.append(ProgressBar())
    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    # Alternate modes: dump hidden states or evaluate instead of training.
    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()
def main(num_epochs=50, batch_normalized=True, alpha=0.1):
    """Run the example.

    Parameters
    ----------
    num_epochs : int, optional
        Number of epochs for which to train.
    batch_normalized : bool, optional
        Batch-normalize the training graph. Defaults to `True`.
    alpha : float, optional
        Weight given to a new sample in the running averages for the
        population statistics (the existing average keeps 1 - alpha).
    """
    # Pick the MLP flavour; the batch-normalized variant takes one extra
    # keyword that trades a bit of memory for speed.
    if batch_normalized:
        brick_cls = BatchNormalizedMLP
        brick_kwargs = {'conserve_memory': False}
    else:
        brick_cls = MLP
        brick_kwargs = {}
    net = brick_cls([Logistic(), Logistic(), Logistic(), Softmax()],
                    [2, 5, 5, 5, 3],
                    weights_init=IsotropicGaussian(0.2),
                    biases_init=Constant(0.), **brick_kwargs)
    net.initialize()

    # Three-armed spiral dataset: the first 8000 examples are used for
    # training, the remaining 2000 for testing.
    spiral = Spiral(num_examples=10000, classes=3,
                    sources=['features', 'label'], noise=0.05)
    train_stream = DataStream(
        spiral,
        iteration_scheme=ShuffledScheme(examples=8000, batch_size=20))
    test_stream = DataStream(
        spiral,
        iteration_scheme=SequentialScheme(
            examples=list(range(8000, 10000)), batch_size=2000))

    # Build the cost graph; any BatchNormalization bricks inside run in
    # inference mode by default.
    features = tensor.matrix('features')
    label = tensor.lvector('label')
    prediction = net.apply(features)
    cost = CategoricalCrossEntropy().apply(label, prediction)
    misclass = MisclassificationRate().apply(label, prediction)
    misclass.name = 'misclass'  # The default name for this is annoyingly long

    inference_cg = ComputationGraph([cost, misclass])
    if not batch_normalized:
        cg = inference_cg
        extra_updates = []
    else:
        cg = apply_batch_normalization(inference_cg)
        # Exponential moving average updates for the population statistics.
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in get_batch_normalization_updates(cg)]

    algorithm = GradientDescent(step_rule=Adam(0.001),
                                cost=cg.outputs[0],
                                parameters=cg.parameters)
    algorithm.add_updates(extra_updates)

    # Monitor the original cost and misclass variables so that the
    # (original) inference-mode graph is what gets measured.
    monitors = [
        DataStreamMonitoring([cost, misclass], train_stream, prefix='train'),
        DataStreamMonitoring([cost, misclass], test_stream, prefix='test'),
        Printing(),
        FinishAfter(after_n_epochs=num_epochs),
    ]
    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train_stream,
                         extensions=monitors)
    main_loop.run()
    return main_loop
def main(save_to, num_epochs,
         regularization=0.0001, subset=None, num_batches=None,
         batch_size=None, histogram=None, resume=False):
    """Train a residual convnet on CIFAR-10 with batch normalization and
    L2 regularization, checkpointing to ``save_to``.

    Fix: the ``AttributionExtension`` below referenced ``cg.parameters``,
    but no variable ``cg`` exists in this function (only ``test_cg`` and
    ``train_cg``), so passing ``histogram`` crashed with a NameError.
    It now uses ``train_cg.parameters`` — the graph being trained.

    Parameters
    ----------
    save_to : str
        Checkpoint path; also used in plot titles.
    num_epochs / num_batches : int
        Stopping criteria.
    regularization : float
        L2 penalty coefficient.
    histogram : str, optional
        If given, collect per-component attributions and save them here.
    resume : bool
        Reload a previous checkpoint before training.
    """
    output_size = 10
    convnet = create_res_net()

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Apply dropout to all layer outputs except final softmax
    # dropout_vars = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables)
    # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5)

    # Apply 0.2 dropout to the pre-averaging layer
    # dropout_vars_2 = VariableFilter(
    #     roles=[OUTPUT], bricks=[Convolutional],
    #     theano_name_regex="^conv_8_apply_output$")(test_cg.variables)
    # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2)

    # Apply 0.2 dropout to the input, as in the paper
    # train_cg = apply_dropout(test_cg, [x], 0.2)
    # train_cg = drop_cg
    # train_cg = apply_batch_normalization(test_cg)
    # train_cost, train_error_rate, train_components = train_cg.outputs

    # Separate training graph with batch normalization in training mode.
    with batch_normalization(convnet):
        train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph(
        [train_cost, train_error_rate, train_components])

    # Population statistics follow an exponential moving average of the
    # minibatch statistics (p = population, m = minibatch).
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # Apply regularization to the cost
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    l2_norm = sum([(W ** 2).sum() for W in weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = regularization * l2_norm
    l2_regularization.name = 'l2_regularization'
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + regularization * l2_norm
    train_cost.name = 'cost_with_regularization'

    # Training stream: normalize, then random pad/crop/flip augmentation.
    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 500
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Create a step rule that doubles the learning rate of biases, like Caffe.
    # scale_bias = Restrict(Scale(2), biases)
    # step_rule = CompositeRule([scale_bias, momentum])

    # from theano.compile.nanguardmode import NanGuardMode

    # Train with simple SGD
    algorithm = GradientDescent(
        cost=train_cost, parameters=train_cg.parameters,
        step_rule=momentum)
    algorithm.add_updates(extra_updates)
    #,
    # theano_func_kwargs={
    #     'mode': NanGuardMode(
    #         nan_is_error=True, inf_is_error=True, big_is_error=True)})

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs,
                              after_n_batches=num_batches),
                  EpochSchedule(momentum.learning_rate, [
                      (0, 0.01),     # Warm up with 0.01 learning rate
                      (1, 0.1),      # Then go back to 0.1
                      (100, 0.01),
                      (150, 0.001)
                      # (83, 0.01),  # Follow the schedule in the paper
                      # (125, 0.001)
                  ]),
                  DataStreamMonitoring(
                      [test_cost, test_error_rate, test_confusion],
                      cifar10_test_stream,
                      prefix="test"),
                  TrainingDataMonitoring(
                      [train_cost, train_error_rate,
                       train_cost_without_regularization,
                       l2_regularization,
                       momentum.learning_rate,
                       aggregation.mean(algorithm.total_gradient_norm)],
                      prefix="train",
                      every_n_batches=17),
                  # after_epoch=True),
                  Plot('Training performance for ' + save_to,
                       channels=[
                           ['train_cost_with_regularization',
                            'train_cost_without_regularization',
                            'train_l2_regularization'],
                           ['train_error_rate'],
                           ['train_total_gradient_norm'],
                       ],
                       every_n_batches=17),
                  Plot('Test performance for ' + save_to,
                       channels=[[
                           'train_error_rate',
                           'test_error_rate',
                       ]],
                       after_epoch=True),
                  Checkpoint(save_to, use_cpickle=True),
                  ProgressBar(),
                  Printing()]

    if histogram:
        # FIX: was `cg.parameters` — `cg` is undefined in this scope.
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(save_to, True, True))

    model = Model(train_cost)

    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)
    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def train_ladder(cli_params, dataset=None, save_to='results/ova_all_full'):
    """Train a ladder network on a one-vs-all dataset split.

    Parameters
    ----------
    cli_params : dict
        Command-line parameters; ``'save_dir'`` is added in place.
    dataset : dict
        Must provide 'ovadataset', 'train_indexes' and 'val_indexes'.
    save_to : str, optional
        Directory name passed to ``prepare_dir``.

    Returns
    -------
    (numpy.ndarray, dict) or None
        The network's top-layer outputs over one validation epoch plus
        the collected inputs, or ``None`` if training was interrupted.
    """
    cli_params['save_dir'] = prepare_dir(save_to)
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    # NOTE(review): older Blocks keyword `params` is used here, whereas
    # other trainers in this file use `parameters`.
    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    # Compact channel aliases used by ShortPrinting.
    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    ovadataset = dataset['ovadataset']
    train_indexes = dataset['train_indexes']
    val_indexes = dataset['val_indexes']

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(ovadataset, train_indexes,
                        p.batch_size, scheme=ShuffledScheme),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(ovadataset, val_indexes,
                                p.batch_size),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean_mc]
                + ladder.costs.denois.values(),
                make_datastream(ovadataset, train_indexes,
                                p.batch_size),
                make_datastream(ovadataset, val_indexes,
                                p.batch_size),
                prefix="valid_final",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                [ladder.costs.total, ladder.costs.class_corr,
                 training_algorithm.total_gradient_norm]
                + ladder.costs.denois.values(),
                prefix="train", after_epoch=True),

            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_matrix_cost'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    ds = make_datastream(ovadataset, val_indexes,
                         p.batch_size)
    # Top-layer clean-path activations; NOTE(review): _get_bn_params
    # appears to rewrite them to use estimated BN statistics — confirm.
    outputs = ladder.act.clean.labeled.h[len(ladder.layers) - 1]
    outputreplacer = TestMonitoring()
    _, _, outputs = outputreplacer._get_bn_params(outputs)

    cg = ComputationGraph(outputs)
    f = cg.get_theano_function()

    it = ds.get_epoch_iterator(as_dict=True)
    res = []
    inputs = {'features_labeled': [],
              'targets_labeled': [],
              'features_unlabeled': []}
    # Loop over one epoch
    for d in it:
        # Store all inputs
        for k, v in d.iteritems():
            inputs[k] += [v]
        # Store outputs
        res += [f(*[d[str(inp)] for inp in cg.inputs])]

    # Concatenate all minibatches
    res = [numpy.vstack(minibatches) for minibatches in zip(*res)]
    inputs = {k: numpy.vstack(v) for k, v in inputs.iteritems()}

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return res[0], inputs
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs,
                        patience_epochs, learning_rate,
                        gradient_limiter, hyperparameters, **kwargs):
    """Assemble and return a Blocks ``MainLoop`` for the given task.

    Near-duplicate of the earlier ``construct_main_loop`` in this file:
    builds the task, training graphs, a gradient-limited Adam optimizer
    and the monitoring/early-stopping/checkpoint extensions. The caller
    invokes ``.run()`` on the returned loop.

    Parameters
    ----------
    gradient_limiter : str
        'clip' for ``StepClipping(1.)`` or 'compress' for ``Compressor()``;
        anything else raises ``ValueError``.
    hyperparameters : dict
        Mutated in place: ``'n_channels'`` is added and the
        ``location_std``/``scale_std`` entries become Theano shared
        variables so they can decay during training.

    Returns
    -------
    blocks.main_loop.MainLoop
    """
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        # Shared variable so ExponentialDecay can update it each batch.
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(
            util.ExponentialDecay(hyperparameters[key],
                                  hyperparameters["%s_decay" % key],
                                  after_batch=True))

    print "constructing graphs..."
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print "setting up main loop..."

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError()

    # Limit gradients first, then take Adam steps.
    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter, Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm,
                           task=task,
                           model=model,
                           graphs=graphs,
                           outputs=outputs,
                           updates=updates,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        # Early stopping on validation error with a patience window,
        # plus a hard epoch cap.
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False, every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log"),
        DumpGraph(name + "_grad_graph")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print "parameter sizes:"
    print tabulate(
        (key, "x".join(map(str, value.get_value().shape)),
         value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items())

    return main_loop
def run(batch_size, classifier, oldmodel, monitor_every, checkpoint_every,
        final_epoch, dataset, color_convert, image_size, net_depth,
        allowed, stretch):
    """Train a classifier on a custom dataset, optionally warm-starting
    from a previously saved main loop (``oldmodel``).
    """
    # Training / train-monitoring / valid-monitoring streams.
    all_streams = create_custom_streams(filename=dataset,
                                        training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=True,
                                        color_convert=color_convert,
                                        allowed=allowed,
                                        stretch=stretch)
    training_stream, train_mon_stream, valid_mon_stream = all_streams[:3]

    cg, bn_dropout_cg = create_training_computation_graphs(image_size,
                                                           net_depth)

    model = Model(bn_dropout_cg.outputs[0])

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m
                     in get_batch_normalization_updates(bn_dropout_cg)]

    # Prepare algorithm: plain Adam on the BN/dropout training graph.
    algorithm = GradientDescent(cost=bn_dropout_cg.outputs[0],
                                parameters=bn_dropout_cg.parameters,
                                step_rule=Adam())
    algorithm.add_updates(extra_updates)

    # Prepare monitoring: training cost once at the end of training...
    cost = bn_dropout_cg.outputs[0]
    cost.name = 'cost'
    train_monitoring = DataStreamMonitoring([cost],
                                            train_mon_stream,
                                            prefix="train",
                                            before_first_epoch=False,
                                            after_epoch=False,
                                            after_training=True,
                                            updates=extra_updates)

    # ...and validation cost/accuracy from the inference graph.
    cost, accuracy = cg.outputs
    cost.name = 'cost'
    accuracy.name = 'accuracy'
    valid_monitoring = DataStreamMonitoring([cost, accuracy],
                                            valid_mon_stream,
                                            prefix="valid",
                                            before_first_epoch=True,
                                            after_epoch=False,
                                            every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(classifier, every_n_epochs=checkpoint_every,
                            use_cpickle=True)

    main_loop = MainLoop(model=model,
                         data_stream=training_stream,
                         algorithm=algorithm,
                         extensions=[Timing(),
                                     FinishAfter(after_n_epochs=final_epoch),
                                     train_monitoring,
                                     valid_monitoring,
                                     checkpoint,
                                     Printing(),
                                     ProgressBar()])

    # Optionally copy parameters over from a previously saved main loop.
    if oldmodel is not None:
        print("Initializing parameters with old model {}".format(oldmodel))
        with open(oldmodel, 'rb') as src:
            saved_model = load(src)
            main_loop.model.set_parameter_values(
                saved_model.model.get_parameter_values())
            del saved_model

    main_loop.run()
def train(cli_params):
    """Train a ladder network as specified by the CLI parameters.

    Sets up logging, data, the model and the training algorithm, runs the
    main loop with approximate and final validation monitoring, and
    returns the training log as a ``DataFrame`` (or ``None`` if training
    was interrupted mid-epoch).
    """
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr.get_value()))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    # Compact channel aliases used by ShortPrinting.
    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': ladder.costs.denois.values(),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', ladder.costs.denois.values()),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', ladder.costs.denois.values()),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten,
                        cnorm=cnorm),
        model=Model(ladder.costs.total),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size, whiten=whiten,
                                cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + ladder.costs.denois.values(),
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size,
                                n_labeled=len(data.valid_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_final", after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                [ladder.costs.total, ladder.costs.class_corr,
                 training_algorithm.total_gradient_norm]
                + ladder.costs.denois.values(),
                prefix="train", after_epoch=True),

            SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(p.save_dir, after_training=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs,
                    after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = DataFrame.from_dict(main_loop.log, orient='index')
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
def main(mode, save_to, num_epochs, load_params=None,
         feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, stride=None, repeat_times=None,
         batch_size=None, num_batches=None, algo=None, test_set=None,
         valid_examples=None, dropout=None, max_norm=None, weight_decay=None,
         batch_norm=None):
    """Train (``mode == 'train'``) or evaluate (``mode == 'test'``) a
    LeNet-style convnet on the Dogs vs. Cats dataset.

    In train mode the function spawns a separate data-server subprocess
    and reads batches from it; in test mode it writes a ``id,label`` CSV
    of predictions to ``save_to`` and prints the accuracy.

    Parameters default to sensible values when ``None``; ``load_params``
    optionally warm-starts the model from a saved parameter file.
    """
    if feature_maps is None:
        feature_maps = [20, 50, 50]
    if mlp_hiddens is None:
        mlp_hiddens = [500]
    if conv_sizes is None:
        conv_sizes = [5, 5, 5]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2]
    if repeat_times is None:
        repeat_times = [1, 1, 1]
    if batch_size is None:
        batch_size = 500
    if valid_examples is None:
        valid_examples = 2500
    if stride is None:
        stride = 1
    if test_set is None:
        test_set = 'test'
    if algo is None:
        algo = 'rmsprop'
    if batch_norm is None:
        batch_norm = False

    image_size = (128, 128)
    output_size = 2

    # The per-layer option lists must all describe the same number of layers.
    if (len(feature_maps) != len(conv_sizes) or
            len(feature_maps) != len(pool_sizes) or
            len(feature_maps) != len(repeat_times)):
        raise ValueError("OMG, inconsistent arguments")

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    stride=stride,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    repeat_times=repeat_times,
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    batch_norm=batch_norm,
                    weights_init=Glorot(),
                    biases_init=Constant(0))
    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.initialize()
    logging.info("Input dim: {} {} {}".format(
        *convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(
                i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    # Batched variables for training; single-example ones for validation.
    single_x = tensor.tensor3('image_features')
    x = tensor.tensor4('image_features')
    single_y = tensor.lvector('targets')
    y = tensor.lmatrix('targets')

    # Training
    with batch_normalization(convnet):
        probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))

    cg = ComputationGraph([cost, error_rate])
    extra_updates = []

    if batch_norm:  # batch norm:
        logger.debug("Apply batch norm")
        pop_updates = get_batch_normalization_updates(cg)
        # p stands for population mean
        # m stands for minibatch
        alpha = 0.005
        extra_updates = [(p, m * alpha + p * (1 - alpha))
                         for p, m in pop_updates]
        population_statistics = [p for p, m in extra_updates]
    if dropout:
        relu_outputs = VariableFilter(bricks=[Rectifier],
                                      roles=[OUTPUT])(cg)
        cg = apply_dropout(cg, relu_outputs, dropout)
    cost, error_rate = cg.outputs
    if weight_decay:
        logger.debug("Apply weight decay {}".format(weight_decay))
        cost += weight_decay * l2_norm(cg.parameters)
        cost.name = 'cost'

    # Validation
    valid_probs = convnet.apply_5windows(single_x)
    valid_cost = (CategoricalCrossEntropy().apply(single_y, valid_probs)
                  .copy(name='cost'))
    valid_error_rate = (MisclassificationRate().apply(
        single_y, valid_probs).copy(name='error_rate'))

    model = Model([cost, error_rate])
    if load_params:
        logger.info("Loaded params from {}".format(load_params))
        with open(load_params, 'r') as src:
            model.set_parameter_values(load_parameters(src))

    # Training stream with random cropping
    train = DogsVsCats(("train",),
                       subset=slice(None, 25000 - valid_examples, None))
    train_str = DataStream(
        train, iteration_scheme=ShuffledScheme(train.num_examples,
                                               batch_size))
    train_str = add_transformers(train_str, random_crop=True)

    # Validation stream without cropping
    valid = DogsVsCats(("train",),
                       subset=slice(25000 - valid_examples, None, None))
    valid_str = DataStream(
        valid, iteration_scheme=SequentialExampleScheme(valid.num_examples))
    valid_str = add_transformers(valid_str)

    if mode == 'train':
        # Serve training batches from a separate process so data loading
        # does not block the training loop.
        directory, _ = os.path.split(sys.argv[0])
        env = dict(os.environ)
        env['THEANO_FLAGS'] = 'floatX=float32'
        port = numpy.random.randint(1025, 10000)
        server = subprocess.Popen(
            [directory + '/server.py',
             str(25000 - valid_examples),
             str(batch_size), str(port)],
            env=env, stderr=subprocess.STDOUT)
        train_str = ServerDataStream(
            ('image_features', 'targets'),
            produces_examples=False, port=port)

        save_to_base, save_to_extension = os.path.splitext(save_to)

        # Train with simple SGD
        if algo == 'rmsprop':
            step_rule = RMSProp(decay_rate=0.999, learning_rate=0.0003)
        elif algo == 'adam':
            step_rule = Adam()
        else:
            assert False
        if max_norm:
            # Clip weight norms per-unit (linear) / per-filter (conv).
            conv_params = VariableFilter(bricks=[Convolutional],
                                         roles=[WEIGHT])(cg)
            linear_params = VariableFilter(bricks=[Linear],
                                           roles=[WEIGHT])(cg)
            step_rule = CompositeRule(
                [step_rule,
                 Restrict(VariableClipping(max_norm, axis=0),
                          linear_params),
                 Restrict(VariableClipping(max_norm, axis=(1, 2, 3)),
                          conv_params)])

        algorithm = GradientDescent(
            cost=cost, parameters=model.parameters,
            step_rule=step_rule)
        algorithm.add_updates(extra_updates)
        # `Timing` extension reports time for reading data, aggregating a batch
        # and monitoring;
        # `ProgressBar` displays a nice progress bar during training.
        extensions = [Timing(every_n_batches=100),
                      FinishAfter(after_n_epochs=num_epochs,
                                  after_n_batches=num_batches),
                      DataStreamMonitoring(
                          [valid_cost, valid_error_rate],
                          valid_str,
                          prefix="valid"),
                      TrainingDataMonitoring(
                          [cost, error_rate,
                           aggregation.mean(algorithm.total_gradient_norm)],
                          prefix="train",
                          after_epoch=True),
                      TrackTheBest("valid_error_rate"),
                      Checkpoint(save_to, save_separately=['log'],
                                 parameters=cg.parameters +
                                 (population_statistics
                                  if batch_norm else []),
                                 before_training=True,
                                 after_epoch=True)
                      .add_condition(
                          ['after_epoch'],
                          OnLogRecord("valid_error_rate_best_so_far"),
                          (save_to_base + '_best' + save_to_extension,)),
                      Printing(every_n_batches=100)]

        model = Model(cost)
        main_loop = MainLoop(
            algorithm,
            train_str,
            model=model,
            extensions=extensions)
        try:
            main_loop.run()
        finally:
            # Always shut down the data server, even on error/interrupt.
            server.terminate()
    elif mode == 'test':
        classify = theano.function([single_x], valid_probs.argmax())
        test = DogsVsCats((test_set,))
        test_str = DataStream(
            test,
            iteration_scheme=SequentialExampleScheme(test.num_examples))
        test_str = add_transformers(test_str)
        correct = 0
        with open(save_to, 'w') as dst:
            print("id", "label", sep=',', file=dst)
            for index, example in enumerate(test_str.get_epoch_iterator()):
                image = example[0]
                prediction = classify(image)
                print(index + 1, classify(image), sep=',', file=dst)
                if len(example) > 1 and prediction == example[1]:
                    correct += 1
        print(correct / float(test.num_examples))
    else:
        assert False
def train(cli_params):
    """Train a ladder network with the parameters given on the command line.

    Sets up file logging, builds the model via ``setup_model``, wires a
    Blocks ``MainLoop`` with monitoring/saving extensions and runs it.

    Parameters
    ----------
    cli_params : dict
        Command-line parameter overrides. Mutated in place: ``'save_dir'``
        is added.

    Returns
    -------
    DataFrame or None
        The training log as a DataFrame, or ``None`` when there is no
        validation set or training was interrupted.
    """
    # Choose the directory name: reuse 'save_to' unless we are resuming
    # from a checkpoint with default arguments.
    fn = 'noname'
    if 'save_to' in nodefaultargs or not cli_params.get('load_from'):
        fn = cli_params['save_to']
    cli_params['save_dir'] = prepare_dir(fn)
    nodefaultargs.append('save_dir')

    # Log also DEBUG to a file
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training: every parameter reachable from the total cost is optimized.
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    # You can turn off BN by setting is_normalizing = False in ladder.py.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert not bn_updates or 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    if bn_updates:
        training_algorithm.add_updates(bn_updates)

    # Compact channel labels for ShortPrinting:
    # T_* = train, V_* = valid (approx), VF_* = valid (final pass).
    short_prints = {
        "train": OrderedDict([
            ('T_E', ladder.error.clean),
            ('T_O', ladder.oos.clean),
            ('T_C_class', ladder.costs.class_corr),
            ('T_C_de', ladder.costs.denois.values()),
            ('T_T', ladder.costs.total),
        ]),
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_O', ladder.oos.clean),
            ('V_C_de', ladder.costs.denois.values()),
            ('V_T', ladder.costs.total),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_O', ladder.oos.clean),
            ('VF_C_de', ladder.costs.denois.values()),
            # NOTE(review): label 'V_T' (not 'VF_T') is what the code uses
            # here; kept as-is.
            ('V_T', ladder.costs.total),
        ]),
    }

    if len(data.valid_ind):
        # A validation set exists: monitor both the cheap approximate and
        # the accurate final validation error in addition to training stats.
        main_loop = MainLoop(
            training_algorithm,
            # Datastream used for training
            make_datastream(data.train, data.train_ind,
                            p.batch_size,
                            n_labeled=p.labeled_samples,
                            n_unlabeled=p.unlabeled_samples,
                            whiten=whiten,
                            cnorm=cnorm,
                            balanced_classes=p.balanced_classes,
                            dseed=p.dseed),
            model=Model(ladder.costs.total),
            extensions=[
                FinishAfter(after_n_epochs=p.num_epochs),

                # This will estimate the validation error using
                # running average estimates of the batch normalization
                # parameters, mean and variance
                ApproxTestMonitoring(
                    [ladder.costs.class_clean, ladder.error.clean,
                     ladder.oos.clean, ladder.costs.total]
                    + ladder.costs.denois.values(),
                    make_datastream(data.valid, data.valid_ind,
                                    p.valid_batch_size,
                                    whiten=whiten, cnorm=cnorm,
                                    balanced_classes=p.balanced_classes,
                                    scheme=ShuffledScheme),
                    prefix="valid_approx"),

                # This Monitor is slower, but more accurate since it will
                # first estimate batch normalization parameters from
                # training data and then do another pass to calculate the
                # validation error.
                FinalTestMonitoring(
                    [ladder.costs.class_clean, ladder.error.clean,
                     ladder.oos.clean, ladder.costs.total]
                    + ladder.costs.denois.values(),
                    make_datastream(data.train, data.train_ind,
                                    p.batch_size,
                                    n_labeled=p.labeled_samples,
                                    whiten=whiten, cnorm=cnorm,
                                    balanced_classes=p.balanced_classes,
                                    scheme=ShuffledScheme),
                    make_datastream(data.valid, data.valid_ind,
                                    p.valid_batch_size,
                                    n_labeled=len(data.valid_ind),
                                    whiten=whiten, cnorm=cnorm,
                                    balanced_classes=p.balanced_classes,
                                    scheme=ShuffledScheme),
                    prefix="valid_final",
                    after_n_epochs=p.num_epochs,
                    after_training=True),

                TrainingDataMonitoring(
                    [ladder.error.clean, ladder.oos.clean,
                     ladder.costs.total, ladder.costs.class_corr,
                     training_algorithm.total_gradient_norm]
                    + ladder.costs.denois.values(),
                    prefix="train", after_epoch=True),

                # Save model whenever we have the best validation result;
                # another option: ('train', ladder.costs.total).
                SaveParams(('valid_approx', ladder.error.clean),
                           all_params, p.save_dir, after_epoch=True),
                SaveExpParams(p, p.save_dir, before_training=True),
                SaveLog(p.save_dir, after_training=True),
                ShortPrinting(short_prints),
                LRDecay(ladder.lr, p.num_epochs * p.lrate_decay,
                        p.num_epochs, lrmin=p.lrmin,
                        after_epoch=True),
            ])
    else:
        # No validation set: only monitor training channels and track the
        # best *training* error for checkpointing.
        main_loop = MainLoop(
            training_algorithm,
            # Datastream used for training
            make_datastream(data.train, data.train_ind,
                            p.batch_size,
                            n_labeled=p.labeled_samples,
                            n_unlabeled=p.unlabeled_samples,
                            whiten=whiten,
                            cnorm=cnorm,
                            balanced_classes=p.balanced_classes,
                            dseed=p.dseed),
            model=Model(ladder.costs.total),
            extensions=[
                FinishAfter(after_n_epochs=p.num_epochs),
                TrainingDataMonitoring(
                    [ladder.error.clean, ladder.oos.clean,
                     ladder.costs.total, ladder.costs.class_corr,
                     training_algorithm.total_gradient_norm]
                    + ladder.costs.denois.values(),
                    prefix="train", after_epoch=True),
                # Save model whenever we have the best training result;
                # another option: ('train', ladder.costs.total).
                SaveParams(('train', ladder.error.clean), all_params,
                           p.save_dir, after_epoch=True),
                SaveExpParams(p, p.save_dir, before_training=True),
                SaveLog(p.save_dir, after_training=True),
                ShortPrinting(short_prints),
                LRDecay(ladder.lr, p.num_epochs * p.lrate_decay,
                        p.num_epochs, lrmin=p.lrmin,
                        after_epoch=True),
            ])

    main_loop.run()

    # Get results: pull the final validation error out of the training log.
    if len(data.valid_ind) == 0:
        return None
    df = DataFrame.from_dict(main_loop.log, orient='index')
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))
    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
def run(model_name, port_train, port_valid): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = (100, 100) if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('/tmp/train_bn2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions, model=model) main_loop.run()
def main(name, epochs, batch_size, learning_rate, window_size, conv_sizes,
         num_filters, fc_dim, enc_dim, dec_dim, step, num_digits,
         num_classes, oldmodel, live_plotting):
    """Build and train an EDRAM attention model on cluttered MNIST.

    Constructs the glimpse convnet + LSTM encoder/decoder bricks, builds
    separate train (batch-norm) and test computation graphs, attaches
    population-statistics updates as an exponential moving average, and
    runs a Blocks ``MainLoop``. Optionally warm-starts all parameters and
    batch-norm population statistics from a pickled old model.
    """
    channels, img_height, img_width = 1, 100, 100

    # Initialization schemes for the different brick families.
    rnninits = {
        'weights_init': Uniform(width=0.02),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    rec_inits = {
        'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    convinits = {
        'weights_init': Uniform(width=.2),
        'biases_init': Constant(0.),
    }
    # One attention step sequence: `step` glimpses per digit.
    n_iter = step * num_digits
    filter_size1, filter_size2 = zip(conv_sizes, conv_sizes)[:]
    # window_size is a string "H,W".
    w_height, w_width = window_size.split(',')
    w_height = int(w_height)
    w_width = int(w_width)

    # Per-run output directory, e.g. "2017-01-31-<name>".
    subdir = time.strftime("%Y-%m-%d") + "-" + name
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    lines = ["\n Running experiment",
             " subdirectory: %s" % subdir,
             " learning rate: %g" % learning_rate,
             " attention size: %s" % window_size,
             " n_iterations: %d" % n_iter,
             " encoder dimension: %d" % enc_dim,
             " decoder dimension: %d" % dec_dim,
             " batch size: %d" % batch_size,
             " epochs: %d" % epochs,
             ]
    for line in lines:
        print(line)
    print()

    # ------------------------------------------------------------------
    # Glimpse network: six conv layers with spatial batch norm, plus the
    # fully-connected merge of glimpse features and location features.
    # NOTE(review): the hard-coded input_dim tuples (64/128/160/192
    # channels, 26x26 down to 4x4 maps) presumably assume num_filters=128
    # and a 26x26 glimpse — confirm before changing those CLI defaults.
    rectifier = Rectifier()
    conv1 = Convolutional(filter_size=filter_size2,
                          num_filters=int(num_filters / 2),
                          num_channels=channels,
                          image_size=(w_height, w_width),
                          border_mode='half',
                          name='conv1', **convinits)
    conv1_bn = SpatialBatchNormalization(input_dim=(64, 26, 26),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv1_bn')
    conv2 = Convolutional(filter_size=filter_size2,
                          num_channels=int(num_filters / 2),
                          num_filters=int(num_filters / 2),
                          image_size=(26, 26),
                          name='conv2', **convinits)
    conv2_bn = SpatialBatchNormalization(input_dim=(64, 24, 24),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv2_bn')
    max_pooling = MaxPooling(pooling_size=(2, 2), step=(2, 2))
    conv3 = Convolutional(filter_size=filter_size2,
                          num_filters=num_filters,
                          num_channels=int(num_filters / 2),
                          image_size=(12, 12),
                          border_mode='half',
                          name='conv3', **convinits)
    conv3_bn = SpatialBatchNormalization(input_dim=(128, 12, 12),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv3_bn')
    conv4 = Convolutional(filter_size=filter_size2,
                          num_filters=num_filters,
                          num_channels=num_filters,
                          image_size=(12, 12),
                          border_mode='half',
                          name='conv4', **convinits)
    conv4_bn = SpatialBatchNormalization(input_dim=(128, 12, 12),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv4_bn')
    # Max Pooling
    conv5 = Convolutional(filter_size=filter_size2,
                          num_filters=160,
                          num_channels=num_filters,
                          image_size=(6, 6),
                          border_mode='half',
                          name='conv5', **convinits)
    conv5_bn = SpatialBatchNormalization(input_dim=(160, 6, 6),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv5_bn')
    conv6 = Convolutional(filter_size=filter_size2,
                          num_filters=192,
                          num_channels=160,
                          image_size=(6, 6),
                          name='conv6', **convinits)
    conv6_bn = SpatialBatchNormalization(input_dim=(192, 4, 4),
                                         conserve_memory=False,
                                         n_iter=n_iter, name='conv6_bn')
    conv_mlp = MLP(activations=[Identity()], dims=[3072, fc_dim],
                   name="MLP_conv", **inits)
    conv_mlp_bn = BatchNormalization(input_dim=fc_dim,
                                     conserve_memory=False,
                                     n_iter=n_iter, name='conv_mlp_bn')
    # Location pathway: 6-dim affine "where" vector -> fc_dim.
    loc_mlp = MLP(activations=[Identity()], dims=[6, fc_dim],
                  name="MLP_loc", **inits)
    loc_mlp_bn = BatchNormalization(input_dim=fc_dim,
                                    conserve_memory=False,
                                    n_iter=n_iter, name='loc_mlp_bn')

    # Recurrent core: MLPs feed 4*dim gate pre-activations into the LSTMs.
    encoder_mlp = MLP([Identity()], [fc_dim, 4 * enc_dim],
                      name="MLP_enc", **rec_inits)
    decoder_mlp = MLP([Identity()], [enc_dim, 4 * dec_dim],
                      name="MLP_dec", **rec_inits)
    encoder_rnn = LSTM(activation=Tanh(), dim=enc_dim,
                       name="RNN_enc", **rnninits)

    # Small convnet that initializes the recurrent state from a coarse
    # (12x12) view of the image.
    conv_init = ConvolutionalSequence(
        [Convolutional(filter_size=filter_size1,
                       num_filters=int(num_filters / 8),
                       name='conv1_init'),
         SpatialBatchNormalization(conserve_memory=False,
                                   name='conv1_bn_init'),
         Convolutional(filter_size=filter_size2,
                       num_filters=int(num_filters / 8),
                       name='conv2_init'),
         SpatialBatchNormalization(conserve_memory=False,
                                   name='conv2_bn_init'),
         Convolutional(filter_size=filter_size2,
                       num_filters=int(num_filters / 4),
                       name='conv3_init'),
         SpatialBatchNormalization(conserve_memory=False,
                                   name='conv3_bn_init'),
         ],
        image_size=(12, 12), num_channels=channels,
        name='conv_seq_init', **convinits)
    decoder_rnn = LSTM(activation=Tanh(), dim=dec_dim,
                       name="RNN_dec", **rnninits)
    # Emission MLP outputs the 6 affine attention parameters; biases start
    # at the identity transform (1, 0, 0, 0, 1, 0).
    emit_mlp = MLP(activations=[Tanh()], dims=[dec_dim, 6],
                   name='emit_mlp',
                   weights_init=Constant(0.),
                   biases_init=Constant((1., 0., 0., 0., 1., 0.)))

    # Classification head (three MLPs with batch norm between them).
    classification_mlp1 = MLP(activations=[Identity()],
                              dims=[enc_dim, fc_dim],
                              name='MPL_class1', **inits)
    classification_mlp1_bn = BatchNormalization(
        input_dim=fc_dim, conserve_memory=False, n_iter=n_iter,
        name='classification_mlp1_bn')
    classification_mlp2 = MLP(activations=[Identity()],
                              dims=[fc_dim, fc_dim],
                              name='MPL_class2', **inits)
    classification_mlp2_bn = BatchNormalization(
        input_dim=fc_dim, conserve_memory=False, n_iter=n_iter,
        name='classification_mlp2_bn')
    classification_mlp3 = MLP(activations=[Softmax()],
                              dims=[fc_dim, num_classes],
                              name='MPL_class3', **inits)

    # Assemble the full EDRAM brick from all sub-bricks.
    edram = EDRAM(channels=channels, out_height=w_height, out_width=w_width,
                  n_iter=n_iter, num_classes=num_classes,
                  rectifier=rectifier,
                  conv1=conv1, conv1_bn=conv1_bn,
                  conv2=conv2, conv2_bn=conv2_bn,
                  max_pooling=max_pooling,
                  conv3=conv3, conv3_bn=conv3_bn,
                  conv4=conv4, conv4_bn=conv4_bn,
                  conv5=conv5, conv5_bn=conv5_bn,
                  conv6=conv6, conv6_bn=conv6_bn,
                  conv_mlp=conv_mlp, conv_mlp_bn=conv_mlp_bn,
                  loc_mlp=loc_mlp, loc_mlp_bn=loc_mlp_bn,
                  conv_init=conv_init,
                  encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn,
                  decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn,
                  classification_mlp1=classification_mlp1,
                  classification_mlp1_bn=classification_mlp1_bn,
                  classification_mlp2=classification_mlp2,
                  classification_mlp2_bn=classification_mlp2_bn,
                  classification_mlp3=classification_mlp3,
                  emit_mlp=emit_mlp)
    edram.initialize()

    # ------------------------------------------------------------------
    # Symbolic inputs: full image, coarse image, labels, true locations.
    x = T.ftensor4('features')
    x_coarse = T.ftensor4('features_coarse')
    y = T.ivector('labels')
    wr = T.fmatrix('locations')

    # Training graph with batch normalization active; also returns the
    # per-layer batch means/stdevs used below for population updates.
    with batch_normalization(edram):
        bn_p, bn_l, m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn, m_c3_bn, s_c3_bn, \
            m_c4_bn, s_c4_bn, m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn, \
            m_c_bn, s_c_bn, m_l_bn, s_l_bn, m_cl1_bn, s_cl1_bn, \
            m_cl2_bn, s_cl2_bn = edram.calculate_train(x, x_coarse)

    def compute_cost(p, wr, y, l):
        # Location loss: weighted squared error on the 6 affine params
        # (off-diagonal terms weighted 0.5); classification loss: clipped
        # categorical cross-entropy at every glimpse iteration.
        cost_where = T.dot(T.sqr(wr - l), [1, 0.5, 1, 0.5, 1, 1])
        cost_y = T.stack(
            [T.nnet.categorical_crossentropy(
                T.maximum(p[i, :], 1e-7), y) for i in range(0, n_iter)])
        return cost_where, cost_y

    # Combined cost over all iterations, averaged over the batch.
    cost_where, cost_y = compute_cost(bn_p, wr, y, bn_l)
    bn_cost = cost_y + cost_where
    bn_cost = bn_cost.sum(axis=0)
    bn_cost = bn_cost.mean()
    bn_cost.name = 'cost'

    # Error rate uses the prediction at the last glimpse only.
    bn_error_rate = MisclassificationRate().apply(y, bn_p[-1])
    bn_error_rate.name = 'error_rate'

    # ------------------------------------------------------------------
    bn_cg = ComputationGraph([bn_cost, bn_error_rate])

    # Prepare algorithm
    algorithm = GradientDescent(
        cost=bn_cg.outputs[0],
        on_unused_sources='ignore',
        parameters=bn_cg.parameters,
        step_rule=CompositeRule([
            RemoveNotFinite(),
            StepClipping(10.),
            Adam(learning_rate)
        ])
    )

    # Population statistics updates: combine those discovered in the graph
    # with the per-brick batch statistics returned by calculate_train.
    pop_updates = get_batch_normalization_updates(bn_cg)
    update_params = [conv1_bn.population_mean, conv1_bn.population_stdev,
                     conv2_bn.population_mean, conv2_bn.population_stdev,
                     conv3_bn.population_mean, conv3_bn.population_stdev,
                     conv4_bn.population_mean, conv4_bn.population_stdev,
                     conv5_bn.population_mean, conv5_bn.population_stdev,
                     conv6_bn.population_mean, conv6_bn.population_stdev,
                     conv_mlp_bn.population_mean,
                     conv_mlp_bn.population_stdev,
                     loc_mlp_bn.population_mean,
                     loc_mlp_bn.population_stdev,
                     classification_mlp1_bn.population_mean,
                     classification_mlp1_bn.population_stdev,
                     classification_mlp2_bn.population_mean,
                     classification_mlp2_bn.population_stdev]
    update_values = [m_c1_bn, s_c1_bn, m_c2_bn, s_c2_bn,
                     m_c3_bn, s_c3_bn, m_c4_bn, s_c4_bn,
                     m_c5_bn, s_c5_bn, m_c6_bn, s_c6_bn,
                     m_c_bn, s_c_bn, m_l_bn, s_l_bn,
                     m_cl1_bn, s_cl1_bn, m_cl2_bn, s_cl2_bn]
    pop_updates.extend([(p, m) for p, m in zip(update_params, update_values)])

    # Exponential moving average of the population statistics.
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]
    algorithm.add_updates(extra_updates)

    # ------------------------------------------------------------------
    # Setup monitors: a second, inference-mode graph (population BN stats).
    p, l = edram.calculate_test(x, x_coarse)
    cost_where, cost_y = compute_cost(p, wr, y, l)
    cost = cost_y + cost_where
    cost = cost.sum(axis=0)
    cost = cost.mean()
    cost.name = 'cost'

    error_rate = MisclassificationRate().apply(y, p[-1])
    error_rate.name = 'error_rate'
    monitors = [cost, error_rate]

    plotting_extensions = []
    # Live plotting...
    if live_plotting:
        plot_channels = [
            ['train_cost', 'test_cost'],
            ['train_error_rate', 'test_error_rate'],
        ]
        plotting_extensions = [
            Plot(subdir, channels=plot_channels,
                 server_url='http://155.69.150.60:80/')
        ]

    # ------------------------------------------------------------------
    mnist_cluttered_train = MNISTCluttered(
        which_sets=['train'], sources=('features', 'locations', 'labels'))
    mnist_cluttered_test = MNISTCluttered(
        which_sets=['test'], sources=('features', 'locations', 'labels'))

    main_loop = MainLoop(
        model=Model([bn_cost]),
        data_stream=DataStream.default_stream(
            mnist_cluttered_train,
            iteration_scheme=ShuffledScheme(
                mnist_cluttered_train.num_examples, batch_size)),
        algorithm=algorithm,
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=epochs),
                    # Monitor both splits with the inference-mode graph.
                    DataStreamMonitoring(
                        monitors,
                        DataStream.default_stream(
                            mnist_cluttered_train,
                            iteration_scheme=SequentialScheme(
                                mnist_cluttered_train.num_examples,
                                batch_size)),
                        prefix='train'),
                    DataStreamMonitoring(
                        monitors,
                        DataStream.default_stream(
                            mnist_cluttered_test,
                            iteration_scheme=SequentialScheme(
                                mnist_cluttered_test.num_examples,
                                batch_size)),
                        prefix="test"),
                    PartsOnlyCheckpoint("{}/{}".format(subdir, name),
                                        before_training=False,
                                        after_epoch=True,
                                        save_separately=['log', ]),
                    TrackTheBest('test_error_rate',
                                 'best_test_error_rate'),
                    # NOTE(review): 'BestCheckpount' is the project class's
                    # actual (misspelled) name — do not "fix" without
                    # renaming the class itself.
                    BestCheckpount("{}/{}".format(subdir, name),
                                   'best_test_error_rate',
                                   save_separately=['model', ]),
                    Printing(),
                    ProgressBar(),
                    PrintingTo("\n".join(lines),
                               "{}/{}_log.txt".format(subdir, name)),
                    ] + plotting_extensions)

    if oldmodel is not None:
        # Warm start: copy learned parameters and every batch-norm
        # population statistic from a previously pickled main loop.
        print("Initializing parameters with old model %s" % oldmodel)
        with open(oldmodel, "rb") as f:
            oldmodel = pickle.load(f)
            main_loop.model.set_parameter_values(
                oldmodel.get_parameter_values())
            main_loop.model.get_top_bricks()[0].conv1_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv1_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv1_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv2_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv2_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv2_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv3_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv3_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv3_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv4_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv4_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv4_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv5_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv5_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv5_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv6_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv6_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv6_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].loc_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].loc_mlp_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_mean.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_mlp_bn.population_stdev.set_value(oldmodel.get_top_bricks()[0].conv_mlp_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_mean.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].classification_mlp1_bn.population_stdev.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp1_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_mean.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_mean.get_value())
            main_loop.model.get_top_bricks()[0].classification_mlp2_bn.population_stdev.set_value(
                oldmodel.get_top_bricks()[0].classification_mlp2_bn.population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_mean.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[1].population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[1].population_stdev.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[1].population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_mean.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[3].population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[3].population_stdev.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[3].population_stdev.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_mean.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[5].population_mean.get_value())
            main_loop.model.get_top_bricks()[0].conv_init.layers[5].population_stdev.set_value(
                oldmodel.get_top_bricks()[0].conv_init.layers[5].population_stdev.get_value())
        del oldmodel

    main_loop.run()
def train(cli_params):
    """Train a ladder network and report results via a sentinel file.

    Variant of the ladder training entry point that always expects a
    validation set and writes a sentinel file for an external experiment
    automator when training finishes.

    Parameters
    ----------
    cli_params : dict
        Command-line parameters; mutated in place ('save_dir' is added).

    Returns
    -------
    DataFrame or None
        Training log as a DataFrame, or ``None`` if interrupted.
    """
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])

    # Log also DEBUG to a file
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'

    # NOTE(review): this older variant uses the `params=` keyword of
    # GradientDescent (pre-rename Blocks API); do not change to
    # `parameters=` without checking the pinned Blocks version.
    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    model = Model(ladder.costs.total)

    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean,
        ladder.error,
        # training_algorithm.total_gradient_norm,
        ladder.costs.total]
    # + ladder.costs.denois.values()

    # Make a global history recorder so that we can get summary at end of
    # training when we write to Sentinel.
    # global_history records all relevant monitoring vars,
    # updated by SaveLog every time.
    global_history = {}

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten, cnorm=cnorm),
        model=model,
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # write out to sentinel file for experiment automator to work
            SentinelWhenFinish(save_dir=p.save_dir,
                               global_history=global_history),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(monitored_variables,
                                 make_datastream(data.valid, data.valid_ind,
                                                 p.valid_batch_size,
                                                 whiten=whiten, cnorm=cnorm,
                                                 scheme=ShuffledScheme),
                                 prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(monitored_variables,
                                make_datastream(data.train, data.train_ind,
                                                p.batch_size,
                                                n_labeled=p.labeled_samples,
                                                whiten=whiten, cnorm=cnorm,
                                                scheme=ShuffledScheme),
                                make_datastream(data.valid, data.valid_ind,
                                                p.valid_batch_size,
                                                n_labeled=len(data.valid_ind),
                                                whiten=whiten, cnorm=cnorm,
                                                scheme=ShuffledScheme),
                                prefix="valid_final",
                                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(variables=monitored_variables,
                                   prefix="train", after_epoch=True),

            # Checkpoint on the best corrupted-path classification cost.
            SaveParams('valid_approx_cost_class_corr', model, p.save_dir),
            # SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(save_dir=p.save_dir, after_epoch=True,
                    global_history=global_history),
            Printing(),
            # ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay,
                    p.num_epochs, after_epoch=True),
        ])
    main_loop.run()

    # Get results: final validation error from the log.
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_rate'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
def run(batch_size, save_path, z_dim, oldmodel, discriminative_regularization,
        classifier, vintage, monitor_every, monitor_before, checkpoint_every,
        dataset, color_convert, image_size, net_depth, subdir,
        reconstruction_factor, kl_factor, discriminative_factor, disc_weights,
        num_epochs):
    """Train a VAE with optional discriminative regularization.

    Builds train/monitor streams (custom dataset or CelebA), creates the
    batch-norm and inference computation graphs, and runs a Blocks
    ``MainLoop`` with monitoring, checkpointing and sample generation.
    Optionally initializes parameters from a previously saved model.
    """
    if dataset:
        streams = create_custom_streams(filename=dataset,
                                        training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False,
                                        color_convert=color_convert)
    else:
        streams = create_celeba_streams(training_batch_size=batch_size,
                                        monitoring_batch_size=batch_size,
                                        include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(
        z_dim, image_size, net_depth, discriminative_regularization,
        classifier, vintage, reconstruction_factor, kl_factor,
        discriminative_factor, disc_weights)
    cg, bn_cg, variance_parameters = rval

    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    # Only optimize the encoder/decoder bricks (plus the learned output
    # variances) — not, e.g., the frozen classifier used for the
    # discriminative terms.
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring
    sys.setrecursionlimit(1000000)
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        # Graph outputs: cost, KL term, reconstruction term,
        # discriminative term, then one term per discriminative layer.
        # cost, kl_term, reconstruction_term, discriminative_term = graph.outputs
        cost, kl_term, reconstruction_term, discriminative_term = \
            graph.outputs[:4]
        discriminative_layer_terms = graph.outputs[4:]

        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        avg_discriminative_term = discriminative_term.mean(axis=0)
        avg_discriminative_term.name = 'avg_discriminative_term'

        num_layer_terms = len(discriminative_layer_terms)
        avg_discriminative_layer_terms = [None] * num_layer_terms
        for i, term in enumerate(discriminative_layer_terms):
            avg_discriminative_layer_terms[i] = \
                discriminative_layer_terms[i].mean(axis=0)
            avg_discriminative_layer_terms[i].name = \
                "avg_discriminative_term_layer_{:02d}".format(i)

        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term,
             avg_discriminative_term] + avg_discriminative_layer_terms)

    # Train monitoring uses the batch-norm graph (index 0) and also applies
    # the population-statistics updates while monitoring.
    train_monitoring = DataStreamMonitoring(
        monitored_quantities_list[0], train_monitor_stream, prefix="train",
        updates=extra_updates, after_epoch=False,
        before_first_epoch=monitor_before, every_n_epochs=monitor_every)
    valid_monitoring = DataStreamMonitoring(
        monitored_quantities_list[1], valid_monitor_stream, prefix="valid",
        after_epoch=False, before_first_epoch=monitor_before,
        every_n_epochs=monitor_every)

    # Prepare checkpoint
    checkpoint = Checkpoint(save_path, every_n_epochs=checkpoint_every,
                            before_training=True, use_cpickle=True)

    sample_checkpoint = SampleCheckpoint(interface=DiscGenModel,
                                         z_dim=z_dim/2,
                                         image_size=(image_size, image_size),
                                         channels=3,
                                         dataset=dataset,
                                         split="valid",
                                         save_subdir=subdir,
                                         before_training=True,
                                         after_epoch=True)
    # TODO: why does z_dim=foo become foo/2?
    extensions = [Timing(),
                  FinishAfter(after_n_epochs=num_epochs),
                  checkpoint,
                  sample_checkpoint,
                  train_monitoring,
                  valid_monitoring,
                  Printing(),
                  ProgressBar()]
    main_loop = MainLoop(model=model, data_stream=main_loop_stream,
                         algorithm=algorithm, extensions=extensions)

    if oldmodel is not None:
        # Warm-start from a previously pickled main loop; the fallback
        # handles a Blocks version where load() takes a file object.
        print("Initializing parameters with old model {}".format(oldmodel))
        try:
            saved_model = load(oldmodel)
        except AttributeError:
            # newer version of blocks
            with open(oldmodel, 'rb') as src:
                saved_model = load(src)
        main_loop.model.set_parameter_values(
            saved_model.model.get_parameter_values())
        del saved_model

    main_loop.run()
def train(ladder, batch_size=100, num_train_examples=60000, num_epochs=150, lrate_decay=0.67): # Setting Logger timestr = time.strftime("%Y_%m_%d_at_%H_%M") save_path = 'results/mnist_100_standard_' + timestr log_path = os.path.join(save_path, 'log.txt') os.makedirs(save_path) fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) # Training model = Model(ladder.costs.total) all_params = model.parameters print len(all_params) print all_params training_algorithm = GradientDescent( cost=ladder.costs.total, parameters=all_params, step_rule=Adam(learning_rate=ladder.lr)) # Fetch all batch normalization updates. They are in the clean path. # In addition to actual training, also do BN variable approximations bn_updates = ComputationGraph([ladder.costs.class_clean]).updates training_algorithm.add_updates(bn_updates) monitored_variables = [ ladder.costs.class_corr, ladder.costs.class_clean, ladder.error, training_algorithm.total_gradient_norm, ladder.costs.total ] + ladder.costs.denois.values() train_data_stream, test_data_stream = get_mixed_streams(batch_size) train_monitoring = TrainingDataMonitoring(variables=monitored_variables, prefix="train", after_epoch=True) valid_monitoring = DataStreamMonitoring(variables=monitored_variables, data_stream=test_data_stream, prefix="test", after_epoch=True) main_loop = MainLoop(algorithm=training_algorithm, data_stream=train_data_stream, model=model, extensions=[ train_monitoring, valid_monitoring, FinishAfter(after_n_epochs=num_epochs), SaveParams('test_CE_corr', model, save_path), SaveLog(save_path, after_epoch=True), LRDecay(lr=ladder.lr, decay_first=num_epochs * lrate_decay, decay_last=num_epochs, after_epoch=True), Printing() ]) main_loop.run()
def train_model(cost, cross_entropy, updates, train_stream, valid_stream,
                args, gate_values=None):
    """Train an RNN language model with optional weight noise and gate
    visualization.

    Parameters
    ----------
    cost : theano variable
        Training cost; may be replaced by a noisy version when
        ``args.weight_noise > 0``.
    cross_entropy : theano variable
        Validation cross-entropy channel.
    updates : list of (variable, expression)
        Recurrent-state updates, applied by the algorithm and reset by
        ``ResetStates``.
    train_stream, valid_stream : fuel streams
        Training and validation data streams.
    args : namespace
        Parsed command-line arguments.
    gate_values : dict, optional
        Gate activations for the visualization extensions.
    """
    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE: rebuild the graph with noise applied to the weights.
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
        cost.name = "cost_with_weight_noise"
        cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # NOTE(review): `params=` is the pre-rename Blocks keyword (newer
    # versions use `parameters=`); keep in sync with the pinned version.
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                params=cg.parameters)
    algorithm.add_updates(updates)

    # extensions to be added
    extensions = []
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    outputs = [
        variable for variable in cg.variables if variable.name == "presoft"]

    if args.generate:
        extensions.append(TextGenerationExtension(
            outputs=outputs,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=args.monitoring_freq,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode))

    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq,
                               after_epoch=True),
        DataStreamMonitoring([cost, cross_entropy],
                             valid_stream, args.mini_batch_size_valid,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=not(args.visualize_gates),
                             every_n_batches=args.monitoring_freq),
        # Reset recurrent states periodically so sequences don't leak
        # across monitoring boundaries.
        ResetStates([v for v, _ in updates], every_n_batches=100),
        ProgressBar()])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        else:
            raise Exception('Directory already exists')

    early_stopping = EarlyStopping('valid_cross_entropy',
                                   args.patience, args.save_path,
                                   every_n_batches=args.monitoring_freq)

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())
    if args.visualize_gates and (gate_values is not None):
        if args.rnn_type == "lstm":
            extensions.append(VisualizeGateLSTM(gate_values, updates,
                                                args.dataset,
                                                ploting_path=None))
        elif args.rnn_type == "soft":
            extensions.append(VisualizeGateSoft(gate_values, updates,
                                                args.dataset,
                                                ploting_path=None))
        else:
            assert(False)

    extensions.append(early_stopping)
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
    )
    main_loop.run()
def train(ladder, batch_size=100, labeled_samples=100,
          unlabeled_samples=50000, valid_set_size=10000,
          num_epochs=150, valid_batch_size=100, lrate_decay=0.67,
          save_path='results/mnist_100_full0'):
    """Train a ladder network on semi-supervised MNIST.

    Parameters
    ----------
    ladder : object
        Model holder exposing `costs` (total, class_corr, class_clean,
        denois dict), `error` and the learning rate shared variable `lr`.
    batch_size, valid_batch_size : int
        Training / validation minibatch sizes.
    labeled_samples, unlabeled_samples : int
        Sizes of the labeled and unlabeled training portions.
    valid_set_size : int
        Number of examples held out for validation.
    num_epochs : int
        Training length; also drives the LR decay schedule.
    lrate_decay : float
        Fraction of num_epochs after which LR decay begins.
    save_path : str
        Directory for log file and saved parameters.
    """
    # Setting Logger: mirror DEBUG output into a file under save_path.
    log_path = os.path.join(save_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % log_path)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))

    # Fetch all batch normalization updates. They are in the clean path.
    # In addition to actual training, also do BN variable approximations
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    training_algorithm.add_updates(bn_updates)

    # list(...) so the concatenation also works on Python 3, where
    # dict.values() returns a view (matches the other train() variant).
    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean,
        ladder.error, training_algorithm.total_gradient_norm,
        ladder.costs.total] + list(ladder.costs.denois.values())

    data = get_mnist_data_dict(unlabeled_samples=unlabeled_samples,
                               valid_set_size=valid_set_size)

    train_data_stream = make_datastream(
        data.train, data.train_ind, batch_size,
        n_labeled=labeled_samples,
        n_unlabeled=unlabeled_samples)
    valid_data_stream = make_datastream(
        data.valid, data.valid_ind, valid_batch_size,
        n_labeled=len(data.valid_ind),
        n_unlabeled=len(data.valid_ind))

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables, prefix="train", after_epoch=True)
    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables, data_stream=valid_data_stream,
        prefix="valid", after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=Model(ladder.costs.total),
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams(None, all_params, save_path, after_epoch=True),
            SaveLog(save_path, after_training=True),
            # Linear LR decay from num_epochs*lrate_decay to num_epochs.
            LRDecay(lr=ladder.lr,
                    decay_first=num_epochs * lrate_decay,
                    decay_last=num_epochs,
                    after_epoch=True),
            Printing()])
    main_loop.run()
batch_size=m))) # uwaga: dyskryminator bierze 2m probek, pierwsze m to generowane, kolejne m to z danych # observables.append(cost_discriminator) generator_cost = theano.shared(value=np.array(0., dtype=np.float32), name='g_cost') discriminator_cost = theano.shared(value=np.array(0., dtype=np.float32), name='d_cost') generator_step_norm = theano.shared(value=np.array(0., dtype=np.float32), name='g_step_norm') generator_grad_norm = theano.shared(value=np.array(0., dtype=np.float32), name='g_grad_norm') discriminator_step_norm = theano.shared(value=np.array(0., dtype=np.float32), name='d_step_norm') discriminator_grad_norm = theano.shared(value=np.array(0., dtype=np.float32), name='d_grad_norm') discriminator_descent.add_updates([ (discriminator_cost, ComputationGraph(cost_discriminator).outputs[0]), (discriminator_step_norm, discriminator_descent.total_step_norm), (discriminator_grad_norm, discriminator_descent.total_gradient_norm)]) generator_descent.add_updates([ (generator_cost, ComputationGraph(cost_generator).outputs[0]), (generator_step_norm, generator_descent.total_step_norm), (generator_grad_norm, generator_descent.total_gradient_norm)]) observables = [] observables.append(generator_cost) observables.append(discriminator_cost) observables.append(generator_step_norm) observables.append(generator_grad_norm) observables.append(discriminator_step_norm) observables.append(discriminator_grad_norm)
# Build datastream train_stream = datastream.setup_datastream(config.dataset, config.num_seqs, config.seq_len, config.seq_div_size) # Build model m = config.Model(config) # Train the model cg = Model(m.sgd_cost) algorithm = GradientDescent(cost=m.sgd_cost, step_rule=config.step_rule, parameters=cg.parameters) algorithm.add_updates(m.states) monitor_vars = list(set(v for p in m.monitor_vars for v in p)) extensions = [ ProgressBar(), TrainingDataMonitoring( monitor_vars, prefix='train', every_n_batches=config.monitor_freq), Printing(every_n_batches=config.monitor_freq, after_epoch=False), ResetStates([v for v, _ in m.states], after_epoch=True) ] if plot_avail: plot_channels = [['train_' + v.name for v in p] for p in m.monitor_vars] extensions.append( Plot(document='text_'+model_name,
def train_model(cost, unregularized_cost, updates,
                train_stream, valid_stream, args, gate_values=None):
    """Build and run the training loop for the RNN model.

    Parameters
    ----------
    cost : TheanoVariable
        Regularized training cost (weight noise may be added below).
    unregularized_cost : TheanoVariable
        Cost without regularization; its name drives early stopping
        ('valid_' + name).
    updates : list of (shared_variable, expression)
        Updates that carry the recurrent hidden state between batches.
    train_stream, valid_stream : fuel streams
        Training and validation data streams.
    args : namespace
        Command-line options (presumably argparse).
    gate_values : optional
        Unused here; kept for interface parity with sibling variants.
    """
    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE: corrupt WEIGHT-role variables and rebuild the graph.
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
        cost.name = "cost_with_weight_noise"
        cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # Define algorithm
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                parameters=cg.parameters)
    # Add the updates to carry the hidden state
    algorithm.add_updates(updates)

    # Extensions to be added
    extensions = []

    # Load from a dumped model
    if args.load_path is not None:
        if args.fine_tuning:
            # Fine-tuning rewrites the cost instead of loading state.
            cost = fine_tuning(cost, args)
        else:
            extensions.append(Load(args.load_path))

    # Generation extension
    if args.generate:
        extensions.append(
            TextGenerationExtension(
                cost=cost,
                generation_length=args.generated_text_lenght,
                initial_text_length=args.initial_text_length,
                every_n_batches=1,
                ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
                softmax_sampling=args.softmax_sampling,
                dataset=args.dataset,
                updates=updates,
                interactive_mode=args.interactive_mode))

    # Training and Validation score monitoring
    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq),
        DataStreamMonitoring([cost, unregularized_cost],
                             valid_stream,
                             args.mini_batch_size_valid,
                             args.dataset,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=(args.visualize is None),
                             every_n_batches=args.monitoring_freq)
    ])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        elif 'test' in args.save_path:
            # Paths containing 'test' are allowed to be overwritten.
            print("Rewriting in " + args.save_path)
        else:
            raise Exception('Directory already exists')

    # Early stopping
    extensions.append(
        EarlyStopping('valid_' + unregularized_cost.name,
                      args.patience, args.save_path,
                      every_n_batches=args.monitoring_freq))

    # Printing
    extensions.append(ProgressBar())
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    # Reset the initial states: every batch for the short "sine" task,
    # every 100 batches otherwise.
    if args.dataset == "sine":
        reset_frequency = 1
    else:
        reset_frequency = 100
    extensions.append(
        ResetStates([v for v, _ in updates],
                    every_n_batches=reset_frequency))

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())

    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    # This is where the magic happens!
    main_loop.run()
def train_rnnrbm(train, rnnrbm, epochs=1000, test=None, bokeh=True,
                 load_path=None):
    """Build the training MainLoop for an RNN-RBM and return it.

    Parameters
    ----------
    train : fuel stream
        Training data stream.
    rnnrbm : object
        Model whose `cost(examples, mask, k)` returns (cost, v_sample).
    epochs : int
        Number of epochs before FinishAfter stops training.
    test : fuel stream, optional
        If given, the same metrics are also monitored on this stream.
    bokeh : bool
        Whether to attach a live Plot extension.
    load_path : unused here — TODO confirm it is consumed by a caller.

    Returns
    -------
    MainLoop
        Fully wired loop; the caller is expected to invoke `.run()`.

    NOTE(review): `x`, `x_mask`, `rnnrbm_cdk` and `float32` are free
    variables resolved at module level — confirm they exist where this
    function is defined.
    """
    # CD-k steps and learning rate live in shared variables so the
    # SharedVariableModifier extensions below can change them mid-run.
    cdk = theano.shared(10)
    lr = theano.shared(float32(0.004))

    cost, v_sample = rnnrbm.cost(examples=x, mask=x_mask, k=cdk)
    error_rate = MismulitclassificationRate().apply(x, v_sample[-1], x_mask)
    error_rate.name = "error on note as a whole"
    mistake_rate = MismulitmistakeRate().apply(x, v_sample[-1], x_mask)
    mistake_rate.name = "single error within note"
    cost.name = 'rbm_cost'

    model = Model(cost)
    cg = ComputationGraph([cost])

    # Clip twice with RemoveNotFinite on both sides of Adam to guard
    # against NaN/Inf steps.
    step_rule = CompositeRule([
        RemoveNotFinite(),
        StepClipping(30.0),
        Adam(learning_rate=lr),
        StepClipping(6.0),
        RemoveNotFinite()
    ])  # Scale(0.01)

    # v_sample is held constant so gradients do not flow through the
    # Gibbs sampling chain.
    gradients = dict(
        equizip(cg.parameters,
                T.grad(cost, cg.parameters, consider_constant=[v_sample])))
    algorithm = GradientDescent(step_rule=step_rule, gradients=gradients,
                                cost=cost, params=cg.parameters)
    algorithm.add_updates(cg.updates)

    extensions = [
        # Schedule CD-k from the rnnrbm_cdk table (batch index -> k).
        SharedVariableModifier(parameter=cdk,
                               function=lambda n, v: rnnrbm_cdk[n]
                               if rnnrbm_cdk.get(n) else v),
        # Decay the learning rate by 0.78 every 1000 batches.
        SharedVariableModifier(parameter=lr,
                               function=lambda n, v: float32(0.78 * v)
                               if n % (200 * 5) == 0 else v),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(
            [
                cost,
                error_rate,
                mistake_rate,
            ],  # hidden_states, debug_val, param_nans,
            # aggregation.mean(algorithm.total_gradient_norm)],  #+ params,
            prefix="train",
            after_epoch=False,
            every_n_batches=40),
        Timing(),
        Printing(),
        ProgressBar()
    ]
    if test is not None:
        extensions.append(
            DataStreamMonitoring([cost, error_rate, mistake_rate],
                                 data_stream=test,
                                 updates=cg.updates,
                                 prefix="test",
                                 after_epoch=False,
                                 every_n_batches=40))
    if bokeh:
        extensions.append(
            Plot(
                'Training RNN-RBM',
                channels=[
                    [
                        'train_error on note as a whole',
                        'train_single error within note',
                        'test_error on note as a whole',
                        'test_single error within note'
                    ],
                    ['train_final_cost'],
                    # ['train_total_gradient_norm'],
                ]))

    main_loop = MainLoop(algorithm=algorithm,
                         data_stream=train,
                         model=model,
                         extensions=extensions)
    return main_loop
def train(cli_params):
    """Train a ladder network from command-line parameters.

    Sets up file logging, data, the model and a Blocks MainLoop with
    approximate-BN validation monitoring, then runs training.

    Parameters
    ----------
    cli_params : dict
        Command-line parameter dict; 'save_to' selects the run
        directory, the rest is consumed by load_and_log_params.

    Returns
    -------
    pandas.DataFrame or None
        The training log as a dataframe, or None when an epoch
        interrupt was received.
    """
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim, ) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    # 'counter' is the BN step counter; its absence means the BN nodes
    # were optimized/cut out of the graph.
    assert 'counter' in [u.name for u in list(bn_updates.keys())], \
        'No batch norm params in graph - the graph has been cut?'
    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    # Short channel names used by ShortPrinting.
    short_prints = {
        "train": {
            'T_C_class': ladder.costs.class_corr,
            'T_C_de': list(ladder.costs.denois.values()),
        },
        "valid_approx": OrderedDict([
            ('V_C_class', ladder.costs.class_clean),
            ('V_E', ladder.error.clean),
            ('V_C_de', list(ladder.costs.denois.values())),
        ]),
        "valid_final": OrderedDict([
            ('VF_C_class', ladder.costs.class_clean),
            ('VF_E', ladder.error.clean),
            ('VF_C_de', list(ladder.costs.denois.values())),
        ]),
    }

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples),
        model=Model(theano.tensor.cast(ladder.costs.total, "float32")),
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                [ladder.costs.class_clean, ladder.error.clean]
                + list(ladder.costs.denois.values()),
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            TrainingDataMonitoring([
                ladder.costs.total, ladder.costs.class_corr,
                training_algorithm.total_gradient_norm
            ] + list(ladder.costs.denois.values()),
                                   prefix="train",
                                   after_epoch=True),
            SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay,
                    p.num_epochs, after_epoch=True),
        ])
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = 'valid_final_error_rate_clean'
    logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
def train_model(cost, unregularized_cost, updates,
                train_stream, valid_stream, args, gate_values=None):
    """Build and run the training loop for the RNN model.

    Parameters
    ----------
    cost : TheanoVariable
        Regularized training cost (weight noise may be added below).
    unregularized_cost : TheanoVariable
        Cost without regularization; its name drives early stopping
        ('valid_' + name).
    updates : list of (shared_variable, expression)
        Updates that carry the recurrent hidden state between batches.
    train_stream, valid_stream : fuel streams
        Training and validation data streams.
    args : namespace
        Command-line options (presumably argparse).
    gate_values : optional
        Unused here; kept for interface parity with sibling variants.
    """
    step_rule = learning_algorithm(args)
    cg = ComputationGraph(cost)

    # ADD REGULARIZATION
    # WEIGHT NOISE: corrupt WEIGHT-role variables and rebuild the graph.
    weight_noise = args.weight_noise
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg_train = apply_noise(cg, weights, weight_noise)
        cost = cg_train.outputs[0]
        cost.name = "cost_with_weight_noise"
        cg = ComputationGraph(cost)

    logger.info(cg.parameters)

    # Define algorithm
    algorithm = GradientDescent(cost=cost, step_rule=step_rule,
                                parameters=cg.parameters)
    # Add the updates to carry the hidden state
    algorithm.add_updates(updates)

    # Extensions to be added
    extensions = []

    # Load from a dumped model
    if args.load_path is not None:
        extensions.append(Load(args.load_path))

    # Generation extension
    if args.generate:
        extensions.append(TextGenerationExtension(
            cost=cost,
            generation_length=args.generated_text_lenght,
            initial_text_length=args.initial_text_length,
            every_n_batches=1,
            ploting_path=os.path.join(args.save_path, 'prob_plot.png'),
            softmax_sampling=args.softmax_sampling,
            dataset=args.dataset,
            updates=updates,
            interactive_mode=args.interactive_mode))

    # Training and Validation score monitoring
    extensions.extend([
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=args.monitoring_freq),
        DataStreamMonitoring([cost, unregularized_cost],
                             valid_stream,
                             args.mini_batch_size_valid,
                             args.dataset,
                             state_updates=updates,
                             prefix='valid',
                             before_first_epoch=(args.visualize == "nothing"),
                             every_n_batches=args.monitoring_freq)])

    # Creating directory for saving model.
    if not args.interactive_mode:
        if not os.path.exists(args.save_path):
            os.makedirs(args.save_path)
        elif 'test' in args.save_path:
            # print() call (not Py2 statement) so this parses on
            # Python 3 too, matching the sibling train_model variant.
            print("Rewriting in " + args.save_path)
        else:
            raise Exception('Directory already exists')

    # Early stopping
    extensions.append(EarlyStopping('valid_' + unregularized_cost.name,
                                    args.patience, args.save_path,
                                    every_n_batches=args.monitoring_freq))

    # Printing
    extensions.append(ProgressBar())
    extensions.append(Printing(every_n_batches=args.monitoring_freq))

    # Reset the initial states: every batch for the short "sine" task,
    # every 100 batches otherwise.
    if args.dataset == "sine":
        reset_frequency = 1
    else:
        reset_frequency = 100
    extensions.append(ResetStates([v for v, _ in updates],
                                  every_n_batches=reset_frequency))

    # Visualizing extensions
    if args.interactive_mode:
        extensions.append(InteractiveMode())

    main_loop = MainLoop(
        model=Model(cost),
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions
    )
    main_loop.run()
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    """Train a DRAW model on binarized MNIST.

    Parameters
    ----------
    nvis : int
        Number of visible units (input dimensionality).
    nhid : int
        Number of latent units.
    encoding_lstm_dim, decoding_lstm_dim : int
        Hidden dimensions of the encoder / decoder LSTMs.
    T : int
        Number of DRAW iterations (glimpses).
    """
    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm, biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    # LSTMs get a narrower Gaussian init than the Orthogonal default.
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost: negative variational lower bound, averaged.
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 100)))
    train_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_train,
        iteration_scheme=SequentialScheme(mnist_train.num_examples, 500)))
    mnist_valid = BinarizedMNIST('valid')
    valid_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_valid,
        iteration_scheme=SequentialScheme(mnist_valid.num_examples, 500)))
    mnist_test = BinarizedMNIST('test')
    test_monitor_stream = ForceFloatX(DataStream(
        dataset=mnist_test,
        iteration_scheme=SequentialScheme(mnist_test.num_examples, 500)))

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    # Every auxiliary variable named '*term' is monitored as its mean.
    monitoring_channels = dict([
        ('avg_' + channel.tag.name, channel.mean()) for channel in
        VariableFilter(name='.*term$')(computation_graph.auxiliary_variables)])
    for name, channel in monitoring_channels.items():
        channel.name = name
    # list(...) so the concatenation also works on Python 3, where
    # dict.values() returns a view instead of a list.
    monitored_quantities = list(monitoring_channels.values()) + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(
                monitored_quantities, train_monitor_stream, prefix="train",
                updates=computation_graph.updates),
            DataStreamMonitoring(
                monitored_quantities, valid_monitor_stream, prefix="valid",
                updates=computation_graph.updates),
            DataStreamMonitoring(
                monitored_quantities, test_monitor_stream, prefix="test",
                updates=computation_graph.updates),
            ProgressBar(),
            Printing()])
    main_loop.run()
def train(ladder, batch_size=100, num_train_examples=60000,
          num_epochs=150, lrate_decay=0.67):
    """Train a ladder network on the standard (fully labeled) setup.

    Parameters
    ----------
    ladder : object
        Model holder exposing `costs` (total, class_corr, class_clean,
        denois dict), `error` and the learning rate shared variable `lr`.
    batch_size : int
        Minibatch size for the mixed train/test streams.
    num_train_examples : int
        Unused in the body — TODO confirm whether it can be dropped.
    num_epochs : int
        Training length; also drives the LR decay schedule.
    lrate_decay : float
        Fraction of num_epochs after which LR decay begins.
    """
    # Setting Logger: each run logs to a timestamped results directory.
    timestr = time.strftime("%Y_%m_%d_at_%H_%M")
    save_path = 'results/mnist_100_standard_' + timestr
    log_path = os.path.join(save_path, 'log.txt')
    os.makedirs(save_path)
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    # Training
    model = Model(ladder.costs.total)
    all_params = model.parameters
    # Parenthesized print works on both Python 2 and Python 3.
    print(len(all_params))
    print(all_params)

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, parameters=all_params,
        step_rule=Adam(learning_rate=ladder.lr))

    # Fetch all batch normalization updates. They are in the clean path.
    # In addition to actual training, also do BN variable approximations
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    training_algorithm.add_updates(bn_updates)

    # list(...) so the concatenation also works on Python 3, where
    # dict.values() returns a view (matches the other train() variant).
    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean,
        ladder.error, training_algorithm.total_gradient_norm,
        ladder.costs.total] + list(ladder.costs.denois.values())

    train_data_stream, test_data_stream = get_mixed_streams(
        batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables, prefix="train", after_epoch=True)
    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables, data_stream=test_data_stream,
        prefix="test", after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=model,
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            # Early-stop checkpoint keyed on the test corrupted CE.
            SaveParams('test_CE_corr', model, save_path),
            SaveLog(save_path, after_epoch=True),
            LRDecay(lr=ladder.lr,
                    decay_first=num_epochs * lrate_decay,
                    decay_last=num_epochs,
                    after_epoch=True),
            Printing()])
    main_loop.run()
graphs, extensions, updates = construct_graphs(args, nclasses, sequence_length) #graph, extension, update = construct_graphs(args, nclasses, sequence_length) ### optimization algorithm definition step_rule = CompositeRule([ StepClipping(1.), #Momentum(learning_rate=args.learning_rate, momentum=0.9), RMSProp(learning_rate=args.learning_rate, decay_rate=0.5), ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor (after epoch to limit the log size) step_channels = [] step_channels.extend([ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name) for name, param in model.get_parameter_dict().items() ]) step_channels.append( algorithm.total_step_norm.copy(name="total_step_norm")) step_channels.append( algorithm.total_gradient_norm.copy(name="total_gradient_norm")) step_channels.extend(graphs["training"].outputs) logger.warning("constructing training data monitor")
def train(cli_params):
    """Train a ladder network (whitened/contrast-normalized variant).

    Sets up logging, data, the model and a MainLoop with approximate
    and final BN-based test monitoring plus sentinel/bookkeeping
    extensions for an external experiment automator.

    Parameters
    ----------
    cli_params : dict
        Command-line parameter dict; 'save_to' selects the run
        directory, the rest is consumed by load_and_log_params.

    Returns
    -------
    pandas.DataFrame or None
        The training log as a dataframe, or None when an epoch
        interrupt was received.
    """
    cli_params['save_dir'] = prepare_dir(cli_params['save_to'])
    logfile = os.path.join(cli_params['save_dir'], 'log.txt')

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    logger.info('Logging into %s' % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=True)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info('Found the following parameters: %s' % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    # 'counter' is the BN step counter; its absence means the BN nodes
    # were optimized/cut out of the graph.
    assert 'counter' in [u.name for u in bn_updates.keys()], \
        'No batch norm params in graph - the graph has been cut?'
    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    model = Model(ladder.costs.total)

    monitored_variables = [
        ladder.costs.class_corr, ladder.costs.class_clean,
        ladder.error,
        # training_algorithm.total_gradient_norm,
        ladder.costs.total] \
        # + ladder.costs.denois.values()

    # Make a global history recorder so that we can get summary at end of
    # training when we write to Sentinel
    # global_history records all relevant monitoring vars
    # updated by SaveLog every time
    global_history = {}

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(data.train, data.train_ind,
                        p.batch_size,
                        n_labeled=p.labeled_samples,
                        n_unlabeled=p.unlabeled_samples,
                        whiten=whiten, cnorm=cnorm),
        model=model,
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),

            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                monitored_variables,
                make_datastream(data.valid, data.valid_ind,
                                p.valid_batch_size, whiten=whiten,
                                cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="valid_approx"),

            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                monitored_variables,
                make_datastream(data.train, data.train_ind,
                                p.batch_size,
                                n_labeled=p.labeled_samples,
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                # DEPREC: we directly test now
                # make_datastream(data.valid, data.valid_ind,
                #                 p.valid_batch_size,
                #                 n_labeled=len(data.valid_ind),
                #                 whiten=whiten, cnorm=cnorm,
                #                 scheme=ShuffledScheme),
                # prefix="valid_final",
                make_datastream(data.test, data.test_ind,
                                p.batch_size,
                                n_labeled=len(data.test_ind),
                                whiten=whiten, cnorm=cnorm,
                                scheme=ShuffledScheme),
                prefix="final_test",
                after_n_epochs=p.num_epochs),

            TrainingDataMonitoring(
                variables=monitored_variables,
                prefix="train", after_epoch=True),

            # write out to sentinel file for experiment automator to work
            # REMOVE THIS if you're running test mode with early stopping
            # immediately after
            SentinelWhenFinish(save_dir=p.save_dir,
                               global_history=global_history),

            # originally use 'valid_approx_cost_class_clean'
            # turns out should use ER as early stopping
            # use CE as a fallback (secondary early stopvar) if ER is the same
            # SaveParams(('valid_approx_error_rate',
            #             'valid_approx_cost_class_clean'),
            #            model, p.save_dir),
            # doesn't do early stopping now
            SaveParams(None, model, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(save_dir=p.save_dir, after_epoch=True,
                    global_history=global_history),
            Printing(),
            # ShortPrinting(short_prints),
            LRDecay(ladder.lr, p.num_epochs * p.lrate_decay,
                    p.num_epochs, after_epoch=True),
        ])
    main_loop.run()

    # ================= Add testing at end of training =================
    # DEPREC don't do early stopping anymore
    if False:
        p.load_from = p.save_dir
        ladder = setup_model(p)
        logger.info('Start testing on trained_params_best')
        main_loop = DummyLoop(
            extensions=[
                # write to global history
                SaveLog(save_dir=p.save_dir, after_training=True,
                        global_history=global_history),
                # write out to sentinel file for experiment automator to work
                SentinelWhenFinish(save_dir=p.save_dir,
                                   global_history=global_history),
                FinalTestMonitoring(
                    [ladder.costs.class_clean, ladder.error]
                    + ladder.costs.denois.values(),
                    make_datastream(data.train, data.train_ind,
                                    # These need to match with the training
                                    p.batch_size,
                                    n_labeled=p.labeled_samples,
                                    n_unlabeled=len(data.train_ind),
                                    cnorm=cnorm,
                                    whiten=whiten, scheme=ShuffledScheme),
                    make_datastream(data.test, data.test_ind,
                                    p.batch_size,
                                    n_labeled=len(data.test_ind),
                                    n_unlabeled=len(data.test_ind),
                                    cnorm=cnorm,
                                    whiten=whiten, scheme=ShuffledScheme),
                    prefix="test", before_training=True)
            ])
        main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    # col = 'valid_final_error_rate'
    # logger.info('%s %g' % (col, df[col].iloc[-1]))

    if main_loop.log.status['epoch_interrupt_received']:
        return None
    return df
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim):
    """Train a DRAW model (optionally with attention) on binarized MNIST.

    Parameters
    ----------
    name : str or None
        Experiment name; auto-generated from the settings when None.
    epochs : int
        Number of training epochs.
    batch_size : int
        Minibatch size (also baked into the scan initial states).
    learning_rate : float
        Adam learning rate.
    attention : bool
        Use attention reader/writer instead of the plain ones.
    n_iter : int
        Number of DRAW canvas iterations.
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensions.
    """
    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim,
                                           z_dim)

    print("\nRunning experiment %s" % name)
    print("         learning rate: %5.3f" % learning_rate)
    print("             attention: %s" % attention)
    print("          n_iterations: %d" % n_iter)
    print("     encoder dimension: %d" % enc_dim)
    print("           z dimension: %d" % z_dim)
    print("     decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------

    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Standard-normal prior over the latents.
    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        # NOTE(review): writer uses N=read_N, not write_N — confirm
        # whether write_N is intentionally unused.
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
    else:
        read_dim = 2 * x_dim

        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # MLPs produce the 4*dim LSTM gate pre-activations.
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [
            reader, writer, encoder, decoder, encoder_mlp, decoder_mlp,
            q_sampler
    ]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # This is one iteration: read -> encode -> sample z -> decode -> write.
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec,
                      c_dec, x):
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc, cells=c_enc,
                                     inputs=i_enc, iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec, cells=c_dec,
                                     inputs=i_dec, iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    outputs_info = [
        T.zeros([batch_size, x_dim]),     # c
        T.zeros([batch_size, enc_dim]),   # h_enc
        T.zeros([batch_size, enc_dim]),   # c_enc
        T.zeros([batch_size, z_dim]),     # z_mean
        T.zeros([batch_size, z_dim]),     # z_log_sigma
        T.zeros([batch_size, z_dim]),     # z
        T.zeros([batch_size, dec_dim]),   # h_dec
        T.zeros([batch_size, dec_dim]),   # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)
    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    # KL(q(z|x) || p(z)) per iteration, summed over latent dimensions.
    kl_terms = (prior_log_sigma - z_log_sigma + 0.5 *
                (tensor.exp(2 * z_log_sigma) + (z_mean - prior_mu)**2) /
                tensor.exp(2 * prior_log_sigma) - 0.5).sum(axis=-1)

    # Final canvas defines the reconstruction distribution.
    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    # scan's internal updates must run with each gradient step.
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors: per-iteration KL and reconstruction terms.
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]

    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [["train_nll_bound", "test_nll_bound"],
                     ["train_kl_term_%d" % t for t in range(n_iter)],
                     ["train_recons_term_%d" % t for t in range(n_iter)],
                     ["train_total_gradient_norm",
                      "train_total_step_norm"]]

    #------------------------------------------------------------

    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors,
                                   prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
def train_snli_model(new_training_job, config, save_path, params, fast_start,
                     fuel_server, seed, model='simple'):
    """Train an SNLI/MultiNLI natural-language-inference model with Blocks.

    Builds train/valid Theano computation graphs, sets up the optimizer,
    monitoring, checkpointing and (optionally) a Fuel data server, then runs
    the Blocks main loop until `config['n_batches']` batches are done.

    Parameters
    ----------
    new_training_job : bool
        If True, training state is NOT restored from `save_path` (the
        Load/LoadNoUnpickling extension is conditioned on this flag).
    config : dict
        Experiment configuration (referred to as `c` below).
    save_path : str
        Directory for logs, config dump, checkpoints and summaries.
    params : str or None
        Optional path to a parameter file to initialize the model from.
    fast_start : bool
        If True, skips most before-training evaluations (used for debugging).
    fuel_server : bool
        If True, read training data from a ServerDataStream instead of the
        in-process stream.
    seed : int
        Seed forwarded to the data streams.
    model : str
        Either 'simple' or 'esim'; selects the model/data initializer.
    """
    if config['exclude_top_k'] > config['num_input_words'] and config[
            'num_input_words'] > 0:
        raise Exception("Some words have neither word nor def embedding")
    c = config
    logger = configure_logger(name="snli_baseline_training",
                              log_file=os.path.join(save_path, "log.txt"))
    if not os.path.exists(save_path):
        logger.info("Start a new job")
        os.mkdir(save_path)
    else:
        logger.info("Continue an existing job")
    with open(os.path.join(save_path, "cmd.txt"), "w") as f:
        f.write(" ".join(sys.argv))

    # Make data paths nice: resolve relative paths against Fuel's data root.
    for path in [
            'dict_path', 'embedding_def_path', 'embedding_path', 'vocab',
            'vocab_def', 'vocab_text'
    ]:
        if c.get(path, ''):
            if not os.path.isabs(c[path]):
                c[path] = os.path.join(fuel.config.data_path[0], c[path])

    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')

    # Save config to save_path.
    # NOTE(review): file handle from open() is never closed explicitly here.
    json.dump(config, open(os.path.join(save_path, "config.json"), "w"))

    if model == 'simple':
        nli_model, data, used_dict, used_retrieval, _ = \
            _initialize_simple_model_and_data(c)
    elif model == 'esim':
        nli_model, data, used_dict, used_retrieval, _ = \
            _initialize_esim_model_and_data(c)
    else:
        raise NotImplementedError()

    # Compute cost: symbolic inputs for the two sentences, their masks, the
    # optional dictionary-definition inputs, and the label vector.
    s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2')
    if c['dict_path']:
        assert os.path.exists(c['dict_path'])
        s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix(
            'sentence2_def_map')
        def_mask = T.fmatrix("def_mask")
        defs = T.lmatrix("defs")
    else:
        s1_def_map, s2_def_map = None, None
        def_mask = None
        defs = None
    s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask')
    y = T.ivector('label')

    # Build one graph with batch-norm in training mode (True) and one in
    # inference mode (False); both share the same symbolic inputs.
    cg = {}
    for train_phase in [True, False]:
        # NOTE: Please don't change outputs of cg
        if train_phase:
            with batch_normalization(nli_model):
                pred = nli_model.apply(s1, s1_mask, s2, s2_mask,
                                       def_mask=def_mask, defs=defs,
                                       s1_def_map=s1_def_map,
                                       s2_def_map=s2_def_map,
                                       train_phase=train_phase)
        else:
            pred = nli_model.apply(s1, s1_mask, s2, s2_mask,
                                   def_mask=def_mask, defs=defs,
                                   s1_def_map=s1_def_map,
                                   s2_def_map=s2_def_map,
                                   train_phase=train_phase)
        cost = CategoricalCrossEntropy().apply(y.flatten(), pred)
        error_rate = MisclassificationRate().apply(y.flatten(), pred)
        cg[train_phase] = ComputationGraph([cost, error_rate])

    # Weight decay (TODO: Make it less bug prone)
    # Only the MLP weight matrices of the 'simple' model are decayed.
    if model == 'simple':
        weights_to_decay = VariableFilter(
            bricks=[dense for dense, relu, bn in nli_model._mlp],
            roles=[WEIGHT])(cg[True].variables)
        weight_decay = np.float32(c['l2']) * sum(
            (w**2).sum() for w in weights_to_decay)
    elif model == 'esim':
        weight_decay = 0.0
    else:
        raise NotImplementedError()

    final_cost = cg[True].outputs[0] + weight_decay
    final_cost.name = 'final_cost'

    # Add updates for population parameters (batch-norm running statistics,
    # exponential moving average with decay 0.1).
    if c.get("bn", True):
        pop_updates = get_batch_normalization_updates(cg[True])
        extra_updates = [(p, m * 0.1 + p * (1 - 0.1))
                         for p, m in pop_updates]
    else:
        pop_updates = []
        extra_updates = []

    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            loaded_params = load_parameters(src)
            cg[True].set_parameter_values(loaded_params)
            for param, m in pop_updates:
                param.set_value(loaded_params[get_brick(
                    param).get_hierarchical_name(param)])

    if os.path.exists(os.path.join(save_path, "main_loop.tar")):
        logger.warning("Manually loading BN stats :(")
        with open(os.path.join(save_path, "main_loop.tar")) as src:
            loaded_params = load_parameters(src)
            for param, m in pop_updates:
                param.set_value(
                    loaded_params[get_brick(param).get_hierarchical_name(param)])

    # Attach test values so Theano can shape-check the graph eagerly.
    if theano.config.compute_test_value != 'off':
        test_value_data = next(
            data.get_stream('train', batch_size=4).get_epoch_iterator())
        s1.tag.test_value = test_value_data[0]
        s1_mask.tag.test_value = test_value_data[1]
        s2.tag.test_value = test_value_data[2]
        s2_mask.tag.test_value = test_value_data[3]
        y.tag.test_value = test_value_data[4]

    # Freeze embeddings: collect lookup-table parameters that must not be
    # trained, then filter them out of the trainable set.
    if not c['train_emb']:
        frozen_params = [
            p for E in nli_model.get_embeddings_lookups() for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params) & set(train_params)) > 0
    else:
        frozen_params = []
    if not c.get('train_def_emb', 1):
        frozen_params_def = [
            p for E in nli_model.get_def_embeddings_lookups()
            for p in E.parameters
        ]
        train_params = [p for p in cg[True].parameters]
        assert len(set(frozen_params_def) & set(train_params)) > 0
        frozen_params += frozen_params_def
    train_params = [p for p in cg[True].parameters if p not in frozen_params]
    train_params_keys = [
        get_brick(p).get_hierarchical_name(p) for p in train_params
    ]

    # Optimizer
    algorithm = GradientDescent(cost=final_cost,
                                on_unused_sources='ignore',
                                parameters=train_params,
                                step_rule=Adam(learning_rate=c['lr']))
    algorithm.add_updates(extra_updates)
    m = Model(final_cost)
    parameters = m.get_parameter_dict()  # Blocks version mismatch
    logger.info("Trainable parameters" + "\n" +
                pprint.pformat([(key, parameters[key].get_value().shape)
                                for key in sorted(train_params_keys)],
                               width=120))
    logger.info("# of parameters {}".format(
        sum([
            np.prod(parameters[key].get_value().shape)
            for key in sorted(train_params_keys)
        ])))

    ### Monitored args ###
    train_monitored_vars = [final_cost] + cg[True].outputs
    monitored_vars = cg[False].outputs
    # NOTE(review): assumes outputs are ordered [cost, error_rate]; val_acc is
    # actually the validation error rate (tracked with choose_best=min below).
    val_acc = monitored_vars[1]
    to_monitor_names = [
        'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2',
        's1_gate_rootmean2', 's1_compose_gate_rootmean2'
    ]
    for k in to_monitor_names:
        train_v, valid_v = VariableFilter(name=k)(
            cg[True]), VariableFilter(name=k)(cg[False])
        if len(train_v):
            logger.info("Adding {} tracking".format(k))
            train_monitored_vars.append(train_v[0])
            monitored_vars.append(valid_v[0])
        else:
            logger.warning("Didnt find {} in cg".format(k))

    # Optional per-parameter norm/gradient/step statistics.
    if c['monitor_parameters']:
        for name in train_params_keys:
            param = parameters[name]
            num_elements = numpy.product(param.get_value().shape)
            norm = param.norm(2) / num_elements
            grad_norm = algorithm.gradients[param].norm(2) / num_elements
            step_norm = algorithm.steps[param].norm(2) / num_elements
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            train_monitored_vars.append(stats)

    regular_training_stream = data.get_stream('train',
                                              batch_size=c['batch_size'],
                                              seed=seed)
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=regular_training_stream.sources,
            hwm=100,
            produces_examples=regular_training_stream.produces_examples)
    else:
        training_stream = regular_training_stream

    ### Build extensions ###
    extensions = [
        # Load(main_loop_path, load_iteration_state=True, load_log=True)
        # .set_conditions(before_training=not new_training_job),
        StartFuelServer(regular_training_stream,
                        stream_path,
                        hwm=100,
                        script_path=os.path.join(
                            os.path.dirname(__file__),
                            "../bin/start_fuel_server.py"),
                        before_training=fuel_server),
        Timing(every_n_batches=c['mon_freq']),
        ProgressBar(),
        RetrievalPrintStats(retrieval=used_retrieval,
                            every_n_batches=c['mon_freq_valid'],
                            before_training=not fast_start),
        Timestamp(),
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq']),
    ]
    # Validation monitoring depends on the dataset layout (SNLI vs MultiNLI).
    if c['layout'] == 'snli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid',
                                                          batch_size=14,
                                                          seed=seed),
                                          before_training=not fast_start,
                                          on_resumption=True,
                                          after_training=True,
                                          every_n_batches=c['mon_freq_valid'],
                                          prefix='valid')
        extensions.append(validation)
    elif c['layout'] == 'mnli':
        validation = DataStreamMonitoring(monitored_vars,
                                          data.get_stream('valid_matched',
                                                          batch_size=14,
                                                          seed=seed),
                                          every_n_batches=c['mon_freq_valid'],
                                          on_resumption=True,
                                          after_training=True,
                                          prefix='valid_matched')
        validation_mismatched = DataStreamMonitoring(
            monitored_vars,
            data.get_stream('valid_mismatched', batch_size=14, seed=seed),
            every_n_batches=c['mon_freq_valid'],
            before_training=not fast_start,
            on_resumption=True,
            after_training=True,
            prefix='valid_mismatched')
        extensions.extend([validation, validation_mismatched])
    else:
        raise NotImplementedError()

    # Similarity trackers for embeddings
    if len(c.get('vocab_def', '')):
        retrieval_vocab = Vocabulary(c['vocab_def'])
    else:
        retrieval_vocab = data.vocab
    retrieval_all = Retrieval(vocab_text=retrieval_vocab,
                              dictionary=used_dict,
                              max_def_length=c['max_def_length'],
                              exclude_top_k=0,
                              max_def_per_word=c['max_def_per_word'])
    for name in [
            's1_word_embeddings', 's1_dict_word_embeddings',
            's1_translated_word_embeddings'
    ]:
        variables = VariableFilter(name=name)(cg[False])
        if len(variables):
            s1_emb = variables[0]
            logger.info("Adding similarity tracking for " + name)
            # A bit sloppy about downcast
            if "dict" in name:
                embedder = construct_dict_embedder(theano.function(
                    [s1, defs, def_mask, s1_def_map],
                    s1_emb,
                    allow_input_downcast=True),
                                                   vocab=data.vocab,
                                                   retrieval=retrieval_all)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))
            else:
                embedder = construct_embedder(theano.function(
                    [s1], s1_emb, allow_input_downcast=True),
                                              vocab=data.vocab)
                extensions.append(
                    SimilarityWordEmbeddingEval(
                        embedder=embedder,
                        prefix=name,
                        every_n_batches=c['mon_freq_valid'],
                        before_training=not fast_start))

    track_the_best = TrackTheBest(validation.record_name(val_acc),
                                  before_training=not fast_start,
                                  every_n_epochs=c['save_freq_epochs'],
                                  after_training=not fast_start,
                                  every_n_batches=c['mon_freq_valid'],
                                  choose_best=min)
    extensions.append(track_the_best)

    # Special care for serializing embeddings: when pretrained embeddings are
    # used, checkpoint only the trainable + BN-population parameters instead of
    # the whole main loop.
    if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')):
        extensions.insert(
            0,
            LoadNoUnpickling(main_loop_path,
                             load_iteration_state=True,
                             load_log=True).set_conditions(
                                 before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=train_params + [p for p, m in pop_updates],
                       save_main_loop=False,
                       save_separately=['log', 'iteration_state'],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))
    else:
        extensions.insert(
            0,
            Load(main_loop_path,
                 load_iteration_state=True,
                 load_log=True).set_conditions(
                     before_training=not new_training_job))
        extensions.append(
            Checkpoint(main_loop_path,
                       parameters=cg[True].parameters +
                       [p for p, m in pop_updates],
                       before_training=not fast_start,
                       every_n_epochs=c['save_freq_epochs'],
                       after_training=not fast_start).add_condition(
                           ['after_batch', 'after_epoch'],
                           OnLogRecord(track_the_best.notification_name),
                           (main_loop_best_val_path, )))

    extensions.extend([
        DumpCSVSummaries(save_path,
                         every_n_batches=c['mon_freq_valid'],
                         after_training=True),
        DumpTensorflowSummaries(save_path,
                                after_epoch=True,
                                every_n_batches=c['mon_freq_valid'],
                                after_training=True),
        Printing(every_n_batches=c['mon_freq_valid']),
        PrintMessage(msg="save_path={}".format(save_path),
                     every_n_batches=c['mon_freq']),
        FinishAfter(after_n_batches=c['n_batches']).add_condition(
            ['after_batch'],
            OnLogStatusExceed('iterations_done', c['n_batches']))
    ])
    logger.info(extensions)

    ### Run training ###
    # Optionally spawn a visdom plotting subprocess, killed at interpreter exit.
    if "VISDOM_SERVER" in os.environ:
        print("Running visdom server")
        ret = subprocess.Popen([
            os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"),
            "--visdom-server={}".format(os.environ['VISDOM_SERVER']),
            "--folder={}".format(save_path)
        ])
        time.sleep(0.1)
        if ret.returncode is not None:
            raise Exception()
        atexit.register(lambda: os.kill(ret.pid, signal.SIGINT))

    model = Model(cost)
    # Register BN population parameters on the model so they get serialized.
    for p, m in pop_updates:
        model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=model,
                         extensions=extensions)
    assert os.path.exists(save_path)
    main_loop.run()
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0,
         subset=None, num_batches=None, batch_size=None, histogram=None,
         resume=False):
    """Train a noisy residual network on CIFAR-10 with Blocks.

    Builds separate test (inference-mode) and train (batch-norm + noise)
    computation graphs, adds L2 and NIT (noise information) regularization,
    and runs SGD with momentum plus a reduced-rate rule for noise masks.

    Parameters
    ----------
    save_to : str
        Checkpoint filename template; '.%d' is stripped for the
        experiment name used by plots and `Load`.
    num_epochs : int
        Maximum number of training epochs.
    weight_decay : float
        L2 coefficient applied to all non-mask weights.
    noise_pressure : float
        Coefficient of the NIT regularization term.
    subset : optional
        Unused here; kept for interface compatibility with callers.
    num_batches : int, optional
        If given, also stop after this many batches.
    batch_size : int, optional
        Training batch size passed to the shuffled scheme.
    histogram : str, optional
        If given, enables AttributionExtension and saves attributions
        to this filename after training.
    resume : bool
        If True, resume from the checkpoint named `exp_name`.
    """
    output_size = 10          # CIFAR-10 classes
    prior_noise_level = -10   # log-sigma prior for the noise layers

    noise_step_rule = Scale(1e-6)
    noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX))
    convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True,
                             noise_rate=noise_rate,
                             prior_noise_level=prior_noise_level)

    x = tensor.tensor4('features')
    y = tensor.lmatrix('targets')

    # Inference-mode graph (population BN statistics, no training noise).
    test_probs = convnet.apply(x)
    test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs)
                 .copy(name='cost'))
    test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs)
                       .copy(name='error_rate'))
    test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs)
                      .copy(name='confusion'))
    test_confusion.tag.aggregation_scheme = Sum(test_confusion)

    test_cg = ComputationGraph([test_cost, test_error_rate])

    # Training-mode graph: batch-norm statistics from the minibatch and
    # stochastic noise enabled.  (Dropout variants tried during development
    # were removed; see version control history.)
    with batch_normalization(convnet):
        with training_noise(convnet):
            train_probs = convnet.apply(x)
    train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs)
                  .copy(name='cost'))
    train_components = (ComponentwiseCrossEntropy().apply(
        y.flatten(), train_probs).copy(name='components'))
    train_error_rate = (MisclassificationRate().apply(
        y.flatten(), train_probs).copy(name='error_rate'))
    train_cg = ComputationGraph([train_cost, train_error_rate,
                                 train_components])

    # Exponential moving average for the BN population statistics.
    population_updates = get_batch_normalization_updates(train_cg)
    bn_alpha = 0.9
    extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha))
                     for p, m in population_updates]

    # Shared variable so the NIT pressure can be annealed during training.
    nit_penalty = theano.shared(numpy.asarray(noise_pressure,
                                              dtype=theano.config.floatX))
    nit_penalty.name = 'nit_penalty'

    # Compute noise rates for training graph.
    train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables)
    train_mean_log_sigma = tensor.concatenate(
        [n.flatten() for n in train_logsigma]).mean()
    train_mean_log_sigma.name = 'mean_log_sigma'
    train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables)
    train_nit_rate = tensor.concatenate(
        [n.flatten() for n in train_nits]).mean()
    train_nit_rate.name = 'nit_rate'
    train_nit_regularization = nit_penalty * train_nit_rate
    train_nit_regularization.name = 'nit_regularization'

    # Apply regularization to the cost.  Mask parameters get the (tiny)
    # noise step rule instead of the momentum rule, and are excluded from L2.
    trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])(
        train_cg.parameters)
    mask_parameters = [p for p in trainable_parameters
                       if get_brick(p).name == 'mask']
    noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters)
    biases = VariableFilter(roles=[BIAS])(train_cg.parameters)
    weights = VariableFilter(roles=[WEIGHT])(train_cg.variables)
    nonmask_weights = [p for p in weights if get_brick(p).name != 'mask']
    l2_norm = sum([(W ** 2).sum() for W in nonmask_weights])
    l2_norm.name = 'l2_norm'
    l2_regularization = weight_decay * l2_norm
    l2_regularization.name = 'l2_regularization'

    # Test version of the regularized cost.
    test_cost = test_cost + l2_regularization
    test_cost.name = 'cost_with_regularization'

    # Training version of cost.
    train_cost_without_regularization = train_cost
    train_cost_without_regularization.name = 'cost_without_regularization'
    train_cost = train_cost + l2_regularization + train_nit_regularization
    train_cost.name = 'cost_with_regularization'

    cifar10_train = CIFAR10(("train",))
    cifar10_train_stream = RandomPadCropFlip(
        NormalizeBatchLevels(DataStream.default_stream(
            cifar10_train, iteration_scheme=ShuffledScheme(
                cifar10_train.num_examples, batch_size)),
            which_sources=('features',)),
        (32, 32), pad=4, which_sources=('features',))

    test_batch_size = 128
    cifar10_test = CIFAR10(("test",))
    cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream(
        cifar10_test, iteration_scheme=ShuffledScheme(
            cifar10_test.num_examples, test_batch_size)),
        which_sources=('features',))

    momentum = Momentum(0.01, 0.9)

    # Reduce the learning rate of the noise-mask parameters only.
    scale_mask = Restrict(noise_step_rule, mask_parameters)
    step_rule = CompositeRule([scale_mask, momentum])

    # Train with simple SGD.
    algorithm = GradientDescent(
        cost=train_cost,
        parameters=trainable_parameters,
        step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    exp_name = save_to.replace('.%d', '')

    # `Timing` reports time for reading data, aggregating a batch and
    # monitoring; `ProgressBar` shows a progress bar during training.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs,
                    after_n_batches=num_batches),
        EpochSchedule(momentum.learning_rate, [
            (0, 0.01),    # Warm up with 0.01 learning rate
            (50, 0.1),    # Then go back to 0.1
            (100, 0.01),
            (150, 0.001)
        ]),
        EpochSchedule(noise_step_rule.learning_rate, [
            (0, 1e-2), (2, 1e-1), (4, 1)
        ]),
        EpochSchedule(noise_rate, [
            (0, 1e-2), (2, 1e-1), (4, 1)
        ]),
        NoiseExtension(noise_parameters=noise_parameters),
        NoisyDataStreamMonitoring(
            [test_cost, test_error_rate, test_confusion],
            cifar10_test_stream,
            noise_parameters=noise_parameters,
            prefix="test"),
        TrainingDataMonitoring(
            [train_cost, train_error_rate, train_nit_rate,
             train_cost_without_regularization,
             l2_regularization, train_nit_regularization,
             momentum.learning_rate, train_mean_log_sigma,
             aggregation.mean(algorithm.total_gradient_norm)],
            prefix="train",
            every_n_batches=17),
        Plot('Training performance for ' + exp_name,
             channels=[
                 ['train_cost_with_regularization',
                  'train_cost_without_regularization',
                  'train_nit_regularization',
                  'train_l2_regularization'],
                 ['train_error_rate'],
                 ['train_total_gradient_norm'],
                 ['train_mean_log_sigma'],
             ],
             every_n_batches=17),
        Plot('Test performance for ' + exp_name,
             channels=[['train_error_rate', 'test_error_rate']],
             after_epoch=True),
        EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True),
        ProgressBar(),
        Printing()]

    if histogram:
        # BUGFIX: was `cg.parameters` — `cg` is undefined in this function
        # (only test_cg/train_cg exist), which raised NameError whenever
        # `histogram` was set.  The attribution must use the training graph.
        attribution = AttributionExtension(
            components=train_components,
            parameters=train_cg.parameters,
            components_size=output_size,
            after_batch=True)
        extensions.insert(0, attribution)

    if resume:
        extensions.append(Load(exp_name, True, True))

    model = Model(train_cost)
    main_loop = MainLoop(
        algorithm,
        cifar10_train_stream,
        model=model,
        extensions=extensions)
    main_loop.run()

    if histogram:
        save_attributions(attribution, filename=histogram)

    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def run(model_name, port_train, port_valid):
    """Train a model and stream live plots to a plotting server.

    NOTE(review): the middle of this function was redacted at some point
    (credentials and a large code span replaced by '*****'): the non-laptop
    `host_plot` branch, the construction of `train_stream`, `algorithm`,
    `model`, the beginning of the `extensions` list and of the `Plot(...)`
    call are all missing, so the code below is not syntactically complete.
    Recover the original from version control before editing.
    """
    running_on_laptop = socket.gethostname() == 'yop'
    X = tensor.tensor4('image_features', dtype='float32')
    T = tensor.matrix('targets', dtype='float32')
    image_border_size = (100, 100)
    if running_on_laptop:
        # Redacted span begins here; '*****' stood for user:password in the
        # plotting-server URL.  The trailing tokens belong to a Plot(...)
        # entry inside an `extensions = [...]` list whose opening is lost.
        host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('/tmp/train_bn2')
    ]
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)
    main_loop.run()
mean_sigma, min_sigma, min_mu, max_mu, mean_mu, max_sigma, mean_coeff, min_coeff, max_coeff ] ################# # Algorithm ################# n_batches = 200 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule( [StepClipping(10.0), Adam(lr)])) algorithm.add_updates(extra_updates) train_monitor = TrainingDataMonitoring(variables=monitoring_variables + data_monitoring, every_n_batches=n_batches, prefix="train") valid_monitor = DataStreamMonitoring(monitoring_variables, valid_stream, every_n_batches=n_batches, prefix="valid") extensions = [ ProgressBar(), Timing(every_n_batches=n_batches), train_monitor, valid_monitor, TrackTheBest('valid_nll', every_n_batches=n_batches),
def run(discriminative_regularization=True):
    """Train a VAE on CelebA, optionally with discriminative regularization.

    Builds the training (batch-norm) and inference computation graphs,
    optimizes the negative-log-likelihood upper bound with Adam, monitors
    train/valid every 5 epochs, and checkpoints to a file whose name encodes
    whether discriminative regularization was used.

    Parameters
    ----------
    discriminative_regularization : bool
        Forwarded to `create_training_computation_graphs`; also selects the
        checkpoint filename ('celeba_vae_regularization.zip' vs
        'celeba_vae_noregularization.zip').
    """
    streams = create_celeba_streams(training_batch_size=100,
                                    monitoring_batch_size=500,
                                    include_targets=False)
    main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3]

    # Compute parameter updates for the batch normalization population
    # statistics. They are updated following an exponential moving average.
    rval = create_training_computation_graphs(discriminative_regularization)
    cg, bn_cg, variance_parameters = rval
    pop_updates = list(
        set(get_batch_normalization_updates(bn_cg, allow_duplicates=True)))
    decay_rate = 0.05
    extra_updates = [(p, m * decay_rate + p * (1 - decay_rate))
                     for p, m in pop_updates]

    model = Model(bn_cg.outputs[0])
    # Train only the encoder/decoder bricks plus the variance parameters.
    selector = Selector(
        find_bricks(
            model.top_bricks,
            lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp',
                                         'decoder_convnet', 'decoder_mlp')))
    parameters = list(selector.get_parameters().values()) + variance_parameters

    # Prepare algorithm
    step_rule = Adam()
    algorithm = GradientDescent(cost=bn_cg.outputs[0],
                                parameters=parameters,
                                step_rule=step_rule)
    algorithm.add_updates(extra_updates)

    # Prepare monitoring: same quantities from the training (bn_cg) and
    # inference (cg) graphs; outputs are [cost, kl_term, reconstruction_term].
    monitored_quantities_list = []
    for graph in [bn_cg, cg]:
        cost, kl_term, reconstruction_term = graph.outputs
        cost.name = 'nll_upper_bound'
        avg_kl_term = kl_term.mean(axis=0)
        avg_kl_term.name = 'avg_kl_term'
        avg_reconstruction_term = -reconstruction_term.mean(axis=0)
        avg_reconstruction_term.name = 'avg_reconstruction_term'
        monitored_quantities_list.append(
            [cost, avg_kl_term, avg_reconstruction_term])
    train_monitoring = DataStreamMonitoring(monitored_quantities_list[0],
                                            train_monitor_stream,
                                            prefix="train",
                                            updates=extra_updates,
                                            after_epoch=False,
                                            before_first_epoch=False,
                                            every_n_epochs=5)
    valid_monitoring = DataStreamMonitoring(monitored_quantities_list[1],
                                            valid_monitor_stream,
                                            prefix="valid",
                                            after_epoch=False,
                                            before_first_epoch=False,
                                            every_n_epochs=5)

    # Prepare checkpoint
    save_path = 'celeba_vae_{}regularization.zip'.format(
        '' if discriminative_regularization else 'no_')
    checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True)

    extensions = [Timing(), FinishAfter(after_n_epochs=75), train_monitoring,
                  valid_monitoring, checkpoint, Printing(), ProgressBar()]
    main_loop = MainLoop(data_stream=main_loop_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
min_binary = binary.min().copy(name="binary_min") mean_binary = binary.mean().copy(name="binary_mean") max_binary = binary.max().copy(name="binary_max") data_monitoring += [mean_sigma, min_sigma, min_mu, max_mu, mean_mu, max_sigma, mean_binary, min_binary, max_binary] ################# # Algorithm ################# n_batches = 200 algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Adam(lr)]) ) algorithm.add_updates(extra_updates) train_monitor = TrainingDataMonitoring( variables=monitoring_variables + data_monitoring, every_n_batches=n_batches, prefix="train" ) valid_monitor = DataStreamMonitoring( monitoring_variables + data_monitoring, valid_stream, every_n_batches=n_batches, prefix="valid" ) extensions = [ ProgressBar(), Timing(every_n_batches=n_batches), train_monitor, valid_monitor, TrackTheBest("valid_nll", every_n_batches=n_batches),
# # cost += reg * abs(p).sum() # cost += reg * (p ** 2).sum() # param_nans += T.isnan(p).sum() # name = params[i].name # params[i] = params[i].mean() # params[i].name = name + str(i) cost.name = "final_cost" # param_nans.name = 'params_nans' ## Training algorithm # algorithm.initialize() algorithm.add_updates(cg.updates) extensions = [FinishAfter(after_n_epochs=5000), DataStreamMonitoring( [cost, error_rate, mistake_rate], data_stream=test, updates=cg.updates, prefix="test"), TrainingDataMonitoring( [cost, # hidden_states, debug_val, param_nans, aggregation.mean(algorithm.total_gradient_norm)], #+ params, prefix="train", after_epoch=True), Timing(), Printing(), ProgressBar()]
main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses, sequence_length) ### optimization algorithm definition step_rule = CompositeRule([ StepClipping(1.), #Momentum(learning_rate=args.learning_rate, momentum=0.9), RMSProp(learning_rate=args.learning_rate, decay_rate=0.5), ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor (after epoch to limit the log size) step_channels = [] step_channels.extend([ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name) for name, param in model.get_parameter_dict().items()]) step_channels.append(algorithm.total_step_norm.copy(name="total_step_norm")) step_channels.append(algorithm.total_gradient_norm.copy(name="total_gradient_norm")) step_channels.extend(graphs["training"].outputs) logger.warning("constructing training data monitor") extensions.append(TrainingDataMonitoring( step_channels, prefix="iteration", after_batch=False))
def train(cli_params):
    """Train a ladder network and return the training log as a DataFrame.

    Sets up file logging under the prepared save directory, builds the model
    from (possibly restored) parameters, trains with Adam while tracking BN
    running averages, and evaluates validation error both with approximate
    (running-average) and final (re-estimated) BN statistics.

    Parameters
    ----------
    cli_params : dict
        Command-line parameters; must contain 'save_to'.  'save_dir' is
        filled in by this function.

    Returns
    -------
    pandas.DataFrame or None
        The main-loop log as a dataframe, or None if training was
        interrupted mid-epoch.
    """
    cli_params["save_dir"] = prepare_dir(cli_params["save_to"])
    logfile = os.path.join(cli_params["save_dir"], "log.txt")

    # Log also DEBUG to a file
    fh = logging.FileHandler(filename=logfile)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    logger.info("Logging into %s" % logfile)

    p, loaded = load_and_log_params(cli_params)
    in_dim, data, whiten, cnorm = setup_data(p, test_set=False)
    if not loaded:
        # Set the zero layer to match input dimensions
        p.encoder_layers = (in_dim,) + p.encoder_layers

    ladder = setup_model(p)

    # Training
    all_params = ComputationGraph([ladder.costs.total]).parameters
    logger.info("Found the following parameters: %s" % str(all_params))

    # Fetch all batch normalization updates. They are in the clean path.
    bn_updates = ComputationGraph([ladder.costs.class_clean]).updates
    assert "counter" in [u.name for u in bn_updates.keys()], \
        "No batch norm params in graph - the graph has been cut?"

    training_algorithm = GradientDescent(
        cost=ladder.costs.total, params=all_params,
        step_rule=Adam(learning_rate=ladder.lr))
    # In addition to actual training, also do BN variable approximations
    training_algorithm.add_updates(bn_updates)

    model = Model(ladder.costs.total)

    monitored_variables = [
        ladder.costs.class_corr,
        ladder.costs.class_clean,
        ladder.error,
        # training_algorithm.total_gradient_norm,
        ladder.costs.total,
    ]  # + ladder.costs.denois.values()

    # Make a global history recorder so that we can get summary at end of
    # training when we write to Sentinel
    # global_history records all relevant monitoring vars
    # updated by SaveLog every time
    global_history = {}

    main_loop = MainLoop(
        training_algorithm,
        # Datastream used for training
        make_datastream(
            data.train,
            data.train_ind,
            p.batch_size,
            n_labeled=p.labeled_samples,
            n_unlabeled=p.unlabeled_samples,
            whiten=whiten,
            cnorm=cnorm,
        ),
        model=model,
        extensions=[
            FinishAfter(after_n_epochs=p.num_epochs),
            # write out to sentinel file for experiment automator to work
            SentinelWhenFinish(save_dir=p.save_dir,
                               global_history=global_history),
            # This will estimate the validation error using
            # running average estimates of the batch normalization
            # parameters, mean and variance
            ApproxTestMonitoring(
                monitored_variables,
                make_datastream(
                    data.valid,
                    data.valid_ind,
                    p.valid_batch_size,
                    whiten=whiten,
                    cnorm=cnorm,
                    scheme=ShuffledScheme
                ),
                prefix="valid_approx",
            ),
            # This Monitor is slower, but more accurate since it will first
            # estimate batch normalization parameters from training data and
            # then do another pass to calculate the validation error.
            FinalTestMonitoring(
                monitored_variables,
                make_datastream(
                    data.train,
                    data.train_ind,
                    p.batch_size,
                    n_labeled=p.labeled_samples,
                    whiten=whiten,
                    cnorm=cnorm,
                    scheme=ShuffledScheme,
                ),
                make_datastream(
                    data.valid,
                    data.valid_ind,
                    p.valid_batch_size,
                    n_labeled=len(data.valid_ind),
                    whiten=whiten,
                    cnorm=cnorm,
                    scheme=ShuffledScheme,
                ),
                prefix="valid_final",
                after_n_epochs=p.num_epochs,
            ),
            TrainingDataMonitoring(variables=monitored_variables,
                                   prefix="train",
                                   after_epoch=True),
            SaveParams("valid_approx_cost_class_corr", model, p.save_dir),
            # SaveParams(None, all_params, p.save_dir, after_epoch=True),
            SaveExpParams(p, p.save_dir, before_training=True),
            SaveLog(save_dir=p.save_dir,
                    after_epoch=True,
                    global_history=global_history),
            Printing(),
            # ShortPrinting(short_prints),
            LRDecay(ladder.lr,
                    p.num_epochs * p.lrate_decay,
                    p.num_epochs,
                    after_epoch=True),
        ],
    )
    main_loop.run()

    # Get results
    df = main_loop.log.to_dataframe()
    col = "valid_final_error_rate"
    logger.info("%s %g" % (col, df[col].iloc[-1]))

    if main_loop.log.status["epoch_interrupt_received"]:
        return None
    return df