def test_track_the_best(): main_loop = MockMainLoop() extension = TrackTheBest("cost") extension.main_loop = main_loop main_loop.status["epochs_done"] += 1 main_loop.status["iterations_done"] += 10 main_loop.log.current_row["cost"] = 5 extension.dispatch("after_epoch") assert main_loop.status["best_cost"] == 5 assert main_loop.log.current_row["cost_best_so_far"] main_loop.status["epochs_done"] += 1 main_loop.status["iterations_done"] += 10 main_loop.log.current_row["cost"] = 6 extension.dispatch("after_epoch") assert main_loop.status["best_cost"] == 5 assert main_loop.log.current_row.get("cost_best_so_far", None) is None main_loop.status["epochs_done"] += 1 main_loop.status["iterations_done"] += 10 main_loop.log.current_row["cost"] = 5 extension.dispatch("after_epoch") assert main_loop.status["best_cost"] == 5 assert main_loop.log.current_row.get("cost_best_so_far", None) is None main_loop.status["epochs_done"] += 1 main_loop.status["iterations_done"] += 10 main_loop.log.current_row["cost"] = 4 extension.dispatch("after_epoch") assert main_loop.status["best_cost"] == 4 assert main_loop.log.current_row["cost_best_so_far"]
def test_track_the_best(): main_loop = MockMainLoop() extension = TrackTheBest("cost") extension.main_loop = main_loop main_loop.status['iterations_done'] += 1 main_loop.log.current_row['cost'] = 5 extension.dispatch('after_batch') assert main_loop.status['best_cost'] == 5 assert main_loop.log.current_row['cost_best_so_far'] main_loop.status['iterations_done'] += 1 main_loop.log.current_row['cost'] = 6 extension.dispatch('after_batch') assert main_loop.status['best_cost'] == 5 assert main_loop.log.current_row.get('cost_best_so_far', None) is None main_loop.status['iterations_done'] += 1 main_loop.log.current_row['cost'] = 5 extension.dispatch('after_batch') assert main_loop.status['best_cost'] == 5 assert main_loop.log.current_row.get('cost_best_so_far', None) is None main_loop.status['iterations_done'] += 1 main_loop.log.current_row['cost'] = 4 extension.dispatch('after_batch') assert main_loop.status['best_cost'] == 4 assert main_loop.log.current_row['cost_best_so_far']
def test_save_the_best(): with NamedTemporaryFile(dir=config.temp_dir) as dst,\ NamedTemporaryFile(dir=config.temp_dir) as dst_best: track_cost = TrackTheBest("cost", after_epoch=False, after_batch=True) main_loop = MockMainLoop( extensions=[FinishAfter(after_n_epochs=1), WriteCostExtension(), track_cost, Checkpoint(dst.name, after_batch=True, save_separately=['log']) .add_condition( ["after_batch"], OnLogRecord(track_cost.notification_name), (dst_best.name,))]) main_loop.run() assert main_loop.log[4]['saved_to'] == (dst.name, dst_best.name) assert main_loop.log[5]['saved_to'] == (dst.name, dst_best.name) assert main_loop.log[6]['saved_to'] == (dst.name,) with open(dst_best.name, 'rb') as src: assert load(src).log.status['iterations_done'] == 5 root, ext = os.path.splitext(dst_best.name) log_path = root + "_log" + ext with open(log_path, 'rb') as src: assert cPickle.load(src).status['iterations_done'] == 5
def track_best(channel, save_path): tracker = TrackTheBest(channel, choose_best=min) checkpoint = saveload.Checkpoint(save_path, after_training=False, use_cpickle=True) checkpoint.add_condition(["after_epoch"], predicate=predicates.OnLogRecord( '{0}_best_so_far'.format(channel))) return [tracker, checkpoint]
def track_best(self, channel, save_path=None, choose_best=min): tracker = TrackTheBest(channel, choose_best=choose_best) self.extensions.append(tracker) if save_path: checkpoint = saveload.Checkpoint(save_path, after_training=False, use_cpickle=True) checkpoint.add_condition(["after_epoch"], predicate=predicates.OnLogRecord( '{0}_best_so_far'.format(channel))) self.extensions.append(checkpoint)
def track_best(channel, save_path, last_path, num_epochs, maxEpochs, maxIterations, epsilon, tempSharedData): tracker = TrackTheBest(channel) finishNoimprove = FinishIfNoImprovementEpsilonAfter( 'valid_MRR_best_so_far', epochs=num_epochs, epsilon=epsilon) lastcheckpoint = myCheckpoint(last_path, tempSharedData) checkpoint = myCheckpoint(save_path, tempSharedData, after_training=False, use_cpickle=True) checkpoint.add_condition(["after_epoch"], predicate=predicates.OnLogRecord( '{0}_best_so_far'.format(channel))) finishAfter = FinishAfter(after_n_epochs=maxEpochs, after_n_batches=maxIterations) return [tracker, finishNoimprove, lastcheckpoint, checkpoint, finishAfter]
def test_save_the_best(): skip_if_configuration_set('log_backend', 'sqlite', "Known to be flaky with SQLite log backend.") with NamedTemporaryFile(dir=config.temp_dir) as dst,\ NamedTemporaryFile(dir=config.temp_dir) as dst_best: track_cost = TrackTheBest("cost", after_epoch=False, after_batch=True) main_loop = MockMainLoop(extensions=[ FinishAfter(after_n_epochs=1), WriteCostExtension(), track_cost, Checkpoint(dst.name, after_batch=True, save_separately=['log']). add_condition(["after_batch"], OnLogRecord(track_cost.notification_name), ( dst_best.name, )) ]) main_loop.run() assert main_loop.log[4]['saved_to'] == (dst.name, dst_best.name) assert main_loop.log[5]['saved_to'] == (dst.name, dst_best.name) assert main_loop.log[6]['saved_to'] == (dst.name, ) with open(dst_best.name, 'rb') as src: assert load(src).log.status['iterations_done'] == 5
host_plot = 'http://tfjgeorge.com:5006' cost.name = 'cost' valid_cost.name = 'valid_cost' extensions = [ Timing(), TrainingDataMonitoring([cost], after_epoch=True, prefix='train'), DataStreamMonitoring(variables=[valid_cost], data_stream=valid_stream), Plot('%s %s' % ( socket.gethostname(), datetime.datetime.now(), ), channels=[['train_cost', 'valid_cost']], after_epoch=True, server_url=host_plot), TrackTheBest('valid_cost'), Checkpoint('model', save_separately=["model", "log"]), FinishIfNoImprovementAfter('valid_cost_best_so_far', epochs=5), #FinishAfter(after_n_epochs=100), Printing() ] from blocks.main_loop import MainLoop main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def main(): nclasses = 27 import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=1) parser.add_argument("--length", type=int, default=180) parser.add_argument("--num-epochs", type=int, default=100) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--learning-rate", type=float, default=1e-3) parser.add_argument("--epsilon", type=float, default=1e-5) parser.add_argument("--num-hidden", type=int, default=1000) parser.add_argument("--baseline", action="store_true") parser.add_argument("--initialization", choices="identity glorot orthogonal uniform".split(), default="identity") parser.add_argument("--initial-gamma", type=float, default=1e-1) parser.add_argument("--initial-beta", type=float, default=0) parser.add_argument("--cluster", action="store_true") parser.add_argument("--activation", choices=list(activations.keys()), default="tanh") parser.add_argument("--optimizer", choices="sgdmomentum adam rmsprop", default="rmsprop") parser.add_argument("--continue-from") parser.add_argument("--evaluate") parser.add_argument("--dump-hiddens") args = parser.parse_args() np.random.seed(args.seed) blocks.config.config.default_seed = args.seed if args.continue_from: from blocks.serialization import load main_loop = load(args.continue_from) main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses) ### optimization algorithm definition if args.optimizer == "adam": optimizer = Adam(learning_rate=args.learning_rate) elif args.optimizer == "rmsprop": optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9) elif args.optimizer == "sgdmomentum": optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99) step_rule = CompositeRule([ StepClipping(1.), optimizer, ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor step_channels = [] step_channels.extend([ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name) for name, param in model.get_parameter_dict().items() ]) step_channels.append( algorithm.total_step_norm.copy(name="total_step_norm")) step_channels.append( algorithm.total_gradient_norm.copy(name="total_gradient_norm")) step_channels.extend(graphs["training"].outputs) logger.warning("constructing training data monitor") extensions.append( TrainingDataMonitoring(step_channels, prefix="iteration", after_batch=True)) # parameter monitor extensions.append( DataStreamMonitoring([ param.norm(2).copy(name="parameter.norm:%s" % name) for name, param in model.get_parameter_dict().items() ], data_stream=None, after_epoch=True)) validation_interval = 500 # performance monitor for situation in "training inference".split(): if situation == "inference" and not args.evaluate: # save time when we don't need the inference graph continue for which_set in "train valid test".split(): logger.warning("constructing %s %s monitor" % (which_set, situation)) channels = list(graphs[situation].outputs) extensions.append( DataStreamMonitoring(channels, prefix="%s_%s" % (which_set, situation), every_n_batches=validation_interval, data_stream=get_stream( which_set=which_set, batch_size=args.batch_size, num_examples=10000, length=args.length))) extensions.extend([ TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"), DumpBest("best_valid_training_error_rate", "best.zip"), FinishAfter(after_n_epochs=args.num_epochs), #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50), Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True), DumpLog("log.pkl", after_epoch=True) ]) if not args.cluster: extensions.append(ProgressBar()) extensions.extend([ Timing(), Printing(every_n_batches=validation_interval), PrintingTo("log"), ]) main_loop = MainLoop(data_stream=get_stream(which_set="train", batch_size=args.batch_size, length=args.length, augment=True), algorithm=algorithm, extensions=extensions, model=model) if args.dump_hiddens: dump_hiddens(args, main_loop) return if args.evaluate: evaluate(args, main_loop) return main_loop.run()
def train_language_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed): c = config if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed data, lm, retrieval = initialize_data_and_model(config) # full main loop can be saved... main_loop_path = os.path.join(save_path, 'main_loop.tar') # or only state (log + params) which can be useful not to pickle embeddings state_path = os.path.join(save_path, 'training_state.tar') stream_path = os.path.join(save_path, 'stream.pkl') best_tar_path = os.path.join(save_path, "best_model.tar") words = tensor.ltensor3('words') words_mask = tensor.matrix('words_mask') if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4, max_length=5).get_epoch_iterator()) words.tag.test_value = test_value_data[0] words_mask.tag.test_value = test_value_data[1] costs, updates = lm.apply(words, words_mask) cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(words.shape[1], 'length') perplexity, = VariableFilter(name='perplexity')(cg) perplexities = VariableFilter(name_regex='perplexity.*')(cg) monitored_vars = [length, cost] + perplexities if c['dict_path']: num_definitions, = VariableFilter(name='num_definitions')(cg) monitored_vars.extend([num_definitions]) parameters = cg.get_parameter_dict() trained_parameters = parameters.values() saved_parameters = parameters.values() if c['embedding_path']: logger.debug("Exclude word embeddings from the trained parameters") trained_parameters = [ p for p in trained_parameters if not p == lm.get_def_embeddings_params() ] saved_parameters = [ p for p in saved_parameters if not p == lm.get_def_embeddings_params() ] if c['cache_size'] != 0: logger.debug("Enable fake recursivity for looking up embeddings") trained_parameters = [ p for p in trained_parameters if not p == lm.get_cache_params() ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) if c['cache_size'] != 0: algorithm.add_updates(updates) train_monitored_vars = list(monitored_vars) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg) main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg) train_monitored_vars.extend([word_emb_RMS, main_rnn_in_RMS]) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) # We use a completely random seed on purpose. With Fuel server # it's currently not possible to restore the state of the training # stream. That's why it's probably better to just have it stateless. stream_seed = numpy.random.randint(0, 10000000) if fuel_server else None training_stream = data.get_stream('train', batch_size=c['batch_size'], max_length=c['max_length'], seed=stream_seed) valid_stream = data.get_stream('valid', batch_size=c['batch_size_valid'], max_length=c['max_length'], seed=stream_seed) original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) validation = DataStreamMonitoring(monitored_vars, valid_stream, prefix="valid").set_conditions( before_first_epoch=not fast_start, on_resumption=True, every_n_batches=c['mon_freq_valid']) track_the_best = TrackTheBest(validation.record_name(perplexity), choose_best=min).set_conditions( on_resumption=True, after_epoch=True, every_n_batches=c['mon_freq_valid']) # don't save them the entire main loop to avoid pickling everything if c['fast_checkpoint']: load = (LoadNoUnpickling(state_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_main_loop': False, 'save_separately': ['log', 'iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(state_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches']: intermediate_cp = IntermediateCheckpoint( state_path, every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) else: load = (Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) cp_args = { 'save_separately': ['iteration_state'], 'parameters': saved_parameters } checkpoint = Checkpoint(main_loop_path, before_training=not fast_start, every_n_batches=c['save_freq_batches'], after_training=not fast_start, **cp_args) if c['checkpoint_every_n_batches']: intermediate_cp = IntermediateCheckpoint( main_loop_path, every_n_batches=c['checkpoint_every_n_batches'], after_training=False, **cp_args) checkpoint = checkpoint.add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (best_tar_path, )) extensions = [ load, StartFuelServer(original_training_stream, stream_path, before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']) ] if retrieval: extensions.append( RetrievalPrintStats(retrieval=retrieval, every_n_batches=c['mon_freq_train'], before_training=not fast_start)) extensions.extend([ TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), validation, track_the_best, checkpoint ]) if c['checkpoint_every_n_batches']: extensions.append(intermediate_cp) extensions.extend([ DumpTensorflowSummaries(save_path, every_n_batches=c['mon_freq_train'], after_training=True), Printing(on_resumption=True, every_n_batches=c['mon_freq_train']), FinishIfNoImprovementAfter(track_the_best.notification_name, iterations=50 * c['mon_freq_valid'], every_n_batches=c['mon_freq_valid']), FinishAfter(after_n_batches=c['n_batches']) ]) logger.info("monitored variables during training:" + "\n" + pprint.pformat(train_monitored_vars, width=120)) logger.info("monitored variables during valid:" + "\n" + pprint.pformat(monitored_vars, width=120)) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def build_and_run(save_to,modelconfig,experimentconfig): """ part of this is adapted from lasagne tutorial""" n, num_filters, image_size, num_blockstack = modelconfig['depth'], modelconfig['num_filters'], modelconfig['image_size'], modelconfig['num_blockstack'] print("Amount of bottlenecks: %d" % n) # Prepare Theano variables for inputs and targets input_var = T.tensor4('image_features') #target_value = T.ivector('targets') target_var = T.lmatrix('targets') target_vec = T.extra_ops.to_one_hot(target_var[:,0],2) #target_var = T.matrix('targets') # Create residual net model print("Building model...") network = build_cnn(input_var, image_size, n, num_blockstack, num_filters) get_info(network) prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network)) test_prediction = lasagne.utils.as_theano_expression(lasagne.layers.get_output(network,deterministic=True)) # Loss function -> The objective to minimize print("Instanciation of loss function...") #loss = CategoricalCrossEntropy().apply(target_var.flatten(), prediction) #test_loss = CategoricalCrossEntropy().apply(target_var.flatten(), test_prediction) # loss = lasagne.objectives.categorical_crossentropy(prediction, target_var.flatten()).mean() # test_loss = lasagne.objectives.categorical_crossentropy(test_prediction, target_var.flatten()).mean() loss = lasagne.objectives.squared_error(prediction,target_vec).mean() test_loss = lasagne.objectives.squared_error(test_prediction,target_vec).mean() # loss = tensor.nnet.binary_crossentropy(prediction, target_var).mean() # test_loss = tensor.nnet.binary_crossentropy(test_prediction, target_var).mean() test_loss.name = "loss" # loss.name = 'x-ent_error' # loss.name = 'sqr_error' layers = lasagne.layers.get_all_layers(network) #l1 and l2 regularization #pondlayers = {x:0.000025 for i,x in enumerate(layers)} #l1_penality = lasagne.regularization.regularize_layer_params_weighted(pondlayers, lasagne.regularization.l2) #l2_penality = lasagne.regularization.regularize_layer_params(layers[len(layers)/4:], lasagne.regularization.l1) * 25e-6 #reg_penalty = l1_penality + l2_penality #reg_penalty.name = 'reg_penalty' #loss = loss + reg_penalty loss.name = 'reg_loss' error_rate = MisclassificationRate().apply(target_var.flatten(), test_prediction).copy( name='error_rate') # Load the dataset print("Loading data...") istest = 'test' in experimentconfig.keys() if istest: print("Using test stream") train_stream, valid_stream, test_stream = get_stream(experimentconfig['batch_size'],image_size,test=istest) # Defining step rule and algorithm if 'step_rule' in experimentconfig.keys() and not experimentconfig['step_rule'] is None : step_rule = experimentconfig['step_rule'](learning_rate=experimentconfig['learning_rate']) else : step_rule=Scale(learning_rate=experimentconfig['learning_rate']) params = map(lasagne.utils.as_theano_expression,lasagne.layers.get_all_params(network, trainable=True)) print("Initializing algorithm") algorithm = GradientDescent( cost=loss, gradients={var:T.grad(loss,var) for var in params},#parameters=cg.parameters, #params step_rule=step_rule) #algorithm.add_updates(extra_updates) grad_norm = aggregation.mean(algorithm.total_gradient_norm) grad_norm.name = "grad_norm" print("Initializing extensions...") plot = Plot(save_to, channels=[['train_loss','valid_loss'], ['train_grad_norm'], #['train_grad_norm','train_reg_penalty'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042') checkpoint = Checkpoint('models/best_'+save_to+'.tar') # checkpoint.add_condition(['after_n_batches=25'], checkpoint.add_condition(['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far')) #Defining extensions extensions = [Timing(), FinishAfter(after_n_epochs=experimentconfig['num_epochs'], after_n_batches=experimentconfig['num_batches']), TrainingDataMonitoring([test_loss, error_rate, grad_norm], # reg_penalty], prefix="train", after_epoch=True), #after_n_epochs=1 DataStreamMonitoring([test_loss, error_rate],valid_stream,prefix="valid", after_epoch=True), #after_n_epochs=1 plot, #Checkpoint(save_to,after_n_epochs=5), #ProgressBar(), # Plot(save_to, channels=[['train_loss','valid_loss'], ['train_error_rate','valid_error_rate']], server_url='http://hades.calculquebec.ca:5042'), #'grad_norm' # after_batch=True), Printing(after_epoch=True), TrackTheBest('valid_error_rate',min), #Keep best checkpoint, #Save best FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=5)] # Early-stopping # model = Model(loss) # print("Model",model) main_loop = MainLoop( algorithm, train_stream, # model=model, extensions=extensions) print("Starting main loop...") main_loop.run()
[algorithm.total_step_norm, algorithm.total_gradient_norm], every_n_batches=n_batches, prefix="train") valid_monitor = DataStreamMonitoring(variables, valid_stream, every_n_batches=n_batches, prefix="valid") from utils import _is_nan extensions = extensions = [ train_monitor, valid_monitor, TrackTheBest('valid_nll', every_n_batches=n_batches, before_first_epoch=True), Timing(every_n_batches=n_batches), Printing(every_n_batches=n_batches), Write(generator, save_name=save_dir + "samples/" + exp_name + ".png").add_condition( ["after_batch"], predicate=OnLogRecord('valid_nll_best_so_far')), FinishAfter().add_condition(["after_batch"], _is_nan), #Checkpoint(save_dir + "pkl/" + exp_name + ".pkl",after_epoch = True), Checkpoint(save_dir + "pkl/best_" + exp_name + ".pkl").add_condition( ["after_batch"], predicate=OnLogRecord('valid_nll_best_so_far')), SaveComputationGraph(emit), Plot(save_dir + "progress/" + exp_name + ".png", [['train_nll', 'valid_nll']], every_n_batches=5 * n_batches, email=False),
def run_pretrain(model, hyper_params, cost, train_data, valid_data=None, extra_costs=None): """ generic training method for neural networks; works with any network structure :return: """ from fuel.streams import DataStream from fuel.schemes import SequentialScheme, ShuffledScheme from blocks.filter import VariableFilter from blocks.graph import ComputationGraph from blocks.roles import WEIGHT from blocks.algorithms import GradientDescent, Adam, RMSProp, Scale from blocks.extensions import FinishAfter, Timing, Printing, ProgressBar from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.extensions.predicates import OnLogRecord from blocks.monitoring import aggregation from blocks.main_loop import MainLoop from blocks.extensions.training import TrackTheBest from deepthought.extensions.parameters import BestParams if extra_costs is None: extra_costs = [] cg = ComputationGraph([cost]) # TODO: more hyper-params for regularization # L1 regularization if hyper_params['l1wdecay'] > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + hyper_params['l1wdecay'] * sum([abs(W).sum() for W in weights]) cost.name = 'cost' # set up step_rule if hyper_params['step_rule'] == 'Adam': step_rule = Adam(learning_rate=hyper_params['learning_rate']) elif hyper_params['step_rule'] == 'RMSProp': step_rule = RMSProp(learning_rate=hyper_params['learning_rate']) #, decay_rate=0.9, max_scaling=1e5) else: step_rule = Scale(learning_rate=hyper_params['learning_rate']) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=step_rule) if 'blocks_print_variable_names' in hyper_params and hyper_params['blocks_print_variable_names']: print 'cg.variables:', cg.variables train_monitoring_vars = [cost] + extra_costs + [aggregation.mean(algorithm.total_gradient_norm)] for var_name in hyper_params['blocks_extensions_train_monitoring_channels']: for v in cg.variables: if v.name == var_name: print 'Monitoring variable:', v train_monitoring_vars.append(v) # default extensions extensions = [Timing(), FinishAfter(after_n_epochs=hyper_params['max_epochs']), TrainingDataMonitoring( train_monitoring_vars, suffix="train", after_epoch=True) ] # additional stuff if validation set is used if valid_data is not None: valid_monitoring_vars = [cost] + extra_costs for var_name in hyper_params['blocks_extensions_valid_monitoring_channels']: for v in cg.variables: if v.name == var_name: print 'Monitoring variable:', v valid_monitoring_vars.append(v) extensions.append( DataStreamMonitoring( valid_monitoring_vars, DataStream.default_stream( valid_data, iteration_scheme=SequentialScheme( valid_data.num_examples, hyper_params['batch_size'])), suffix="valid")) best_channel = 'cost_valid' print '#train:', train_data.num_examples, '#valid:', valid_data.num_examples else: best_channel = 'cost_train' print '#train:', train_data.num_examples # tracking of the best best_params = BestParams() best_params.add_condition(['after_epoch'], predicate=OnLogRecord(best_channel + '_best_so_far')) extensions.append(TrackTheBest(best_channel)) extensions.append(best_params) # after TrackTheBest! # printing and plotting if hyper_params['blocks_extensions_printing'] is True: extensions.append(Printing()) # optional if hyper_params['blocks_extensions_progressbar'] is True: extensions.append(ProgressBar()) if hyper_params['blocks_extensions_bokeh'] is True: try: from blocks_extras.extensions.plot import Plot bokeh_available = True except: bokeh_available = False print 'bokeh available: ', bokeh_available if bokeh_available: extensions.append(Plot( hyper_params['blocks_extensions_bokeh_plot_title'], channels=hyper_params['blocks_extensions_bokeh_channels'], )) main_loop = MainLoop( algorithm, DataStream.default_stream( train_data, iteration_scheme=ShuffledScheme( train_data.num_examples, hyper_params['batch_size'])), model=model, extensions=extensions) main_loop.run() return best_params.values, main_loop.status['best_' + best_channel]
def construct_main_loop(name, task_name, patch_shape, batch_size, n_spatial_dims, n_patches, max_epochs, patience_epochs, learning_rate, gradient_limiter, hyperparameters, **kwargs): task = tasks.get_task(**hyperparameters) hyperparameters["n_channels"] = task.n_channels extensions = [] # let theta noise decay as training progresses for key in "location_std scale_std".split(): hyperparameters[key] = theano.shared(hyperparameters[key], name=key) extensions.append( util.ExponentialDecay(hyperparameters[key], hyperparameters["%s_decay" % key], after_batch=True)) print "constructing graphs..." graphs, outputs, updates = construct_graphs(task=task, **hyperparameters) print "setting up main loop..." from blocks.model import Model model = Model(outputs["train"]["cost"]) from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp from extensions import Compressor if gradient_limiter == "clip": limiter = StepClipping(1.) elif gradient_limiter == "compress": limiter = Compressor() else: raise ValueError() algorithm = GradientDescent( cost=outputs["train"]["cost"], parameters=graphs["train"].parameters, step_rule=CompositeRule([limiter, Adam(learning_rate=learning_rate)])) algorithm.add_updates(updates["train"]) extensions.extend( construct_monitors(algorithm=algorithm, task=task, model=model, graphs=graphs, outputs=outputs, updates=updates, **hyperparameters)) from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing from blocks.extensions.stopping import FinishIfNoImprovementAfter from blocks.extensions.training import TrackTheBest from blocks.extensions.saveload import Checkpoint from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog extensions.extend([ TrackTheBest("valid_error_rate", "best_valid_error_rate"), FinishIfNoImprovementAfter("best_valid_error_rate", epochs=patience_epochs), FinishAfter(after_n_epochs=max_epochs), DumpBest("best_valid_error_rate", name + "_best.zip"), Checkpoint(hyperparameters["checkpoint_save_path"], on_interrupt=False, every_n_epochs=10, use_cpickle=True), DumpLog("log.pkl", after_epoch=True), ProgressBar(), Timing(), Printing(), PrintingTo(name + "_log"), DumpGraph(name + "_grad_graph") ]) from blocks.main_loop import MainLoop main_loop = MainLoop(data_stream=task.get_stream("train"), algorithm=algorithm, extensions=extensions, model=model) from tabulate import tabulate print "parameter sizes:" print tabulate( (key, "x".join(map(str, value.get_value().shape)), value.get_value().size) for key, value in main_loop.model.get_parameter_dict().items()) return main_loop
def training(self, fea2obj, batch_size, learning_rate=0.005, steprule='adagrad', wait_epochs=5, kl_weight_init=None, klw_ep=50, klw_inc_rate=0, num_epochs=None): networkfile = self._config['net'] n_epochs = num_epochs or int(self._config['nepochs']) reg_weight = float(self._config['loss_weight']) reg_type = self._config['loss_reg'] numtrain = int( self._config['num_train']) if 'num_train' in self._config else None train_stream, num_samples_train = get_comb_stream( fea2obj, 'train', batch_size, shuffle=True, num_examples=numtrain) dev_stream, num_samples_dev = get_comb_stream(fea2obj, 'dev', batch_size=None, shuffle=False) logger.info('sources: %s -- number of train/dev samples: %d/%d', train_stream.sources, num_samples_train, num_samples_dev) t2idx = fea2obj['targets'].t2idx klw_init = kl_weight_init or float( self._config['kld_weight']) if 'kld_weight' in self._config else 1 logger.info('kl_weight_init: %d', klw_init) kl_weight = shared_floatx(klw_init, 'kl_weight') entropy_weight = shared_floatx(1., 'entropy_weight') cost, p_at_1, _, KLD, logpy_xz, pat1_recog, misclassify_rate = build_model_new( fea2obj, len(t2idx), self._config, kl_weight, entropy_weight) cg = ComputationGraph(cost) weights = VariableFilter(roles=[WEIGHT])(cg.parameters) logger.info('Model weights are: %s', weights) if 'L2' in reg_type: cost += reg_weight * l2_norm(weights) logger.info('applying %s with weight: %f ', reg_type, reg_weight) dropout = -0.1 if dropout > 0: cg = apply_dropout(cg, weights, dropout) cost = cg.outputs[0] cost.name = 'cost' logger.info('Our Algorithm is : %s, and learning_rate: %f', steprule, learning_rate) if 'adagrad' in steprule: cnf_step_rule = AdaGrad(learning_rate) elif 'adadelta' in steprule: cnf_step_rule = AdaDelta(decay_rate=0.95) elif 'decay' in steprule: cnf_step_rule = RMSProp(learning_rate=learning_rate, decay_rate=0.90) cnf_step_rule = CompositeRule([cnf_step_rule, StepClipping(1)]) elif 'momentum' in steprule: cnf_step_rule = Momentum(learning_rate=learning_rate, momentum=0.9) elif 'adam' in steprule: cnf_step_rule = Adam(learning_rate=learning_rate) else: logger.info('The steprule param is wrong! which is: %s', steprule) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=cnf_step_rule, on_unused_sources='warn') #algorithm.add_updates(updates) gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [ cost, gradient_norm, step_norm, p_at_1, KLD, logpy_xz, kl_weight, pat1_recog ] train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True, before_first_epoch=True, prefix='tra') dev_monitor = DataStreamMonitoring(variables=[ cost, p_at_1, KLD, logpy_xz, pat1_recog, misclassify_rate ], after_epoch=True, before_first_epoch=True, data_stream=dev_stream, prefix="dev") extensions = [ dev_monitor, train_monitor, Timing(), TrackTheBest('dev_cost'), FinishIfNoImprovementAfter('dev_cost_best_so_far', epochs=wait_epochs), Printing(after_batch=False), #, ProgressBar() FinishAfter(after_n_epochs=n_epochs), saveload.Load(networkfile + '.toload.pkl'), ] + track_best('dev_cost', networkfile + '.best.pkl') #extensions.append(SharedVariableModifier(kl_weight, # lambda n, klw: numpy.cast[theano.config.floatX] (klw_inc_rate + klw), after_epoch=False, every_n_epochs=klw_ep, after_batch=False)) # extensions.append(SharedVariableModifier(entropy_weight, # lambda n, crw: numpy.cast[theano.config.floatX](crw - klw_inc_rate), after_epoch=False, every_n_epochs=klw_ep, after_batch=False)) logger.info('number of parameters in the model: %d', tensor.sum([p.size for p in cg.parameters]).eval()) logger.info('Lookup table sizes: %s', [p.size.eval() for p in cg.parameters if 'lt' in p.name]) main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, model=Model(cost), extensions=extensions) main_loop.run()
extensions = [] if args.load_experiment and (not worker or worker.is_main_worker): extensions += [Load(os.path.join( save_dir, "pkl", load_prefix + args.load_experiment + ".tar"))] extensions += [ Timing(every_n_batches=args.save_every), train_monitor] if not worker or worker.is_main_worker: extensions += [ valid_monitor, TrackTheBest( 'valid_nll', every_n_batches=args.save_every, before_first_epoch=True), Plot( os.path.join(save_dir, "progress", exp_name + ".png"), plot_names, every_n_batches=args.save_every, email=False), Checkpoint( os.path.join(save_dir, "pkl", "best_" + exp_name + ".tar"), after_training=False, save_separately=['log'], use_cpickle=True, save_main_loop=False, before_first_epoch=True) .add_condition( ["after_batch", "before_training"],
error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### GET THE DATA ##################### stream_train = ServerDataStream(('image_features', 'targets'), False, port=5652, hwm=50) stream_valid = ServerDataStream(('image_features', 'targets'), False, port=5653, hwm=50) ########### DEFINE THE ALGORITHM ############# track_cost = TrackTheBest("cost", after_epoch=True, after_batch=False) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum(learning_rate=0.0001, momentum=0.9)) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid",
theano.function([x, x_m], y_hat_softmax), before_first_epoch=False, every_n_epochs=5, prefix='testPER') checkpoint = Checkpoint(conf.path_to_model, after_training=False) checkpoint.add_condition( ['after_epoch'], predicate=predicates.OnLogRecord('valid_log_p_best_so_far')) extensions = [ val_monitor, train_monitor, per_val_monitor, per_test_monitor, Timing(), FinishAfter(after_n_epochs=conf.max_epochs), checkpoint, Printing(), TrackTheBest(record_name='val_monitor', notification_name='valid_log_p_best_so_far'), FinishIfNoImprovementAfter(notification_name='valid_log_p_best_so_far', epochs=conf.epochs_early_stopping), ] main_loop = MainLoop( algorithm=algorithm, data_stream=stream_train, model=model, extensions=extensions, ) main_loop.run()
def train_snli_model(new_training_job, config, save_path, params, fast_start, fuel_server, seed, model='simple'): if config['exclude_top_k'] > config['num_input_words'] and config[ 'num_input_words'] > 0: raise Exception("Some words have neither word nor def embedding") c = config logger = configure_logger(name="snli_baseline_training", log_file=os.path.join(save_path, "log.txt")) if not os.path.exists(save_path): logger.info("Start a new job") os.mkdir(save_path) else: logger.info("Continue an existing job") with open(os.path.join(save_path, "cmd.txt"), "w") as f: f.write(" ".join(sys.argv)) # Make data paths nice for path in [ 'dict_path', 'embedding_def_path', 'embedding_path', 'vocab', 'vocab_def', 'vocab_text' ]: if c.get(path, ''): if not os.path.isabs(c[path]): c[path] = os.path.join(fuel.config.data_path[0], c[path]) main_loop_path = os.path.join(save_path, 'main_loop.tar') main_loop_best_val_path = os.path.join(save_path, 'main_loop_best_val.tar') stream_path = os.path.join(save_path, 'stream.pkl') # Save config to save_path json.dump(config, open(os.path.join(save_path, "config.json"), "w")) if model == 'simple': nli_model, data, used_dict, used_retrieval, _ = _initialize_simple_model_and_data( c) elif model == 'esim': nli_model, data, used_dict, used_retrieval, _ = _initialize_esim_model_and_data( c) else: raise NotImplementedError() # Compute cost s1, s2 = T.lmatrix('sentence1'), T.lmatrix('sentence2') if c['dict_path']: assert os.path.exists(c['dict_path']) s1_def_map, s2_def_map = T.lmatrix('sentence1_def_map'), T.lmatrix( 'sentence2_def_map') def_mask = T.fmatrix("def_mask") defs = T.lmatrix("defs") else: s1_def_map, s2_def_map = None, None def_mask = None defs = None s1_mask, s2_mask = T.fmatrix('sentence1_mask'), T.fmatrix('sentence2_mask') y = T.ivector('label') cg = {} for train_phase in [True, False]: # NOTE: Please don't change outputs of cg if train_phase: with batch_normalization(nli_model): pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) else: pred = nli_model.apply(s1, s1_mask, s2, s2_mask, def_mask=def_mask, defs=defs, s1_def_map=s1_def_map, s2_def_map=s2_def_map, train_phase=train_phase) cost = CategoricalCrossEntropy().apply(y.flatten(), pred) error_rate = MisclassificationRate().apply(y.flatten(), pred) cg[train_phase] = ComputationGraph([cost, error_rate]) # Weight decay (TODO: Make it less bug prone) if model == 'simple': weights_to_decay = VariableFilter( bricks=[dense for dense, relu, bn in nli_model._mlp], roles=[WEIGHT])(cg[True].variables) weight_decay = np.float32(c['l2']) * sum( (w**2).sum() for w in weights_to_decay) elif model == 'esim': weight_decay = 0.0 else: raise NotImplementedError() final_cost = cg[True].outputs[0] + weight_decay final_cost.name = 'final_cost' # Add updates for population parameters if c.get("bn", True): pop_updates = get_batch_normalization_updates(cg[True]) extra_updates = [(p, m * 0.1 + p * (1 - 0.1)) for p, m in pop_updates] else: pop_updates = [] extra_updates = [] if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: loaded_params = load_parameters(src) cg[True].set_parameter_values(loaded_params) for param, m in pop_updates: param.set_value(loaded_params[get_brick( param).get_hierarchical_name(param)]) if os.path.exists(os.path.join(save_path, "main_loop.tar")): logger.warning("Manually loading BN stats :(") with open(os.path.join(save_path, "main_loop.tar")) as src: loaded_params = load_parameters(src) for param, m in pop_updates: param.set_value( loaded_params[get_brick(param).get_hierarchical_name(param)]) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', batch_size=4).get_epoch_iterator()) s1.tag.test_value = test_value_data[0] s1_mask.tag.test_value = test_value_data[1] s2.tag.test_value = test_value_data[2] s2_mask.tag.test_value = test_value_data[3] y.tag.test_value = test_value_data[4] # Freeze embeddings if not c['train_emb']: frozen_params = [ p for E in nli_model.get_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params) & set(train_params)) > 0 else: frozen_params = [] if not c.get('train_def_emb', 1): frozen_params_def = [ p for E in nli_model.get_def_embeddings_lookups() for p in E.parameters ] train_params = [p for p in cg[True].parameters] assert len(set(frozen_params_def) & set(train_params)) > 0 frozen_params += frozen_params_def train_params = [p for p in cg[True].parameters if p not in frozen_params] train_params_keys = [ get_brick(p).get_hierarchical_name(p) for p in train_params ] # Optimizer algorithm = GradientDescent(cost=final_cost, on_unused_sources='ignore', parameters=train_params, step_rule=Adam(learning_rate=c['lr'])) algorithm.add_updates(extra_updates) m = Model(final_cost) parameters = m.get_parameter_dict() # Blocks version mismatch logger.info("Trainable parameters" + "\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(train_params_keys)], width=120)) logger.info("# of parameters {}".format( sum([ np.prod(parameters[key].get_value().shape) for key in sorted(train_params_keys) ]))) ### Monitored args ### train_monitored_vars = [final_cost] + cg[True].outputs monitored_vars = cg[False].outputs val_acc = monitored_vars[1] to_monitor_names = [ 'def_unk_ratio', 's1_merged_input_rootmean2', 's1_def_mean_rootmean2', 's1_gate_rootmean2', 's1_compose_gate_rootmean2' ] for k in to_monitor_names: train_v, valid_v = VariableFilter(name=k)( cg[True]), VariableFilter(name=k)(cg[False]) if len(train_v): logger.info("Adding {} tracking".format(k)) train_monitored_vars.append(train_v[0]) monitored_vars.append(valid_v[0]) else: logger.warning("Didnt find {} in cg".format(k)) if c['monitor_parameters']: for name in train_params_keys: param = parameters[name] num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements grad_norm = algorithm.gradients[param].norm(2) / num_elements step_norm = algorithm.steps[param].norm(2) / num_elements stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' train_monitored_vars.append(stats) regular_training_stream = data.get_stream('train', batch_size=c['batch_size'], seed=seed) if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=regular_training_stream.sources, hwm=100, produces_examples=regular_training_stream.produces_examples) else: training_stream = regular_training_stream ### Build extensions ### extensions = [ # Load(main_loop_path, load_iteration_state=True, load_log=True) # .set_conditions(before_training=not new_training_job), StartFuelServer(regular_training_stream, stream_path, hwm=100, script_path=os.path.join( os.path.dirname(__file__), "../bin/start_fuel_server.py"), before_training=fuel_server), Timing(every_n_batches=c['mon_freq']), ProgressBar(), RetrievalPrintStats(retrieval=used_retrieval, every_n_batches=c['mon_freq_valid'], before_training=not fast_start), Timestamp(), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq']), ] if c['layout'] == 'snli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid', batch_size=14, seed=seed), before_training=not fast_start, on_resumption=True, after_training=True, every_n_batches=c['mon_freq_valid'], prefix='valid') extensions.append(validation) elif c['layout'] == 'mnli': validation = DataStreamMonitoring(monitored_vars, data.get_stream('valid_matched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], on_resumption=True, after_training=True, prefix='valid_matched') validation_mismatched = DataStreamMonitoring( monitored_vars, data.get_stream('valid_mismatched', batch_size=14, seed=seed), every_n_batches=c['mon_freq_valid'], before_training=not fast_start, on_resumption=True, after_training=True, prefix='valid_mismatched') extensions.extend([validation, validation_mismatched]) else: raise NotImplementedError() # Similarity trackers for embeddings if len(c.get('vocab_def', '')): retrieval_vocab = Vocabulary(c['vocab_def']) else: retrieval_vocab = data.vocab retrieval_all = Retrieval(vocab_text=retrieval_vocab, dictionary=used_dict, max_def_length=c['max_def_length'], exclude_top_k=0, max_def_per_word=c['max_def_per_word']) for name in [ 's1_word_embeddings', 's1_dict_word_embeddings', 's1_translated_word_embeddings' ]: variables = VariableFilter(name=name)(cg[False]) if len(variables): s1_emb = variables[0] logger.info("Adding similarity tracking for " + name) # A bit sloppy about downcast if "dict" in name: embedder = construct_dict_embedder(theano.function( [s1, defs, def_mask, s1_def_map], s1_emb, allow_input_downcast=True), vocab=data.vocab, retrieval=retrieval_all) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) else: embedder = construct_embedder(theano.function( [s1], s1_emb, allow_input_downcast=True), vocab=data.vocab) extensions.append( SimilarityWordEmbeddingEval( embedder=embedder, prefix=name, every_n_batches=c['mon_freq_valid'], before_training=not fast_start)) track_the_best = TrackTheBest(validation.record_name(val_acc), before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start, every_n_batches=c['mon_freq_valid'], choose_best=min) extensions.append(track_the_best) # Special care for serializing embeddings if len(c.get('embedding_path', '')) or len(c.get('embedding_def_path', '')): extensions.insert( 0, LoadNoUnpickling(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=train_params + [p for p, m in pop_updates], save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) else: extensions.insert( 0, Load(main_loop_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job)) extensions.append( Checkpoint(main_loop_path, parameters=cg[True].parameters + [p for p, m in pop_updates], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best.notification_name), (main_loop_best_val_path, ))) extensions.extend([ DumpCSVSummaries(save_path, every_n_batches=c['mon_freq_valid'], after_training=True), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_valid'], after_training=True), Printing(every_n_batches=c['mon_freq_valid']), PrintMessage(msg="save_path={}".format(save_path), every_n_batches=c['mon_freq']), FinishAfter(after_n_batches=c['n_batches']).add_condition( ['after_batch'], OnLogStatusExceed('iterations_done', c['n_batches'])) ]) logger.info(extensions) ### Run training ### if "VISDOM_SERVER" in os.environ: print("Running visdom server") ret = subprocess.Popen([ os.path.join(os.path.dirname(__file__), "../visdom_plotter.py"), "--visdom-server={}".format(os.environ['VISDOM_SERVER']), "--folder={}".format(save_path) ]) time.sleep(0.1) if ret.returncode is not None: raise Exception() atexit.register(lambda: os.kill(ret.pid, signal.SIGINT)) model = Model(cost) for p, m in pop_updates: model._parameter_dict[get_brick(p).get_hierarchical_name(p)] = p main_loop = MainLoop(algorithm, training_stream, model=model, extensions=extensions) assert os.path.exists(save_path) main_loop.run()
def test_track_the_best(): main_loop = MockMainLoop() extension = TrackTheBest("cost") extension.main_loop = main_loop main_loop.status['epochs_done'] += 1 main_loop.status['iterations_done'] += 10 main_loop.log.current_row['cost'] = 5 extension.dispatch('after_epoch') assert main_loop.status['best_cost'] == 5 assert main_loop.log.current_row['cost_best_so_far'] main_loop.status['epochs_done'] += 1 main_loop.status['iterations_done'] += 10 main_loop.log.current_row['cost'] = 6 extension.dispatch('after_epoch') assert main_loop.status['best_cost'] == 5 assert main_loop.log.current_row.get('cost_best_so_far', None) is None main_loop.status['epochs_done'] += 1 main_loop.status['iterations_done'] += 10 main_loop.log.current_row['cost'] = 5 extension.dispatch('after_epoch') assert main_loop.status['best_cost'] == 5 assert main_loop.log.current_row.get('cost_best_so_far', None) is None main_loop.status['epochs_done'] += 1 main_loop.status['iterations_done'] += 10 main_loop.log.current_row['cost'] = 4 extension.dispatch('after_epoch') assert main_loop.status['best_cost'] == 4 assert main_loop.log.current_row['cost_best_so_far']
after_epoch=True, data_stream=get_stream( which_set=which_set, for_evaluation=True, batch_size=args.batch_size, drop_prob=args.drop_prob, hidden_dim=args.num_hidden))) #, num_examples=1000))) if not args.baseline: save_str += 'BN_' if args.zoneout: save_str += 'zoneout_' + str(args.drop_prob) + '-' if args.elephant: save_str += 'elephant_' + str(args.drop_prob) + '-' extensions.extend([ TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"), #DumpBest("best_valid_training_error_rate", "best.zip"), FinishAfter(after_n_epochs=args.num_epochs), #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50), Checkpoint(save_str + "checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True), DumpLog(save_str + "log.pkl", after_epoch=True) ]) if not args.cluster: extensions.append(ProgressBar()) extensions.extend([ Timing(),
Adam(lr)])) train_monitor = TrainingDataMonitoring(variables=[cost], every_n_batches=n_batches, prefix="train") valid_monitor = DataStreamMonitoring( [cost], valid_stream, after_epoch=True, #before_first_epoch = False, prefix="valid") extensions = extensions = [ Timing(every_n_batches=n_batches), train_monitor, valid_monitor, TrackTheBest('valid_nll', after_epoch=True), Plot(save_dir + experiment_name + ".png", [['train_nll', 'valid_nll']], every_n_batches=4 * n_batches, email=True), Checkpoint(save_dir + experiment_name + ".pkl", use_cpickle=True, every_n_batches=n_batches * 8, after_epoch=True), Checkpoint(save_dir + "best_" + experiment_name + ".pkl", after_epoch=True, use_cpickle=True).add_condition( ['after_epoch'], predicate=OnLogRecord('valid_nll_best_so_far')), Printing(every_n_batches=n_batches, after_epoch=True), FinishAfter(after_n_epochs=2), SaveComputationGraph(emit)
prefix="valid") extensions = [] if args.load_experiment: extensions += [Load(os.path.join( save_dir, "pkl", "best_" + args.load_experiment + ".tar"))] extensions += [ Timing(every_n_batches=args.save_every), train_monitor] extensions += [ valid_monitor, TrackTheBest( 'valid_' + cost_name, every_n_batches=args.save_every, before_first_epoch=True), Plot( os.path.join(save_dir, "progress", exp_name + ".png"), plot_names, every_n_batches=args.save_every, email=False), Checkpoint( os.path.join(save_dir, "pkl", "best_" + exp_name + ".tar"), after_training=False, save_separately=['log'], use_cpickle=True, save_main_loop=False, before_first_epoch=True) .add_condition( ["after_batch", "before_training"],
def train_extractive_qa(new_training_job, config, save_path, params, fast_start, fuel_server, seed): if seed: fuel.config.default_seed = seed blocks.config.config.default_seed = seed root_path = os.path.join(save_path, 'training_state') extension = '.tar' tar_path = root_path + extension best_tar_path = root_path + '_best' + extension c = config data, qam = initialize_data_and_model(c) if theano.config.compute_test_value != 'off': test_value_data = next( data.get_stream('train', shuffle=True, batch_size=4, max_length=5).get_epoch_iterator(as_dict=True)) for var in qam.input_vars.values(): var.tag.test_value = test_value_data[var.name] costs = qam.apply_with_default_vars() cost = rename(costs.mean(), 'mean_cost') cg = Model(cost) if params: logger.debug("Load parameters from {}".format(params)) with open(params) as src: cg.set_parameter_values(load_parameters(src)) length = rename(qam.contexts.shape[1], 'length') batch_size = rename(qam.contexts.shape[0], 'batch_size') predicted_begins, = VariableFilter(name='predicted_begins')(cg) predicted_ends, = VariableFilter(name='predicted_ends')(cg) exact_match, = VariableFilter(name='exact_match')(cg) exact_match_ratio = rename(exact_match.mean(), 'exact_match_ratio') context_unk_ratio, = VariableFilter(name='context_unk_ratio')(cg) monitored_vars = [ length, batch_size, cost, exact_match_ratio, context_unk_ratio ] if c['dict_path']: def_unk_ratio, = VariableFilter(name='def_unk_ratio')(cg) num_definitions = rename(qam.input_vars['defs'].shape[0], 'num_definitions') max_definition_length = rename(qam.input_vars['defs'].shape[1], 'max_definition_length') monitored_vars.extend( [def_unk_ratio, num_definitions, max_definition_length]) if c['def_word_gating'] == 'self_attention': def_gates = VariableFilter(name='def_gates')(cg) def_gates_min = tensor.minimum(*[x.min() for x in def_gates]) def_gates_max = tensor.maximum(*[x.max() for x in def_gates]) monitored_vars.extend([ rename(def_gates_min, 'def_gates_min'), rename(def_gates_max, 'def_gates_max') ]) text_match_ratio = TextMatchRatio(data_path=os.path.join( fuel.config.data_path[0], 'squad/dev-v1.1.json'), requires=[ predicted_begins, predicted_ends, tensor.ltensor3('contexts_text'), tensor.lmatrix('q_ids') ], name='text_match_ratio') parameters = cg.get_parameter_dict() trained_parameters = parameters.values() if c['embedding_path']: logger.debug("Exclude word embeddings from the trained parameters") trained_parameters = [ p for p in trained_parameters if not p == qam.embeddings_var() ] if c['train_only_def_part']: def_reading_parameters = qam.def_reading_parameters() trained_parameters = [ p for p in trained_parameters if p in def_reading_parameters ] logger.info("Cost parameters" + "\n" + pprint.pformat([ " ".join( (key, str(parameters[key].get_value().shape), 'trained' if parameters[key] in trained_parameters else 'frozen')) for key in sorted(parameters.keys()) ], width=120)) # apply dropout to the training cost and to all the variables # that we monitor during training train_cost = cost train_monitored_vars = list(monitored_vars) if c['dropout']: regularized_cg = ComputationGraph([cost] + train_monitored_vars) # Dima: the dropout that I implemented first bidir_outputs, = VariableFilter(bricks=[Bidirectional], roles=[OUTPUT])(cg) readout_layers = VariableFilter(bricks=[Rectifier], roles=[OUTPUT])(cg) dropout_vars = [bidir_outputs] + readout_layers logger.debug("applying dropout to {}".format(", ".join( [v.name for v in dropout_vars]))) regularized_cg = apply_dropout(regularized_cg, dropout_vars, c['dropout']) # a new dropout with exactly same mask at different steps emb_vars = VariableFilter(roles=[EMBEDDINGS])(regularized_cg) emb_dropout_mask = get_dropout_mask(emb_vars[0], c['emb_dropout']) if c['emb_dropout_type'] == 'same_mask': regularized_cg = apply_dropout2(regularized_cg, emb_vars, c['emb_dropout'], dropout_mask=emb_dropout_mask) elif c['emb_dropout_type'] == 'regular': regularized_cg = apply_dropout(regularized_cg, emb_vars, c['emb_dropout']) else: raise ValueError("unknown dropout type {}".format( c['emb_dropout_type'])) train_cost = regularized_cg.outputs[0] train_monitored_vars = regularized_cg.outputs[1:] rules = [] if c['grad_clip_threshold']: rules.append(StepClipping(c['grad_clip_threshold'])) rules.append(Adam(learning_rate=c['learning_rate'], beta1=c['momentum'])) algorithm = GradientDescent(cost=train_cost, parameters=trained_parameters, step_rule=CompositeRule(rules)) if c['grad_clip_threshold']: train_monitored_vars.append(algorithm.total_gradient_norm) if c['monitor_parameters']: train_monitored_vars.extend(parameter_stats(parameters, algorithm)) training_stream = data.get_stream('train', batch_size=c['batch_size'], shuffle=True, max_length=c['max_length']) original_training_stream = training_stream if fuel_server: # the port will be configured by the StartFuelServer extension training_stream = ServerDataStream( sources=training_stream.sources, produces_examples=training_stream.produces_examples) extensions = [ LoadNoUnpickling(tar_path, load_iteration_state=True, load_log=True).set_conditions( before_training=not new_training_job), StartFuelServer(original_training_stream, os.path.join(save_path, 'stream.pkl'), before_training=fuel_server), Timing(every_n_batches=c['mon_freq_train']), TrainingDataMonitoring(train_monitored_vars, prefix="train", every_n_batches=c['mon_freq_train']), ] validation = DataStreamMonitoring( [text_match_ratio] + monitored_vars, data.get_stream('dev', batch_size=c['batch_size_valid'], raw_text=True, q_ids=True), prefix="dev").set_conditions(before_training=not fast_start, after_epoch=True) dump_predictions = DumpPredictions(save_path, text_match_ratio, before_training=not fast_start, after_epoch=True) track_the_best_exact = TrackTheBest( validation.record_name(exact_match_ratio), choose_best=max).set_conditions(before_training=True, after_epoch=True) track_the_best_text = TrackTheBest( validation.record_name(text_match_ratio), choose_best=max).set_conditions(before_training=True, after_epoch=True) extensions.extend([ validation, dump_predictions, track_the_best_exact, track_the_best_text ]) # We often use pretrained word embeddings and we don't want # to load and save them every time. To avoid that, we use # save_main_loop=False, we only save the trained parameters, # and we save the log and the iterations state separately # in the tar file. extensions.extend([ Checkpoint(tar_path, parameters=trained_parameters, save_main_loop=False, save_separately=['log', 'iteration_state'], before_training=not fast_start, every_n_epochs=c['save_freq_epochs'], every_n_batches=c['save_freq_batches'], after_training=not fast_start).add_condition( ['after_batch', 'after_epoch'], OnLogRecord(track_the_best_text.notification_name), (best_tar_path, )), DumpTensorflowSummaries(save_path, after_epoch=True, every_n_batches=c['mon_freq_train'], after_training=True), RetrievalPrintStats(retrieval=data._retrieval, every_n_batches=c['mon_freq_train'], before_training=not fast_start), Printing(after_epoch=True, every_n_batches=c['mon_freq_train']), FinishAfter(after_n_batches=c['n_batches'], after_n_epochs=c['n_epochs']), Annealing(c['annealing_learning_rate'], after_n_epochs=c['annealing_start_epoch']), LoadNoUnpickling(best_tar_path, after_n_epochs=c['annealing_start_epoch']) ]) main_loop = MainLoop(algorithm, training_stream, model=Model(cost), extensions=extensions) main_loop.run()
def main(args): """Run experiment. """ lr_tag = float_tag(args.learning_rate) x_dim, train_stream, valid_stream, test_stream = datasets.get_streams( args.data, args.batch_size) #------------------------------------------------------------ # Setup model deterministic_act = Tanh deterministic_size = 1. if args.method == 'vae': sizes_tag = args.layer_spec.replace(",", "-") layer_sizes = [int(i) for i in args.layer_spec.split(",")] layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1] name = "%s-%s-%s-lr%s-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag) if args.activation == "tanh": hidden_act = Tanh() elif args.activation == "logistic": hidden_act = Logistic() elif args.activation == "relu": hidden_act = Rectifier() else: raise "Unknown hidden nonlinearity %s" % args.hidden_act model = VAE(x_dim=x_dim, hidden_layers=layer_sizes, hidden_act=hidden_act, z_dim=z_dim, batch_norm=args.batch_normalization) model.initialize() elif args.method == 'dvae': sizes_tag = args.layer_spec.replace(",", "-") layer_sizes = [int(i) for i in args.layer_spec.split(",")] layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1] name = "%s-%s-%s-lr%s-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag) if args.activation == "tanh": hidden_act = Tanh() elif args.activation == "logistic": hidden_act = Logistic() elif args.activation == "relu": hidden_act = Rectifier() else: raise "Unknown hidden nonlinearity %s" % args.hidden_act model = DVAE(x_dim=x_dim, hidden_layers=layer_sizes, hidden_act=hidden_act, z_dim=z_dim, batch_norm=args.batch_normalization) model.initialize() elif args.method == 'rws': sizes_tag = args.layer_spec.replace(",", "-") qbase = "" if not args.no_qbaseline else "noqb-" name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \ (args.data, args.method, args.name, qbase, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag) p_layers, q_layers = create_layers(args.layer_spec, x_dim, args.deterministic_layers, deterministic_act, deterministic_size) model = ReweightedWakeSleep( p_layers, q_layers, qbaseline=(not args.no_qbaseline), ) model.initialize() elif args.method == 'bihm-rws': sizes_tag = args.layer_spec.replace(",", "-") name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag) p_layers, q_layers = create_layers(args.layer_spec, x_dim, args.deterministic_layers, deterministic_act, deterministic_size) model = BiHM( p_layers, q_layers, l1reg=args.l1reg, l2reg=args.l2reg, ) model.initialize() elif args.method == 'continue': import cPickle as pickle from os.path import basename, splitext with open(args.model_file, 'rb') as f: m = pickle.load(f) if isinstance(m, MainLoop): m = m.model model = m.get_top_bricks()[0] while len(model.parents) > 0: model = model.parents[0] assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE)) mname, _, _ = basename(args.model_file).rpartition("_model.pkl") name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag, args.n_samples) else: raise ValueError("Unknown training method '%s'" % args.method) #------------------------------------------------------------ x = tensor.matrix('features') #------------------------------------------------------------ # Testset monitoring train_monitors = [] valid_monitors = [] test_monitors = [] for s in [1, 10, 100, 1000]: log_p, log_ph = model.log_likelihood(x, s) log_p = -log_p.mean() log_ph = -log_ph.mean() log_p.name = "log_p_%d" % s log_ph.name = "log_ph_%d" % s #train_monitors += [log_p, log_ph] #valid_monitors += [log_p, log_ph] test_monitors += [log_p, log_ph] #------------------------------------------------------------ # Z estimation #for s in [100000]: # z2 = tensor.exp(model.estimate_log_z2(s)) / s # z2.name = "z2_%d" % s # # valid_monitors += [z2] # test_monitors += [z2] #------------------------------------------------------------ # Gradient and training monitoring if args.method in ['vae', 'dvae']: log_p_bound, gradients = model.get_gradients(x, args.n_samples) log_p_bound = -log_p_bound.mean() log_p_bound.name = "log_p_bound" cost = log_p_bound train_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] valid_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] test_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] else: log_p, log_ph, gradients = model.get_gradients(x, args.n_samples) log_p = -log_p.mean() log_ph = -log_ph.mean() log_p.name = "log_p" log_ph.name = "log_ph" cost = log_ph train_monitors += [log_p, log_ph] valid_monitors += [log_p, log_ph] #------------------------------------------------------------ # Detailed monitoring """ n_layers = len(p_layers) log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples) exp_samples = [] for l in xrange(n_layers): e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1) e.name = "inference_h%d" % l e.tag.aggregation_scheme = aggregation.TakeLast(e) exp_samples.append(e) s1 = samples[1] sh1 = s1.shape s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]]) s0, _ = model.p_layers[0].sample_expected(s1_) s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]]) s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1) s0.name = "inference_h0^" s0.tag.aggregation_scheme = aggregation.TakeLast(s0) exp_samples.append(s0) # Draw P-samples p_samples, _, _ = model.sample_p(100) #weights = model.importance_weights(samples) #weights = weights / weights.sum() for i, s in enumerate(p_samples): s.name = "psamples_h%d" % i s.tag.aggregation_scheme = aggregation.TakeLast(s) # samples = model.sample(100, oversample=100) for i, s in enumerate(samples): s.name = "samples_h%d" % i s.tag.aggregation_scheme = aggregation.TakeLast(s) """ cg = ComputationGraph([cost]) #------------------------------------------------------------ if args.step_rule == "momentum": step_rule = Momentum(args.learning_rate, 0.95) elif args.step_rule == "rmsprop": step_rule = RMSProp(args.learning_rate) elif args.step_rule == "adam": step_rule = Adam(args.learning_rate) else: raise "Unknown step_rule %s" % args.step_rule #parameters = cg.parameters[:4] + cg.parameters[5:] parameters = cg.parameters algorithm = GradientDescent( cost=cost, parameters=parameters, gradients=gradients, step_rule=CompositeRule([ #StepClipping(25), step_rule, #RemoveNotFinite(1.0), ])) #------------------------------------------------------------ train_monitors += [ aggregation.mean(algorithm.total_gradient_norm), aggregation.mean(algorithm.total_step_norm) ] #------------------------------------------------------------ # Live plotting? plotting_extensions = [] if args.live_plotting: plotting_extensions = [ PlotManager( name, [ Plotter(channels=[[ "valid_%s" % cost.name, "valid_log_p" ], ["train_total_gradient_norm", "train_total_step_norm"]], titles=[ "validation cost", "norm of training gradient and step" ]), DisplayImage( [ WeightDisplay(model.p_layers[0].mlp. linear_transformations[0].W, n_weights=100, image_shape=(28, 28)) ] #ImageDataStreamDisplay(test_stream, image_shape=(28,28))] ) ]) ] main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), ProgressBar(), TrainingDataMonitoring( train_monitors, prefix="train", after_epoch=True), DataStreamMonitoring( valid_monitors, data_stream=valid_stream, prefix="valid"), DataStreamMonitoring(test_monitors, data_stream=test_stream, prefix="test", after_epoch=False, after_training=True, every_n_epochs=10), #SharedVariableModifier( # algorithm.step_rule.components[0].learning_rate, # half_lr_func, # before_training=False, # after_epoch=False, # after_batch=False, # every_n_epochs=half_lr), TrackTheBest('valid_%s' % cost.name), Checkpoint(name + ".pkl", save_separately=['log', 'model']), FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name, epochs=args.patience), FinishAfter(after_n_epochs=args.max_epochs), Printing() ] + plotting_extensions) main_loop.run()
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server, bokeh, use_load_ext, load_log, fast_start, recognizer, data, model, cg, regularized_cg, cost, train_cost, parameters, max_norm_rules, observables, batch_size, batch_cost, weights_entropy, labels_mask, labels, gradients=None): primary_observables = observables secondary_observables = [] validation_observables = [] root_path, extension = os.path.splitext(save_path) train_conf = config['training'] # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) if 'adam' in rule_names: assert len(rule_names) == 1 logger.info("Using Adam for training") core_rules.append( Adam(learning_rate=train_conf.get('scale', 0.002), beta1=train_conf.get('beta1', 0.1), beta2=train_conf.get('beta2', 0.001), epsilon=train_conf.get('epsilon', 1e-8), decay_factor=train_conf.get('decay_rate', (1 - 1e-8)))) burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)}) logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name.startswith('weights_entropy'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()), 'weights_entropy_per_label'+ recognizer.children[chld_id].names_postfix)) elif var.name.endswith('_nll'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var.sum(), labels_mask[chld_id].sum()), var.name+'_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) additional_patience_notifiers = [] uas = DependencyErrorRate(recognizer.children[0], data, **config['monitoring']['search']) las = AuxiliaryErrorRates(uas, name='LAS') lab = AuxiliaryErrorRates(uas, name='LAB') per_monitoring = DataStreamMonitoring( [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0], prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_uas = TrackTheBest( per_monitoring.record_name(uas)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_las = TrackTheBest( per_monitoring.record_name(las)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_lab = TrackTheBest( per_monitoring.record_name(lab)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_uas, track_the_best_las, track_the_best_lab, ] per = uas track_the_best_per = track_the_best_uas additional_patience_notifiers = [track_the_best_lab, track_the_best_las] track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500, num_stds=train_conf.get('clip_stds', 1.0))) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']), # .add_condition(["after_batch"], _gradient_norm_is_none), ] main_postfix = recognizer.children[0].names_postfix channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'+main_postfix), validation._record_name('weights_entropy_per_label'+main_postfix)], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix), validation._record_name('weights_penalty_per_recording'+main_postfix)]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] + additional_patience_notifiers extensions.append(Patience(**patience_conf)) if train_conf.get('min_performance_stops'): extensions.append(EarlyTermination( param_name=track_the_best_per.best_name, min_performance_by_epoch=train_conf['min_performance_stops'])) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
variables=[cost], every_n_batches = n_batches, prefix="train") valid_monitor = DataStreamMonitoring( [cost], valid_stream, after_epoch = True, #before_first_epoch = False, prefix="valid") extensions = extensions=[ Timing(every_n_batches = n_batches), train_monitor, valid_monitor, TrackTheBest('valid_sequence_log_likelihood', after_epoch = True), Plot(save_dir+experiment_name+".png", [['train_sequence_log_likelihood', 'valid_sequence_log_likelihood']], every_n_batches = 4*n_batches, email=False), Checkpoint(save_dir+experiment_name+".pkl", use_cpickle = True, every_n_batches = n_batches*8, after_epoch = True), Checkpoint(save_dir+"best_"+experiment_name+".pkl", after_epoch = True, use_cpickle = True ).add_condition(['after_epoch'], predicate=OnLogRecord('valid_sequence_log_likelihood_best_so_far')), Printing(every_n_batches = n_batches, after_epoch = True),
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! # here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
algorithm.add_updates(extra_updates) train_monitor = TrainingDataMonitoring(variables=monitoring_variables + data_monitoring, every_n_batches=n_batches, prefix="train") valid_monitor = DataStreamMonitoring(monitoring_variables, valid_stream, every_n_batches=n_batches, prefix="valid") extensions = [ ProgressBar(), Timing(every_n_batches=n_batches), train_monitor, valid_monitor, TrackTheBest('valid_nll', every_n_batches=n_batches), Plot(save_dir + "progress/" + experiment_name + ".png", [['train_nll', 'valid_nll'], ['valid_learning_rate']], every_n_batches=n_batches, email=False), Checkpoint(save_dir + "pkl/best_" + experiment_name + ".pkl", use_cpickle=True).add_condition( ['after_batch'], predicate=OnLogRecord('valid_nll_best_so_far')), Printing(every_n_batches=n_batches), Flush(every_n_batches=n_batches, before_first_epoch=True), LearningRateSchedule(lr, 'valid_nll', path=save_dir + "pkl/best_" + experiment_name + ".pkl", states=states.values(),
def train(self, cost, y_hat, train_stream, accuracy=None, prediction_cost=None, regularization_cost=None, params_to_optimize=None, valid_stream=None, extra_extensions=None, model=None, vars_to_monitor_on_train=None, vars_to_monitor_on_valid=None, step_rule=None, additional_streams=None, save_on_best=None, use_own_validation=False, objects_to_dump=None): """ Generic method for training models. It extends functionality already provided by Blocks. :param cost: Theano var with cost function :param y_hat: Theano var with predictions from the model :param train_stream: Fuel stream with training data :param accuracy: Theano var with accuracy :param prediction_cost: :param regularization_cost: :param params_to_optimize: :param valid_stream: Fuel stream with validation data :param extra_extensions: :param model: :param vars_to_monitor_on_train: :param vars_to_monitor_on_valid: :param step_rule: :param additional_streams: :param save_on_best: :param use_own_validation: :param objects_to_dump: :return: """ if not vars_to_monitor_on_valid: vars_to_monitor_on_valid = [(cost, min)] if accuracy: vars_to_monitor_on_valid.append((accuracy, max)) if not save_on_best: # use default metrics for saving the best model save_on_best = [(cost, min)] if accuracy: save_on_best.append((accuracy, max)) # setup the training algorithm ####################################### # step_rule = Scale(learning_rate=0.01) # step_rule = Adam() model_save_suffix = "" if self.args.append_metaparams: model_save_suffix = "." + get_current_metaparams_str( self.parser, self.args) # get a list of variables that will be monitored during training vars_to_monitor = [cost] if accuracy: vars_to_monitor.append(accuracy) if prediction_cost: vars_to_monitor.append(prediction_cost) if regularization_cost: vars_to_monitor.append(regularization_cost) theano_vars_to_monitor = [ var for var, comparator in vars_to_monitor_on_valid ] if not params_to_optimize: # use all parameters of the model for optimization cg = ComputationGraph(cost) params_to_optimize = cg.parameters self.print_parameters_info(params_to_optimize) if not model: if accuracy: model = MultiOutputModel([cost, accuracy, y_hat] + theano_vars_to_monitor) else: model = MultiOutputModel([cost, y_hat] + theano_vars_to_monitor) if not step_rule: step_rule = AdaDelta() # learning_rate=0.02, momentum=0.9) step_rules = [ StepClipping(self.args.gradient_clip), step_rule, RemoveNotFinite() ] # optionally add gradient noise if self.args.gradient_noise: step_rules = [ GradientNoise(self.args.gradient_noise, self.args.gn_decay) ] + step_rules algorithm = GradientDescent(cost=cost, parameters=params_to_optimize, step_rule=CompositeRule(step_rules), on_unused_sources="warn") # this variable aggregates all extensions executed periodically during training extensions = [] if self.args.epochs_max: # finis training after fixed number of epochs extensions.append(FinishAfter(after_n_epochs=self.args.epochs_max)) # training data monitoring def create_training_data_monitoring(): if "every_n_epochs" in self.args.evaluate_every_n: return TrainingDataMonitoring(vars_to_monitor, prefix='train', after_epoch=True) else: return TrainingDataMonitoring(vars_to_monitor, prefix='train', after_epoch=True, **self.args.evaluate_every_n) # add extensions that monitors progress of training on train set extensions.extend([create_training_data_monitoring()]) if not self.args.disable_progress_bar: extensions.append(ProgressBar()) def add_data_stream_monitor(data_stream, prefix): if not use_own_validation: extensions.append( DataStreamMonitoring(variables=theano_vars_to_monitor, data_stream=data_stream, prefix=prefix, before_epoch=False, **self.args.evaluate_every_n)) # additional streams that should be monitored if additional_streams: for stream_name, stream in additional_streams: add_data_stream_monitor(stream, stream_name) # extra extensions need to be called before Printing extension if extra_extensions: extensions.extend(extra_extensions) if valid_stream: # add validation set monitoring add_data_stream_monitor(valid_stream, 'valid') # add best val monitoring for var, comparator in vars_to_monitor_on_valid: extensions.append( TrackTheBest("valid_" + var.name, choose_best=comparator, **self.args.evaluate_every_n)) if self.args.patience_metric == 'cost': patience_metric_name = cost.name elif self.args.patience_metric == 'accuracy': patience_metric_name = accuracy.name else: print "WARNING: Falling back to COST function for patience." patience_metric_name = cost.name extensions.append( # "valid_cost_best_so_far" message will be entered to the main loop log by TrackTheBest extension FinishIfNoImprovementAfter( "valid_" + patience_metric_name + "_best_so_far", epochs=self.args.epochs_patience_valid)) if not self.args.do_not_save: # use user provided metrics for saving valid_save_extensions = map( lambda metric_comparator: SaveTheBest( "valid_" + metric_comparator[0].name, self.args.save_path + ".best." + metric_comparator[ 0].name + model_save_suffix, choose_best=metric_comparator[1], **self.args.evaluate_every_n), save_on_best) extensions.extend(valid_save_extensions) extensions.extend([ Timing(**self.args.evaluate_every_n), Printing(after_epoch=False, **self.args.evaluate_every_n), ]) if not self.args.do_not_save or self.args.save_only_best: extensions.append( Checkpoint(self.args.save_path + model_save_suffix, **self.args.save_every_n)) extensions.append(FlushStreams(**self.args.evaluate_every_n)) # main loop ########################################################## main_loop = MainLoop(data_stream=train_stream, model=model, algorithm=algorithm, extensions=extensions) sys.setrecursionlimit(1000000) main_loop.run()
cost.name = "sequence_log_likelihood" cg = ComputationGraph(cost) model = Model(cost) ################# # Algorithm ################# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule( [StepClipping(10.0), Adam(lr)])) train_monitor = TrainingDataMonitoring(variables=[cost], after_epoch=True, prefix="train") extensions = extensions = [ train_monitor, TrackTheBest('train_sequence_log_likelihood'), Printing(after_epoch=True) ] main_loop = MainLoop(model=model, data_stream=data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
def build_and_run(label, config): ############## CREATE THE NETWORK ############### #Define the parameters num_epochs, num_batches, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation = config[ 'num_epochs'], config['num_batches'], config['num_channels'], config[ 'image_shape'], config['filter_size'], config[ 'num_filter'], config['pooling_sizes'], config[ 'mlp_hiddens'], config['output_size'], config[ 'batch_size'], config['activation'], config[ 'mlp_activation'] # print(num_epochs, num_channels, image_shape, filter_size, num_filter, pooling_sizes, mlp_hiddens, output_size, batch_size, activation, mlp_activation) lambda_l1 = 0.000025 lambda_l2 = 0.000025 print("Building model") #Create the symbolics variable x = T.tensor4('image_features') y = T.lmatrix('targets') #Get the parameters conv_parameters = zip(filter_size, num_filter) #Create the convolutions layers conv_layers = list( interleave([(Convolutional(filter_size=filter_size, num_filters=num_filter, name='conv_{}'.format(i)) for i, (filter_size, num_filter) in enumerate(conv_parameters)), (activation), (MaxPooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))])) # (AveragePooling(size, name='pool_{}'.format(i)) for i, size in enumerate(pooling_sizes))])) #Create the sequence conv_sequence = ConvolutionalSequence(conv_layers, num_channels, image_size=image_shape, weights_init=Uniform(width=0.2), biases_init=Constant(0.)) #Initialize the convnet conv_sequence.initialize() #Add the MLP top_mlp_dims = [np.prod(conv_sequence.get_dim('output')) ] + mlp_hiddens + [output_size] out = Flattener().apply(conv_sequence.apply(x)) mlp = MLP(mlp_activation, top_mlp_dims, weights_init=Uniform(0, 0.2), biases_init=Constant(0.)) #Initialisze the MLP mlp.initialize() #Get the output predict = mlp.apply(out) cost = CategoricalCrossEntropy().apply(y.flatten(), predict).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') ########### REGULARIZATION ################## cg = ComputationGraph([cost]) weights = VariableFilter(roles=[WEIGHT])(cg.variables) biases = VariableFilter(roles=[BIAS])(cg.variables) # # l2_penalty_weights = T.sum([i*lambda_l2/len(weights) * (W ** 2).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer l2_penalty = T.sum([ lambda_l2 * (W**2).sum() for i, W in enumerate(weights + biases) ]) # Gradually increase penalty for layer # # #l2_penalty_bias = T.sum([lambda_l2*(B **2).sum() for B in biases]) # # #l2_penalty = l2_penalty_weights + l2_penalty_bias l2_penalty.name = 'l2_penalty' l1_penalty = T.sum([lambda_l1 * T.abs_(z).sum() for z in weights + biases]) # l1_penalty_weights = T.sum([i*lambda_l1/len(weights) * T.abs_(W).sum() for i,W in enumerate(weights)]) # Gradually increase penalty for layer # l1_penalty_biases = T.sum([lambda_l1 * T.abs_(B).sum() for B in biases]) # l1_penalty = l1_penalty_biases + l1_penalty_weights l1_penalty.name = 'l1_penalty' costreg = cost + l2_penalty + l1_penalty costreg.name = 'costreg' ########### DEFINE THE ALGORITHM ############# # algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum()) algorithm = GradientDescent(cost=costreg, parameters=cg.parameters, step_rule=Adam()) ########### GET THE DATA ##################### istest = 'test' in config.keys() train_stream, valid_stream, test_stream = get_stream(batch_size, image_shape, test=istest) ########### INITIALIZING EXTENSIONS ########## checkpoint = Checkpoint('models/best_' + label + '.tar') checkpoint.add_condition( ['after_epoch'], predicate=OnLogRecord('valid_error_rate_best_so_far')) #Adding a live plot with the bokeh server plot = Plot( label, channels=[ ['train_error_rate', 'valid_error_rate'], ['valid_cost', 'valid_error_rate2'], # ['train_costreg','train_grad_norm']], # [ 'train_costreg', 'train_total_gradient_norm', 'train_l2_penalty', 'train_l1_penalty' ] ], server_url="http://hades.calculquebec.ca:5042") grad_norm = aggregation.mean(algorithm.total_gradient_norm) grad_norm.name = 'grad_norm' extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), DataStreamMonitoring([cost, error_rate, error_rate2], valid_stream, prefix="valid"), TrainingDataMonitoring([ costreg, error_rate, error_rate2, grad_norm, l2_penalty, l1_penalty ], prefix="train", after_epoch=True), plot, ProgressBar(), Printing(), TrackTheBest('valid_error_rate', min), #Keep best checkpoint, #Save best FinishIfNoImprovementAfter('valid_error_rate_best_so_far', epochs=4) ] # Early-stopping model = Model(cost) main_loop = MainLoop(algorithm, data_stream=train_stream, model=model, extensions=extensions) main_loop.run()