def main():
    """Train a DeepConvNADE model from command-line arguments.

    Steps, in order:
      1. Parse the CLI and derive the hyperparameter dict (the bookkeeping
         options `max_epoch`, `keep`, `force` and `name` are excluded so
         they do not affect the experiment hash).
      2. Create `./experiments/<name>`, or resume from it when it already
         exists and `--force` was not given. Resuming aborts with exit
         status 1 when the saved hyperparameters differ from the given ones.
      3. Load the dataset, build the model, the optimizer and the trainer.
      4. Train, then save both the trainer state and the model.
    """
    parser = buildArgsParser()
    args = parser.parse_args()

    # Extract experiments hyperparameters
    hyperparams = dict(vars(args))

    # Remove hyperparams that should not be part of the hash
    for excluded in ('max_epoch', 'keep', 'force', 'name'):
        del hyperparams[excluded]

    # Get/generate experiment name
    experiment_name = args.name
    if experiment_name is None:
        experiment_name = utils.generate_uid_from_string(repr(hyperparams))

    # Create experiment folder
    experiment_path = pjoin(".", "experiments", experiment_name)
    hyperparams_file = pjoin(experiment_path, "hyperparams.json")
    resuming = False
    if os.path.isdir(experiment_path) and not args.force:
        resuming = True
        print("### Resuming experiment ({0}). ###\n".format(experiment_name))

        # Check if provided hyperparams match those in the experiment folder
        hyperparams_loaded = utils.load_dict_from_json_file(hyperparams_file)
        if hyperparams != hyperparams_loaded:
            # Dump both versions (provided first, saved second) to ease comparison.
            for params in (hyperparams, hyperparams_loaded):
                print("{\n" + "\n".join(["{}: {}".format(k, params[k]) for k in sorted(params.keys())]) + "\n}")

            print("The arguments provided are different than the one saved. Use --force if you are certain.\nQuitting.")
            sys.exit(1)
    else:
        # Either a brand new experiment, or --force: start from a clean folder.
        if os.path.isdir(experiment_path):
            shutil.rmtree(experiment_path)

        os.makedirs(experiment_path)
        utils.save_dict_to_json_file(hyperparams_file, hyperparams)

    with Timer("Loading dataset"):
        trainset, validset, testset = datasets.load(args.dataset)

        image_shape = (28, 28)
        # One extra input channel when the mask is fed to the network.
        nb_channels = 1 + (args.use_mask_as_input is True)

        batch_scheduler = MiniBatchSchedulerWithAutoregressiveMask(trainset, args.batch_size,
                                                                   use_mask_as_input=args.use_mask_as_input,
                                                                   seed=args.ordering_seed)
        print("{} updates per epoch.".format(len(batch_scheduler)))

    with Timer("Building model"):
        # Keyword arguments shared by every model flavour.
        common_kwargs = dict(image_shape=image_shape,
                             nb_channels=nb_channels,
                             hidden_activation=args.hidden_activation,
                             use_mask_as_input=args.use_mask_as_input)
        # Flavours built directly from the user-provided blueprints.
        blueprint_kwargs = dict(common_kwargs,
                                convnet_blueprint=args.convnet_blueprint,
                                fullnet_blueprint=args.fullnet_blueprint)

        if args.use_lasagne:
            if args.with_residual:
                model = DeepConvNadeWithResidualUsingLasagne(**blueprint_kwargs)
            else:
                model = DeepConvNadeUsingLasagne(use_batch_norm=args.batch_norm, **blueprint_kwargs)
        elif args.with_residual:
            model = DeepConvNADEWithResidual(**blueprint_kwargs)
        else:
            builder = DeepConvNADEBuilder(**common_kwargs)

            # Read the seed once so the guard and the call cannot disagree
            # (the call previously read the misspelled `args.blueprint_seed`).
            # NOTE(review): assumes the CLI option is named `blueprints_seed`,
            # as used by the guard -- confirm against buildArgsParser().
            blueprints_seed = args.blueprints_seed
            if blueprints_seed is not None:
                convnet_blueprint, fullnet_blueprint = generate_blueprints(blueprints_seed, image_shape[0])
                builder.build_convnet_from_blueprint(convnet_blueprint)
                builder.build_fullnet_from_blueprint(fullnet_blueprint)
            else:
                if args.convnet_blueprint is not None:
                    builder.build_convnet_from_blueprint(args.convnet_blueprint)

                if args.fullnet_blueprint is not None:
                    builder.build_fullnet_from_blueprint(args.fullnet_blueprint)

            model = builder.build()

        model.initialize(weigths_initializer_factory(args.weights_initialization,
                                                     seed=args.initialization_seed))
        print(str(model))

    with Timer("Building optimizer"):
        loss = BinaryCrossEntropyEstimateWithAutoRegressiveMask(model, trainset)
        optimizer = optimizer_factory(hyperparams, loss)

    with Timer("Building trainer"):
        trainer = Trainer(optimizer, batch_scheduler)

        if args.max_epoch is not None:
            trainer.append_task(stopping_criteria.MaxEpochStopping(args.max_epoch))

        # Print time for one epoch
        trainer.append_task(tasks.PrintEpochDuration())
        trainer.append_task(tasks.PrintTrainingDuration())

        # Log training error
        loss_monitor = views.MonitorVariable(loss.loss)
        avg_loss = tasks.AveragePerEpoch(loss_monitor)
        accum = tasks.Accumulator(loss_monitor)
        logger = tasks.Logger(loss_monitor, avg_loss)
        trainer.append_task(logger, avg_loss, accum)

        # Print average training loss.
        trainer.append_task(tasks.Print("Avg. training loss: : {}", avg_loss))

        # Print NLL mean/stderror.
        # For batch normalization, see
        # https://github.com/Lasagne/Lasagne/blob/master/lasagne/layers/normalization.py#L198
        model.deterministic = True
        valid_loss = BinaryCrossEntropyEstimateWithAutoRegressiveMask(model, validset)
        valid_batch_scheduler = MiniBatchSchedulerWithAutoregressiveMask(validset,
                                                                         batch_size=0.1*len(validset),
                                                                         use_mask_as_input=args.use_mask_as_input,
                                                                         keep_mask=True,
                                                                         seed=args.ordering_seed+1)
        nll = views.LossView(loss=valid_loss, batch_scheduler=valid_batch_scheduler)
        trainer.append_task(tasks.Print("Validset - NLL : {0:.2f} ± {1:.2f}", nll.mean, nll.stderror))

        # Save training progression
        def save_model(*_callback_args):
            # Callback arguments are ignored; named so they do not shadow
            # the parsed CLI namespace `args`.
            trainer.save(experiment_path)

        trainer.append_task(stopping_criteria.EarlyStopping(nll.mean,
                                                            lookahead=args.lookahead,
                                                            eps=args.lookahead_eps,
                                                            callback=save_model))

        trainer.build_theano_graph()

    if resuming:
        with Timer("Loading"):
            trainer.load(experiment_path)

    with Timer("Training"):
        trainer.train()

    trainer.save(experiment_path)
    model.save(experiment_path)
def main():
    """Train a tractography model from command-line arguments.

    Creates (or resumes) the experiment folder, appends the current command
    line to `cmd.txt`, loads the train/valid tractography datasets (each
    with its own VolumeManager), builds the model, loss, optimizer and
    trainer, then trains. The model is saved each time early stopping
    reports an improvement on the validation error sum.
    """
    parser = build_argparser()
    args = parser.parse_args()
    print(args)
    print("Using Theano v.{}".format(theano.version.short_version))

    hyperparams_to_exclude = ['max_epoch', 'force', 'name', 'view', 'shuffle_streamlines']
    # Use this for hyperparams added in a new version, but nonexistent from older versions
    retrocompatibility_defaults = {'feed_previous_direction': False,
                                   'predict_offset': False,
                                   'normalize': False,
                                   'sort_streamlines': False,
                                   'keep_step_size': False,
                                   'use_layer_normalization': False,
                                   'drop_prob': 0.,
                                   'use_zoneout': False,
                                   'skip_connections': False}
    experiment_path, hyperparams, resuming = utils.maybe_create_experiment_folder(
        args, exclude=hyperparams_to_exclude, retrocompatibility_defaults=retrocompatibility_defaults)

    # Log the command currently running.
    with open(pjoin(experiment_path, 'cmd.txt'), 'a') as f:
        f.write(" ".join(sys.argv) + "\n")

    print("Resuming:" if resuming else "Creating:", experiment_path)

    with Timer("Loading dataset", newline=True):
        trainset_volume_manager = VolumeManager()
        validset_volume_manager = VolumeManager()
        trainset = datasets.load_tractography_dataset(args.train_subjects, trainset_volume_manager,
                                                      name="trainset", use_sh_coeffs=args.use_sh_coeffs)
        validset = datasets.load_tractography_dataset(args.valid_subjects, validset_volume_manager,
                                                      name="validset", use_sh_coeffs=args.use_sh_coeffs)
        print("Dataset sizes:", len(trainset), " |", len(validset))

        batch_scheduler = batch_scheduler_factory(hyperparams, dataset=trainset, train_mode=True)
        print("An epoch will be composed of {} updates.".format(batch_scheduler.nb_updates_per_epoch))
        print(trainset_volume_manager.data_dimension, args.hidden_sizes, batch_scheduler.target_size)

    with Timer("Creating model"):
        input_size = trainset_volume_manager.data_dimension
        if hyperparams['feed_previous_direction']:
            # Three extra inputs when the previous direction is fed back.
            input_size += 3

        model = model_factory(hyperparams,
                              input_size=input_size,
                              output_size=batch_scheduler.target_size,
                              volume_manager=trainset_volume_manager)
        model.initialize(weigths_initializer_factory(args.weights_initialization,
                                                     seed=args.initialization_seed))

    with Timer("Building optimizer"):
        loss = loss_factory(hyperparams, model, trainset)

        if args.clip_gradient is not None:
            loss.append_gradient_modifier(DirectionClipping(threshold=args.clip_gradient))

        optimizer = optimizer_factory(hyperparams, loss)

    with Timer("Building trainer"):
        trainer = Trainer(optimizer, batch_scheduler)

        # Log training error
        loss_monitor = views.MonitorVariable(loss.loss)
        avg_loss = tasks.AveragePerEpoch(loss_monitor)
        trainer.append_task(avg_loss)

        # Print average training loss.
        trainer.append_task(tasks.Print("Avg. training loss: : {}", avg_loss))

        # HACK: To make sure all subjects in the volume_manager are used in a batch,
        #       we have to split the trainset/validset in 2 volume managers
        model.volume_manager = validset_volume_manager
        model.drop_prob = 0.  # Do not use dropout/zoneout for evaluation
        valid_loss = loss_factory(hyperparams, model, validset)
        valid_batch_scheduler = batch_scheduler_factory(hyperparams, dataset=validset, train_mode=False)

        valid_error = views.LossView(loss=valid_loss, batch_scheduler=valid_batch_scheduler)
        trainer.append_task(tasks.Print("Validset - Error : {0:.2f} | {1:.2f}",
                                        valid_error.sum, valid_error.mean))

        if hyperparams['model'] == 'ffnn_regression':
            valid_batch_scheduler2 = batch_scheduler_factory(hyperparams, dataset=validset, train_mode=False)
            valid_l2 = loss_factory(hyperparams, model, validset, loss_type="expected_value")
            valid_l2_error = views.LossView(loss=valid_l2, batch_scheduler=valid_batch_scheduler2)
            trainer.append_task(tasks.Print("Validset - {}".format(valid_l2.__class__.__name__) + "\t: {0:.2f} | {1:.2f}",
                                            valid_l2_error.sum, valid_l2_error.mean))

        # HACK: Restore trainset volume manager
        model.volume_manager = trainset_volume_manager
        model.drop_prob = hyperparams['drop_prob']  # Restore dropout

        lookahead_loss = valid_error.sum

        direction_norm = views.MonitorVariable(T.sqrt(sum(map(lambda d: T.sqr(d).sum(), loss.gradients.values()))))

        # Logged variables, by index: 0 = valid mean, 1 = valid sum, 2 = ||d||.
        logger = tasks.Logger(valid_error.mean, valid_error.sum, direction_norm)
        trainer.append_task(logger)

        if args.view:
            import pylab as plt

            def _plot(*_args, **_kwargs):
                """Redraw the validation curves and the direction norm."""
                plt.figure(1)
                plt.clf()
                plt.show(False)
                plt.subplot(121)
                # Indices and labels follow the logger definition above. The
                # previous code still assumed a leading train-error variable
                # and read index 3, which the logger does not track.
                plt.plot(np.array(logger.get_variable_history(0)).flatten(), label="Valid (mean)")
                plt.plot(np.array(logger.get_variable_history(1)).flatten(), label="Valid (sum)")
                plt.legend()

                plt.subplot(122)
                plt.plot(np.array(logger.get_variable_history(2)).flatten(), label="||d'||")
                plt.draw()

            trainer.append_task(tasks.Callback(_plot))

        # Callback function to stop training if NaN is detected.
        def detect_nan(obj, status):
            if np.isnan(model.parameters[0].get_value().sum()):
                print("NaN detected! Stopping training now.")
                sys.exit()

        trainer.append_task(tasks.Callback(detect_nan, each_k_update=1))

        # Callback function to save training progression.
        def save_training(obj, status):
            trainer.save(experiment_path)

        trainer.append_task(tasks.Callback(save_training))

        # Early stopping with a callback for saving every time model improves.
        def save_improvement(obj, status):
            """ Save best model and training progression. """
            if np.isnan(model.parameters[0].get_value().sum()):
                print("NaN detected! Not saving the model. Crashing now.")
                sys.exit()

            print("*** Best epoch: {0} ***\n".format(obj.best_epoch))
            model.save(experiment_path)

        # Print time for one epoch
        trainer.append_task(tasks.PrintEpochDuration())
        trainer.append_task(tasks.PrintTrainingDuration())
        trainer.append_task(tasks.PrintTime(each_k_update=100))  # Profiling

        # Add stopping criteria
        trainer.append_task(stopping_criteria.MaxEpochStopping(args.max_epoch))
        early_stopping = stopping_criteria.EarlyStopping(lookahead_loss,
                                                         lookahead=args.lookahead,
                                                         eps=args.lookahead_eps,
                                                         callback=save_improvement)
        trainer.append_task(early_stopping)

    with Timer("Compiling Theano graph"):
        trainer.build_theano_graph()

    if resuming:
        if not os.path.isdir(pjoin(experiment_path, 'training')):
            print("No 'training/' folder. Assuming it failed before"
                  " the end of the first epoch. Starting a new training.")
        else:
            with Timer("Loading"):
                trainer.load(experiment_path)

    with Timer("Training"):
        trainer.train()
def main():
    """Train a tractography model, or show a t-SNE view of the trainset.

    Creates (or resumes) the experiment folder, appends the current command
    line to `cmd.txt` and loads the train/valid tractography datasets. With
    `--view`, `tsne_view` is called on the trainset and the program exits
    with status 0. Otherwise the model, loss, optimizer and trainer are
    built and training runs; the model is saved each time early stopping
    reports an improvement on the validation error sum.
    """
    parser = build_argparser()
    args = parser.parse_args()
    print(args)
    print("Using Theano v.{}".format(theano.version.short_version))

    hyperparams_to_exclude = ['max_epoch', 'force', 'name', 'view', 'shuffle_streamlines']
    # Use this for hyperparams added in a new version, but nonexistent from older versions
    retrocompatibility_defaults = {'feed_previous_direction': False,
                                   'normalize': False}
    experiment_path, hyperparams, resuming = utils.maybe_create_experiment_folder(
        args, exclude=hyperparams_to_exclude, retrocompatibility_defaults=retrocompatibility_defaults)

    # Log the command currently running.
    with open(pjoin(experiment_path, 'cmd.txt'), 'a') as f:
        f.write(" ".join(sys.argv) + "\n")

    print("Resuming:" if resuming else "Creating:", experiment_path)

    with Timer("Loading dataset", newline=True):
        trainset_volume_manager = VolumeManager()
        validset_volume_manager = VolumeManager()
        trainset = datasets.load_tractography_dataset(args.train_subjects, trainset_volume_manager,
                                                      name="trainset", use_sh_coeffs=args.use_sh_coeffs)
        validset = datasets.load_tractography_dataset(args.valid_subjects, validset_volume_manager,
                                                      name="validset", use_sh_coeffs=args.use_sh_coeffs)
        print("Dataset sizes:", len(trainset), " |", len(validset))

        if args.view:
            # View-only mode: nothing below this point runs when --view is given.
            tsne_view(trainset, trainset_volume_manager)
            sys.exit(0)

        batch_scheduler = batch_scheduler_factory(hyperparams, dataset=trainset, train_mode=True)
        print("An epoch will be composed of {} updates.".format(batch_scheduler.nb_updates_per_epoch))
        print(trainset_volume_manager.data_dimension, args.hidden_sizes, batch_scheduler.target_size)

    with Timer("Creating model"):
        input_size = trainset_volume_manager.data_dimension
        if hyperparams['feed_previous_direction']:
            # Three extra inputs when the previous direction is fed back.
            input_size += 3

        model = model_factory(hyperparams,
                              input_size=input_size,
                              output_size=batch_scheduler.target_size,
                              volume_manager=trainset_volume_manager)
        model.initialize(weigths_initializer_factory(args.weights_initialization,
                                                     seed=args.initialization_seed))

    with Timer("Building optimizer"):
        loss = loss_factory(hyperparams, model, trainset)

        if args.clip_gradient is not None:
            loss.append_gradient_modifier(DirectionClipping(threshold=args.clip_gradient))

        optimizer = optimizer_factory(hyperparams, loss)

    with Timer("Building trainer"):
        trainer = Trainer(optimizer, batch_scheduler)

        # Log training error
        loss_monitor = views.MonitorVariable(loss.loss)
        avg_loss = tasks.AveragePerEpoch(loss_monitor)
        trainer.append_task(avg_loss)

        # Print average training loss.
        trainer.append_task(tasks.Print("Avg. training loss: : {}", avg_loss))

        # HACK: To make sure all subjects in the volume_manager are used in a batch,
        #       we have to split the trainset/validset in 2 volume managers
        model.volume_manager = validset_volume_manager
        valid_loss = loss_factory(hyperparams, model, validset)
        valid_batch_scheduler = batch_scheduler_factory(hyperparams, dataset=validset, train_mode=False)

        valid_error = views.LossView(loss=valid_loss, batch_scheduler=valid_batch_scheduler)
        trainer.append_task(tasks.Print("Validset - Error : {0:.2f} | {1:.2f}",
                                        valid_error.sum, valid_error.mean))

        # HACK: Restore trainset volume manager
        model.volume_manager = trainset_volume_manager

        lookahead_loss = valid_error.sum

        direction_norm = views.MonitorVariable(T.sqrt(sum(map(lambda d: T.sqr(d).sum(), loss.gradients.values()))))

        logger = tasks.Logger(valid_error.mean, valid_error.sum, direction_norm)
        trainer.append_task(logger)

        # The former `if args.view:` live-plot callback was removed: it could
        # never run, because --view exits right after the t-SNE view above.

        # Callback function to stop training if NaN is detected.
        def detect_nan(obj, status):
            if np.isnan(model.parameters[0].get_value().sum()):
                print("NaN detected! Stopping training now.")
                sys.exit()

        trainer.append_task(tasks.Callback(detect_nan, each_k_update=1))

        # Callback function to save training progression.
        def save_training(obj, status):
            trainer.save(experiment_path)

        trainer.append_task(tasks.Callback(save_training))

        # Early stopping with a callback for saving every time model improves.
        def save_improvement(obj, status):
            """ Save best model and training progression. """
            if np.isnan(model.parameters[0].get_value().sum()):
                print("NaN detected! Not saving the model. Crashing now.")
                sys.exit()

            print("*** Best epoch: {0} ***\n".format(obj.best_epoch))
            model.save(experiment_path)

        # Print time for one epoch
        trainer.append_task(tasks.PrintEpochDuration())
        trainer.append_task(tasks.PrintTrainingDuration())
        trainer.append_task(tasks.PrintTime(each_k_update=100))  # Profiling

        # Add stopping criteria
        trainer.append_task(stopping_criteria.MaxEpochStopping(args.max_epoch))
        early_stopping = stopping_criteria.EarlyStopping(lookahead_loss,
                                                         lookahead=args.lookahead,
                                                         eps=args.lookahead_eps,
                                                         callback=save_improvement)
        trainer.append_task(early_stopping)

    with Timer("Compiling Theano graph"):
        trainer.build_theano_graph()

    if resuming:
        if not os.path.isdir(pjoin(experiment_path, 'training')):
            print("No 'training/' folder. Assuming it failed before"
                  " the end of the first epoch. Starting a new training.")
        else:
            with Timer("Loading"):
                trainer.load(experiment_path)

    with Timer("Training"):
        trainer.train()
def main():
    """Train a feed-forward mask classifier from command-line arguments.

    Creates (or resumes) the experiment folder, appends the current command
    line to `cmd.txt`, loads the train/valid mask-classifier datasets (each
    with its own VolumeManager), builds an FFNN_Classification model with a
    binary cross-entropy loss, then trains. The model is saved each time
    early stopping reports an improvement on the validation error sum.
    """
    args = build_argparser().parse_args()
    print(args)
    print("Using Theano v.{}".format(theano.version.short_version))

    # Options that must not take part in the experiment identity.
    excluded = ['max_epoch', 'force', 'name', 'view']
    # Use this for hyperparams added in a new version, but nonexistent from older versions
    legacy_defaults = {'use_layer_normalization': False}
    experiment_path, hyperparams, resuming = utils.maybe_create_experiment_folder(
        args, exclude=excluded, retrocompatibility_defaults=legacy_defaults)

    # Keep a trace of every command line run against this experiment.
    command_line = " ".join(sys.argv) + "\n"
    with open(pjoin(experiment_path, 'cmd.txt'), 'a') as cmd_log:
        cmd_log.write(command_line)

    if resuming:
        print("Resuming:", experiment_path)
    else:
        print("Creating:", experiment_path)

    with Timer("Loading dataset", newline=True):
        train_volumes = VolumeManager()
        valid_volumes = VolumeManager()
        trainset = datasets.load_mask_classifier_dataset(
            args.train_subjects, train_volumes,
            name="trainset", use_sh_coeffs=args.use_sh_coeffs)
        validset = datasets.load_mask_classifier_dataset(
            args.valid_subjects, valid_volumes,
            name="validset", use_sh_coeffs=args.use_sh_coeffs)
        print("Dataset sizes:", len(trainset), " |", len(validset))

        batch_scheduler = MaskClassifierBatchScheduler(
            trainset, hyperparams['batch_size'], seed=hyperparams['seed'])
        updates_message = "An epoch will be composed of {} updates."
        print(updates_message.format(batch_scheduler.nb_updates_per_epoch))
        print(train_volumes.data_dimension, args.hidden_sizes, batch_scheduler.target_size)

    with Timer("Creating model"):
        input_size = train_volumes.data_dimension
        model = FFNN_Classification(train_volumes, input_size, hyperparams['hidden_sizes'])
        initializer = weigths_initializer_factory(args.weights_initialization,
                                                  seed=args.initialization_seed)
        model.initialize(initializer)

    with Timer("Building optimizer"):
        loss = BinaryCrossEntropy(model, trainset)

        if args.clip_gradient is not None:
            clipping = DirectionClipping(threshold=args.clip_gradient)
            loss.append_gradient_modifier(clipping)

        optimizer = optimizer_factory(hyperparams, loss)

    with Timer("Building trainer"):
        trainer = Trainer(optimizer, batch_scheduler)

        # Average the training loss over each epoch and print it.
        train_loss_monitor = views.MonitorVariable(loss.loss)
        epoch_avg_loss = tasks.AveragePerEpoch(train_loss_monitor)
        trainer.append_task(epoch_avg_loss)
        trainer.append_task(tasks.Print("Avg. training loss: : {}", epoch_avg_loss))

        # HACK: the validation loss is built while the model points at the
        #       validset volume manager, which is why train and valid sets
        #       each have their own manager.
        model.volume_manager = valid_volumes
        valid_loss = BinaryCrossEntropy(model, validset)
        valid_batch_scheduler = MaskClassifierBatchScheduler(
            validset, hyperparams['batch_size'], seed=hyperparams['seed'])
        valid_error = views.LossView(loss=valid_loss, batch_scheduler=valid_batch_scheduler)
        valid_printer = tasks.Print("Validset - Error : {0:.2f} | {1:.2f}",
                                    valid_error.sum, valid_error.mean)
        trainer.append_task(valid_printer)

        # HACK: put the trainset volume manager back for training.
        model.volume_manager = train_volumes

        # Early stopping monitors the summed validation error.
        lookahead_loss = valid_error.sum

        # Square root of the summed squared entries of every gradient.
        squared_norms = (T.sqr(grad).sum() for grad in loss.gradients.values())
        direction_norm = views.MonitorVariable(T.sqrt(sum(squared_norms)))

        logger = tasks.Logger(valid_error.mean, valid_error.sum, direction_norm)
        trainer.append_task(logger)

        def _first_param_has_nan():
            """Tell whether the sum of the model's first parameter is NaN."""
            return np.isnan(model.parameters[0].get_value().sum())

        def detect_nan(obj, status):
            """Exit as soon as a NaN shows up in the first parameter."""
            if _first_param_has_nan():
                print("NaN detected! Stopping training now.")
                sys.exit()

        trainer.append_task(tasks.Callback(detect_nan, each_k_update=1))

        def save_training(obj, status):
            """Persist the trainer state in the experiment folder."""
            trainer.save(experiment_path)

        trainer.append_task(tasks.Callback(save_training))

        def save_improvement(obj, status):
            """Save the model; exit instead when the first parameter is NaN."""
            if _first_param_has_nan():
                print("NaN detected! Not saving the model. Crashing now.")
                sys.exit()

            print("*** Best epoch: {0} ***\n".format(obj.best_epoch))
            model.save(experiment_path)

        # Timing reports.
        trainer.append_task(tasks.PrintEpochDuration())
        trainer.append_task(tasks.PrintTrainingDuration())
        trainer.append_task(tasks.PrintTime(each_k_update=100))  # Profiling

        # Stopping criteria: epoch budget first, then early stopping.
        trainer.append_task(stopping_criteria.MaxEpochStopping(args.max_epoch))
        early_stopping = stopping_criteria.EarlyStopping(
            lookahead_loss,
            lookahead=args.lookahead,
            eps=args.lookahead_eps,
            callback=save_improvement)
        trainer.append_task(early_stopping)

    with Timer("Compiling Theano graph"):
        trainer.build_theano_graph()

    if resuming:
        if os.path.isdir(pjoin(experiment_path, 'training')):
            with Timer("Loading"):
                trainer.load(experiment_path)
        else:
            print("No 'training/' folder. Assuming it failed before"
                  " the end of the first epoch. Starting a new training.")

    with Timer("Training"):
        trainer.train()