def mpi_train(conf, shot_list_train, shot_list_validate, loader,
              callbacks_list=None, shot_list_test=None):
    loader.set_inference_mode(False)
    # TODO(KGF): this is not defined in conf.yaml, but added to processed
    # dict for the first time here:
    conf['num_workers'] = g.comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    if g.tf_ver >= parse_version('1.14.0'):
        # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
        try:
            from tensorflow.python.util import module_wrapper as depr
        except ImportError:
            from tensorflow.python.util import deprecation_wrapper as depr
        # depr._PRINT_DEPRECATION_WARNINGS = False  # does nothing
        depr._PER_MODULE_WARNING_LIMIT = 0
        # Suppresses warnings from "keras/backend/tensorflow_backend.py",
        # e.g.: "Rate should be set to `rate = 1 - keep_prob`".
        # Also suppresses warnings from "keras/optimizers.py".
        # Does NOT suppress warnings from "tensorflow/python/ops/math_grad.py"
    else:
        # TODO(KGF): next line suppresses ALL info and warning messages,
        # not just deprecation warnings...
        tf.logging.set_verbosity(tf.logging.ERROR)
    # TODO(KGF): for TF > v1.13.0 (esp. v1.14.0), the next line prompts a ton
    # of deprecation warnings with externally-packaged Keras, e.g.:
    # WARNING:tensorflow:From .../keras/backend/tensorflow_backend.py:174:
    # The name tf.get_default_session is deprecated.
    # Please use tf.compat.v1.get_default_session instead.
    train_model = specific_builder.build_model(False)
    # Cannot fix these Keras internals via "import tensorflow.compat.v1 as tf"
    #
    # TODO(KGF): note, these are different from C-based info diagnostics, e.g.:
    # 2019-11-06 18:27:31.698908: I ... dynamic library libcublas.so.10
    # which are NOT suppressed by set_verbosity. See top-level __init__.py

    # load the latest epoch we saved; returns 0 if none exists yet
    e = specific_builder.load_model_weights(train_model)
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    # TODO(KGF): rename as "num_iter_minimum" or "min_steps_per_epoch"
    num_batches_minimum = conf['training']['num_batches_minimum']

    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
          or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer not implemented yet")
        exit(1)

    g.print_unique('{} epoch(s) left to go'.format(num_epochs - e))

    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)
    g.print_unique("warmup steps = {}".format(warmup_steps))
    mpi_model = MPIModel(train_model, optimizer, g.comm, batch_generator,
                         batch_size, lr=lr, warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum, conf=conf)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)

    tensorboard = None
    if g.task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1, write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        # TODO(KGF): check addition of TF model summary write added from fork
        fr = open('model_architecture.log', 'a')
        ori = sys.stdout
        sys.stdout = fr  # temporarily redirect stdout into the log file
        mpi_model.model.summary()
        sys.stdout = ori
        fr.close()
    mpi_model.model.summary()
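
    # For reference, a minimal sketch of the conf entries consumed above.
    # The key names are taken verbatim from this function; the values are
    # illustrative assumptions, not project defaults:
    #
    #   training:
    #     num_epochs: 100
    #     batch_size: 128
    #     num_batches_minimum: 20
    #   model:
    #     optimizer: adam        # or sgd / tf_sgd / momentum_sgd
    #     lr: 2.0e-5
    #     lr_decay: 0.9
    #     clipnorm: 10.0
    #     warmup_steps: 0
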
    if g.task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()

    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

    while e < num_epochs:
        g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
            e, num_epochs))
        if g.task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
        # exponential decay schedule: lr_e = lr * lr_decay**e
        # (e can be fractional, since it is measured in effective epochs)
        mpi_model.set_lr(lr * lr_decay**e)

        # KGF: core work of loop performed in next line
        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs
        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
            e, num_epochs))

        # TODO(KGF): add diagnostic about "saving to epoch X"?
        loader.verbose = False  # True during the first iteration
        if g.task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['no_validation']:
            break

        epoch_logs = {}
        g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
            e, num_epochs))
        # TODO(KGF): flush output/MPI barrier?
        # g.flush_all_inorder()
        # TODO(KGF): is there a way to avoid Keras.Models.load_weights()
        # repeated calls throughout mpi_make_pred*() fn calls?
        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)

        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        if ('monitor_test' in conf['callbacks'].keys()
                and conf['callbacks']['monitor_test']):
            times = conf['callbacks']['monitor_times']
            areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                conf, shot_list_validate, loader, times)
            epoch_str = 'epoch {}, '.format(int(round(e)))
            g.write_unique(epoch_str + ' '.join(
                ['val_roc_{} = {}'.format(t, roc)
                 for t, roc in zip(times, areas)]) + '\n')
            if shot_list_test is not None:
                areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                    conf, shot_list_test, loader, times)
                g.write_unique(epoch_str + ' '.join(
                    ['test_roc_{} = {}'.format(t, roc)
                     for t, roc in zip(times, areas)]) + '\n')

        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)
        stop_training = False
        g.flush_all_inorder()
        if g.task_index == 0:
            print('=========Summary======== for epoch {:.2f}'.format(e))
            print('Training Loss numpy: {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))
            print('======================== ')
            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
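            # Note: best_so_far was updated via cmp_fn above, so it differs
            # from epoch_logs[monitor] exactly when this epoch did NOT
            # improve on the best value seen so far.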
weights anyways") else: print("Not saving model weights") specific_builder.delete_model_weights( train_model, int(round(e))) # tensorboard val_generator = partial(loader.training_batch_generator, shot_list=shot_list_validate)() val_steps = 1 tensorboard.on_epoch_end(val_generator, val_steps, int(round(e)), epoch_logs) stop_training = g.comm.bcast(stop_training, root=0) g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format( e, num_epochs)) # TODO(KGF): compare to old diagnostic: # g.write_unique("end epoch {}".format(e_old)) if stop_training: g.write_unique("Stopping training due to early stopping") break if g.task_index == 0: callbacks.on_train_end() tensorboard.on_train_end() mpi_model.close()
loader = Loader(conf, normalizer)
g.print_unique("...done")

# TODO(KGF): both preprocess.py and normalize.py are littered with print()
# calls that should probably be replaced with print_unique() when they are
# not purely loading previously-computed quantities from file
# (or we can continue to ensure that they are only ever executed by 1 rank)

#####################################################
#                    TRAINING                       #
#####################################################
# Prevent Keras TF backend deprecation messages from mpi_train() from
# appearing jumbled with stdout, stderr msgs from above steps
g.comm.Barrier()
g.flush_all_inorder()

# reminder: ensure training has a separate random seed for every worker
if not only_predict:
    mpi_train(conf, shot_list_train, shot_list_validate, loader,
              shot_list_test=shot_list_test)
g.flush_all_inorder()

#####################################################
#                     TESTING                       #
#####################################################
# load last model for testing
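
# A rough sketch of what "load last model for testing" typically entails,
# assuming the same ModelBuilder API used inside mpi_train() above
# (illustrative, not the verbatim continuation of this script):
#
#   specific_builder = builder.ModelBuilder(conf)
#   test_model = specific_builder.build_model(False)
#   e = specific_builder.load_model_weights(test_model)  # 0 if none saved
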
from copy import deepcopy
# import socket
sys.setrecursionlimit(10000)
# TODO(KGF): remove the next 3 lines?
# import keras sequentially because it otherwise reads from ~/.keras/keras.json
# with too many threads:
# from mpi_launch_tensorflow import get_mpi_task_index

# set global variables for entire module regarding MPI & GPU environment
g.init_GPU_backend(conf)
# moved this fn/init call to client-facing mpi_learn.py
# g.init_MPI()
# TODO(KGF): set "mpi_initialized" global bool flag?

g.flush_all_inorder()  # see above about conf_parser.py stdout writes

# initialization code for mpi_runner.py module:
if g.backend == 'tf' or g.backend == 'tensorflow':
    if g.NUM_GPUS > 1:
        os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(g.MY_GPU)
        # ,mode=NanGuardMode'
    os.environ['KERAS_BACKEND'] = 'tensorflow'  # default setting
    g.tf_ver = parse_version(get_distribution('tensorflow').version)
    # compat/compat.py first committed on 2018-06-29 for Py 2 vs 3
    # (around, but not present in, the release of v1.9.0).
    # v2 compatibility code added, then moved from compat.py, in Nov and
    # Dec 2018. compat.v1 first mentioned in RELEASE.md in v1.13.0.
    # But many TF deprecation warnings appear in 1.14.0, e.g.:
    # "The name tf.GPUOptions is deprecated. Please use
    # tf.compat.v1.GPUOptions instead". See tf_export.py
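
    # A minimal sketch of the compat.v1 pattern that those release notes
    # refer to (illustrative; as noted in mpi_train(), this alone cannot
    # silence warnings emitted from inside externally-packaged Keras):
    #
    #   import tensorflow.compat.v1 as tf
    #   tf.disable_v2_behavior()  # keep v1 graph-mode semantics on TF 2.x
    #   sess = tf.Session()       # v1-style names remain available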