Example 1
def mpi_train(conf,
              shot_list_train,
              shot_list_validate,
              loader,
              callbacks_list=None,
              shot_list_test=None):
    loader.set_inference_mode(False)

    # TODO(KGF): this is not defined in conf.yaml, but added to processed dict
    # for the first time here:
    conf['num_workers'] = g.comm.Get_size()

    specific_builder = builder.ModelBuilder(conf)
    if g.tf_ver >= parse_version('1.14.0'):
        # Internal TensorFlow flags, subject to change (v1.14.0+ only?)
        try:
            from tensorflow.python.util import module_wrapper as depr
        except ImportError:
            from tensorflow.python.util import deprecation_wrapper as depr
        # depr._PRINT_DEPRECATION_WARNINGS = False  # does nothing
        depr._PER_MODULE_WARNING_LIMIT = 0
        # Suppresses warnings from "keras/backend/tensorflow_backend.py"
        # except: "Rate should be set to `rate = 1 - keep_prob`"
        # Also suppresses warnings from "keras/optimizers.py";
        # does NOT suppress warnings from "tensorflow/python/ops/math_grad.py"
    else:
        # TODO(KGF): next line suppresses ALL info and warning messages,
        # not just deprecation warnings...
        tf.logging.set_verbosity(tf.logging.ERROR)
    # TODO(KGF): for TF>v1.13.0 (esp v1.14.0), this next line prompts a ton of
    # deprecation warnings with externally-packaged Keras, e.g.:
    # WARNING:tensorflow:From  .../keras/backend/tensorflow_backend.py:174:
    # The name tf.get_default_session is deprecated.
    # Please use tf.compat.v1.get_default_session instead.
    train_model = specific_builder.build_model(False)
    # Cannot fix these Keras internals via "import tensorflow.compat.v1 as tf"
    #
    # TODO(KGF): note, these are different than C-based info diagnostics e.g.:
    # 2019-11-06 18:27:31.698908: I ...  dynamic library libcublas.so.10
    # which are NOT suppressed by set_verbosity. See top level __init__.py

    # Load weights from the latest saved epoch; returns 0 if none exist yet
    e = specific_builder.load_model_weights(train_model)
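    # remember the starting epoch; progress accumulates below as e = e_old + effective_epochs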
    e_old = e

    num_epochs = conf['training']['num_epochs']
    lr_decay = conf['model']['lr_decay']
    batch_size = conf['training']['batch_size']
    lr = conf['model']['lr']
    clipnorm = conf['model']['clipnorm']
    warmup_steps = conf['model']['warmup_steps']
    # TODO(KGF): rename as "num_iter_minimum" or "min_steps_per_epoch"
    num_batches_minimum = conf['training']['num_batches_minimum']

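    # map the optimizer named in conf to its MPI-aware wrapper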
    if 'adam' in conf['model']['optimizer']:
        optimizer = MPIAdam(lr=lr)
    elif (conf['model']['optimizer'] == 'sgd'
          or conf['model']['optimizer'] == 'tf_sgd'):
        optimizer = MPISGD(lr=lr)
    elif 'momentum_sgd' in conf['model']['optimizer']:
        optimizer = MPIMomentumSGD(lr=lr)
    else:
        print("Optimizer {} not implemented".format(conf['model']['optimizer']))
        sys.exit(1)

    g.print_unique('{} epoch(s) left to go'.format(num_epochs - e))

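    # bind the training shot list now; MPIModel calls this to build per-worker batch iterators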
    batch_generator = partial(loader.training_batch_generator_partial_reset,
                              shot_list=shot_list_train)

    g.print_unique("warmup steps = {}".format(warmup_steps))
    mpi_model = MPIModel(train_model,
                         optimizer,
                         g.comm,
                         batch_generator,
                         batch_size,
                         lr=lr,
                         warmup_steps=warmup_steps,
                         num_batches_minimum=num_batches_minimum,
                         conf=conf)
    mpi_model.compile(conf['model']['optimizer'], clipnorm,
                      conf['data']['target'].loss)
    tensorboard = None
    if g.task_index == 0:
        tensorboard_save_path = conf['paths']['tensorboard_save_path']
        write_grads = conf['callbacks']['write_grads']
        tensorboard = TensorBoard(log_dir=tensorboard_save_path,
                                  histogram_freq=1,
                                  write_graph=True,
                                  write_grads=write_grads)
        tensorboard.set_model(mpi_model.model)
        # TODO(KGF): check addition of TF model summary write added from fork
        # write the summary to a log file, then echo it to stdout
        from contextlib import redirect_stdout
        with open('model_architecture.log', 'a') as fr, redirect_stdout(fr):
            mpi_model.model.summary()
        mpi_model.model.summary()

    if g.task_index == 0:
        callbacks = mpi_model.build_callbacks(conf, callbacks_list)
        callbacks.set_model(mpi_model.model)
        callback_metrics = conf['callbacks']['metrics']
        callbacks.set_params({
            'epochs': num_epochs,
            'metrics': callback_metrics,
            'batch_size': batch_size,
        })
        callbacks.on_train_begin()
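    # 'max' mode: larger monitored values are better (e.g. val_roc); 'min': the opposite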
    if conf['callbacks']['mode'] == 'max':
        best_so_far = -np.inf
        cmp_fn = max
    else:
        best_so_far = np.inf
        cmp_fn = min

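    # main training loop: e advances by fractional "effective epochs" per pass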
    while e < num_epochs:
        g.write_unique('\nBegin training from epoch {:.2f}/{}'.format(
            e, num_epochs))
        if g.task_index == 0:
            callbacks.on_epoch_begin(int(round(e)))
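        # exponential learning-rate decay in the (fractional) epoch count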
        mpi_model.set_lr(lr * lr_decay**e)

        # KGF: core work of loop performed in next line
        (step, ave_loss, curr_loss, num_so_far,
         effective_epochs) = mpi_model.train_epoch()
        e = e_old + effective_epochs
        g.write_unique('Finished training of epoch {:.2f}/{}\n'.format(
            e, num_epochs))

        # TODO(KGF): add diagnostic about "saving to epoch X"?
        loader.verbose = False  # True during the first iteration
        if g.task_index == 0:
            specific_builder.save_model_weights(train_model, int(round(e)))

        if conf['training']['no_validation']:
            break

        epoch_logs = {}
        g.write_unique('Begin evaluation of epoch {:.2f}/{}\n'.format(
            e, num_epochs))
        # TODO(KGF): flush output/ MPI barrier?
        # g.flush_all_inorder()

        # TODO(KGF): is there a way to avoid Keras.Models.load_weights()
        # repeated calls throughout mpi_make_pred*() fn calls?
        _, _, _, roc_area, loss = mpi_make_predictions_and_evaluate(
            conf, shot_list_validate, loader)

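        # with ranking-difficulty reweighting enabled, also evaluate on the
        # training set and rebuild the batch iterator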
        if conf['training']['ranking_difficulty_fac'] != 1.0:
            (_, _, _, roc_area_train,
             loss_train) = mpi_make_predictions_and_evaluate(
                 conf, shot_list_train, loader)
            batch_generator = partial(
                loader.training_batch_generator_partial_reset,
                shot_list=shot_list_train)
            mpi_model.batch_iterator = batch_generator
            mpi_model.batch_iterator_func.__exit__()
            mpi_model.num_so_far_accum = mpi_model.num_so_far_indiv
            mpi_model.set_batch_iterator_func()

        if conf['callbacks'].get('monitor_test'):
            times = conf['callbacks']['monitor_times']
            areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                conf, shot_list_validate, loader, times)
            epoch_str = 'epoch {}, '.format(int(round(e)))
            g.write_unique(epoch_str + ' '.join([
                'val_roc_{} = {}'.format(t, roc)
                for t, roc in zip(times, areas)
            ]) + '\n')
            if shot_list_test is not None:
                areas, _ = mpi_make_predictions_and_evaluate_multiple_times(
                    conf, shot_list_test, loader, times)
                g.write_unique(epoch_str + ' '.join([
                    'test_roc_{} = {}'.format(t, roc)
                    for t, roc in zip(times, areas)
                ]) + '\n')

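        # metrics exposed to callbacks; conf['callbacks']['monitor'] selects which one is tracked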
        epoch_logs['val_roc'] = roc_area
        epoch_logs['val_loss'] = loss
        epoch_logs['train_loss'] = ave_loss
        best_so_far = cmp_fn(epoch_logs[conf['callbacks']['monitor']],
                             best_so_far)
        stop_training = False
        g.flush_all_inorder()
        if g.task_index == 0:
            print('=========Summary======== for epoch {:.2f}'.format(e))
            print('Training Loss (running average): {:.3e}'.format(ave_loss))
            print('Validation Loss: {:.3e}'.format(loss))
            print('Validation ROC: {:.4f}'.format(roc_area))
            if conf['training']['ranking_difficulty_fac'] != 1.0:
                print('Training Loss: {:.3e}'.format(loss_train))
                print('Training ROC: {:.4f}'.format(roc_area_train))
            print('======================== ')
            callbacks.on_epoch_end(int(round(e)), epoch_logs)
            if hasattr(mpi_model.model, 'stop_training'):
                stop_training = mpi_model.model.stop_training
            # only save model weights if quantity we are tracking is improving
            if best_so_far != epoch_logs[conf['callbacks']['monitor']]:
                if conf['callbacks'].get('monitor_test'):
                    print("No improvement, saving model weights anyway")
                else:
                    print("Not saving model weights")
                    specific_builder.delete_model_weights(
                        train_model, int(round(e)))

            # tensorboard
            val_generator = partial(loader.training_batch_generator,
                                    shot_list=shot_list_validate)()
            val_steps = 1
            tensorboard.on_epoch_end(val_generator, val_steps, int(round(e)),
                                     epoch_logs)
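        # rank 0 owns the Keras callbacks; broadcast its early-stopping decision to all ranks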
        stop_training = g.comm.bcast(stop_training, root=0)
        g.write_unique('Finished evaluation of epoch {:.2f}/{}'.format(
            e, num_epochs))
        # TODO(KGF): compare to old diagnostic:
        # g.write_unique("end epoch {}".format(e_old))
        if stop_training:
            g.write_unique("Stopping training due to early stopping")
            break

    if g.task_index == 0:
        callbacks.on_train_end()
        tensorboard.on_train_end()

    mpi_model.close()
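
The g.print_unique()/g.write_unique() helpers used above emit a message from a single MPI rank so that log lines are not duplicated once per worker. A minimal sketch of that pattern, assuming mpi4py; the names mirror the project's globals module "g", whose real implementation may differ:

import sys
from mpi4py import MPI

comm = MPI.COMM_WORLD
task_index = comm.Get_rank()

def print_unique(msg):
    # only rank 0 prints, so each message appears once per job
    if task_index == 0:
        print(msg)

def write_unique(msg):
    # same idea, but write and flush immediately
    if task_index == 0:
        sys.stdout.write(msg)
        sys.stdout.flush()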
Example 2
loader = Loader(conf, normalizer)
g.print_unique("...done")

# TODO(KGF): both preprocess.py and normalize.py are littered with print()
# calls that should probably be replaced with print_unique() when they are not
# purely loading previously-computed quantities from file
# (or we can continue to ensure that they are only ever executed by 1 rank)

#####################################################
#                    TRAINING                       #
#####################################################

# Prevent Keras TF backend deprecation messages from mpi_train() from
# appearing jumbled with stdout, stderr msgs from above steps
g.comm.Barrier()
g.flush_all_inorder()

# reminder: ensure training has a separate random seed for every worker
if not only_predict:
    mpi_train(conf,
              shot_list_train,
              shot_list_validate,
              loader,
              shot_list_test=shot_list_test)
g.flush_all_inorder()

#####################################################
#                    TESTING                        #
#####################################################

# load last model for testing
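
g.flush_all_inorder(), used above around the Barrier, presumably drains each rank's buffered output in rank order so that messages from different workers stay readable. A minimal sketch of that pattern, reusing the mpi4py comm/task_index globals from the previous sketch (the project's actual helper may differ):

def flush_all_inorder():
    # each rank flushes in turn, separated by barriers,
    # so output from different ranks cannot interleave
    for rank in range(comm.Get_size()):
        if task_index == rank:
            sys.stdout.flush()
            sys.stderr.flush()
        comm.Barrier()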
Example 3
from copy import deepcopy
# import socket
sys.setrecursionlimit(10000)

# TODO(KGF): remove the next 3 lines?
# import keras sequentially because it otherwise reads from ~/.keras/keras.json
# with too many threads:
# from mpi_launch_tensorflow import get_mpi_task_index

# set global variables for entire module regarding MPI & GPU environment
g.init_GPU_backend(conf)
# moved this fn/init call to client-facing mpi_learn.py
# g.init_MPI()
# TODO(KGF): set "mpi_initialized" global bool flag?

g.flush_all_inorder()  # see above about conf_parser.py stdout writes

# initialization code for mpi_runner.py module:
if g.backend in ('tf', 'tensorflow'):
    if g.NUM_GPUS > 1:
        os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(g.MY_GPU)
    os.environ['KERAS_BACKEND'] = 'tensorflow'  # default setting
    g.tf_ver = parse_version(get_distribution('tensorflow').version)
    # compat/compat.py first committed on 2018-06-29 for Py 2 vs 3
    # (around, but not present in, the release of v1.9.0)
    # v2 compatibility code added, then moved from compat.py in Nov and Dec 2018
    # compat.v1 first mentioned in RELEASE.md in v1.13.0.
    # But many TF deprecation warnings in 1.14.0, e.g.:
    # "The name tf.GPUOptions is deprecated. Please use tf.compat.v1.GPUOptions
    # instead". See tf_export.py