def train_schedule(
    estimator, train_eval_iterations, single_iteration_train_steps=None,
    single_iteration_train_epochs=None, bleu_source=None, bleu_ref=None,
    bleu_threshold=None):
  """Train and evaluate model, and optionally compute model's BLEU score.

  **Step vs. Epoch vs. Iteration**

  Steps and epochs are canonical terms used in TensorFlow and general machine
  learning. They are used to describe running a single process (train/eval):
    - Step refers to running the process through a single example or batch of
      examples.
    - Epoch refers to running the process through an entire dataset.

  E.g. training a dataset with 100 examples. The dataset is divided into 20
  batches with 5 examples per batch. A single training step trains the model
  on one batch. After 20 training steps, the model will have trained on every
  batch in the dataset, or, in other words, one epoch.

  Meanwhile, iteration is used in this implementation to describe running
  multiple processes (training and eval).
    - A single iteration:
      1. trains the model for a specific number of steps or epochs.
      2. evaluates the model.
      3. (if source and ref files are provided) computes the BLEU score.

  This function runs through multiple train+eval+bleu iterations.

  Args:
    estimator: tf.Estimator containing model to train.
    train_eval_iterations: Number of times to repeat the train+eval iteration.
    single_iteration_train_steps: Number of steps to train in one iteration.
    single_iteration_train_epochs: Number of epochs to train in one iteration.
    bleu_source: File containing text to be translated for BLEU calculation.
    bleu_ref: File containing reference translations for BLEU calculation.
    bleu_threshold: Minimum BLEU score; training stops once the uncased score
      exceeds this value.

  Raises:
    ValueError: if both or neither of single_iteration_train_steps and
      single_iteration_train_epochs is defined.
  """
  # Ensure that exactly one of single_iteration_train_steps and
  # single_iteration_train_epochs is defined.
  if single_iteration_train_steps is None:
    if single_iteration_train_epochs is None:
      raise ValueError(
          "Exactly one of single_iteration_train_steps or "
          "single_iteration_train_epochs must be defined. Both were None.")
  else:
    if single_iteration_train_epochs is not None:
      raise ValueError(
          "Exactly one of single_iteration_train_steps or "
          "single_iteration_train_epochs must be defined. Both were defined.")

  evaluate_bleu = bleu_source is not None and bleu_ref is not None

  # Print out training schedule.
  print("Training schedule:")
  if single_iteration_train_epochs is not None:
    print("\t1. Train for %d epochs." % single_iteration_train_epochs)
  else:
    print("\t1. Train for %d steps." % single_iteration_train_steps)
  print("\t2. Evaluate model.")
  if evaluate_bleu:
    print("\t3. Compute BLEU score.")
    if bleu_threshold is not None:
      print("Repeat above steps until the BLEU score reaches", bleu_threshold)
  if not evaluate_bleu or bleu_threshold is None:
    print("Repeat above steps %d times." % train_eval_iterations)

  if evaluate_bleu:
    # Set summary writer to log bleu score.
    bleu_writer = tf.compat.v1.summary.FileWriter(
        os.path.join(estimator.model_dir, BLEU_DIR))
    if bleu_threshold is not None:
      # Change loop stopping condition if bleu_threshold is defined.
      train_eval_iterations = INF

  # Loop training/evaluation/bleu cycles.
  mlperf_log.transformer_print(key=mlperf_log.TRAIN_LOOP)
  # Create hooks that report examples per second; used with estimator.train.
  train_hooks = hooks_helper.get_train_hooks(
      ["ExamplesPerSecondHook"],
      model_dir=FLAGS.model_dir,
      batch_size=estimator.params.batch_size,
      every_n_steps=FLAGS.print_iter,
      warm_steps=20)

  for i in xrange(train_eval_iterations):
    print("Starting iteration", i + 1)

    if single_iteration_train_epochs is not None:
      mlperf_log.transformer_print(
          key=mlperf_log.TRAIN_EPOCH,
          value=i * single_iteration_train_epochs + 1)

    # Train the model for single_iteration_train_steps or until the input fn
    # runs out of examples (if single_iteration_train_steps is None).
    estimator.train(dataset.train_input_fn,
                    steps=single_iteration_train_steps,
                    hooks=train_hooks)

    mlperf_log.transformer_print(key=mlperf_log.EVAL_START)
    # To save training time, evaluation can be turned off; otherwise it runs
    # after every training iteration.
    if FLAGS.do_eval == "Yes":
      eval_results = estimator.evaluate(dataset.eval_input_fn)
      print("Evaluation results (iter %d/%d):"
            % (i + 1, train_eval_iterations), eval_results)

    if evaluate_bleu:
      uncased_score, _ = evaluate_and_log_bleu(
          estimator, bleu_writer, bleu_source, bleu_ref)
      if bleu_threshold is not None and uncased_score > bleu_threshold:
        bleu_writer.close()
        break
      mlperf_log.transformer_print(
          key=mlperf_log.EVAL_TARGET, value=bleu_threshold)
      mlperf_log.transformer_print(
          key=mlperf_log.EVAL_ACCURACY, value=uncased_score)
    mlperf_log.transformer_print(key=mlperf_log.EVAL_STOP)
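
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original script): one hypothetical
# way to drive the train+eval(+BLEU) schedule defined above. The model_fn,
# params, model_dir, data file names, and threshold below are placeholders
# chosen purely for illustration, not values used by this repository.
def _example_train_schedule_usage(model_fn, params,
                                  model_dir="/tmp/transformer_model"):
  """Hypothetical usage sketch for train_schedule (illustration only)."""
  example_estimator = tf.estimator.Estimator(
      model_fn=model_fn, model_dir=model_dir, params=params)
  train_schedule(
      example_estimator,
      train_eval_iterations=10,           # ten train -> eval -> BLEU cycles
      single_iteration_train_epochs=1,    # train one full epoch per cycle
      bleu_source="newstest2014.en",      # hypothetical source sentences
      bleu_ref="newstest2014.de",         # hypothetical reference translations
      bleu_threshold=25.0)                # stop early once uncased BLEU > 25
# ---------------------------------------------------------------------------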
def resnet_main(seed, flags, model_function, input_function, shape=None):
  """Shared main loop for ResNet Models.

  Args:
    seed: random seed that is logged and passed to the estimator RunConfig to
      make runs reproducible.
    flags: FLAGS object that contains the params for running. See
      ResnetArgParser for created flags.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with all
      the relevant flags for running and passed to estimator.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags.export_dir is passed.
  """
  # Using the Winograd non-fused algorithms provides a small performance boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.compat.v1.ConfigProto(
      inter_op_parallelism_threads=flags.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags.intra_op_parallelism_threads,
      allow_soft_placement=True)

  if flags.num_gpus == 0:
    distribution = tf.distribute.OneDeviceStrategy('device:CPU:0')
  elif flags.num_gpus == 1:
    distribution = tf.distribute.OneDeviceStrategy('device:GPU:0')
  else:
    distribution = tf.distribute.MirroredStrategy(
        num_gpus=flags.num_gpus)

  mllogger.event(key=mllog.constants.SEED, value=seed)
  run_config = tf.estimator.RunConfig(
      train_distribute=distribution,
      session_config=session_config,
      log_step_count_steps=20,  # output logs more frequently
      save_checkpoints_steps=2502,
      keep_checkpoint_max=1,
      tf_random_seed=seed)

  mllogger.event(key=mllog.constants.GLOBAL_BATCH_SIZE,
                 value=flags.batch_size * hvd.size())

  if is_mpi:
    if hvd.rank() == 0:
      model_dir = os.path.join(flags.model_dir, "main")
    else:
      model_dir = os.path.join(flags.model_dir, "tmp{}".format(hvd.rank()))
    benchmark_log_dir = flags.benchmark_log_dir if hvd.rank() == 0 else None
  else:
    model_dir = flags.model_dir
    benchmark_log_dir = flags.benchmark_log_dir

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=model_dir, config=run_config,
      params={
          'resnet_size': flags.resnet_size,
          'data_format': flags.data_format,
          'batch_size': flags.batch_size,
          'version': flags.version,
          'loss_scale': flags.loss_scale,
          'dtype': flags.dtype,
          'label_smoothing': flags.label_smoothing,
          'enable_lars': flags.enable_lars,
          'weight_decay': flags.weight_decay,
          'fine_tune': flags.fine_tune,
          'use_bfloat16': flags.use_bfloat16
      })

  if benchmark_log_dir is not None:
    benchmark_logger = logger.BenchmarkLogger(benchmark_log_dir)
    benchmark_logger.log_run_info('resnet')
  else:
    benchmark_logger = None

  # For MPI only: figure out the steps per epoch and per eval, per worker.
  # (A standalone sketch of this arithmetic appears after resnet_main.)
  if is_mpi:
    num_eval_steps = _NUM_IMAGES['validation'] // flags.batch_size
    steps_per_epoch = _NUM_IMAGES['train'] // flags.batch_size
    steps_per_epoch_per_worker = steps_per_epoch // hvd.size()
    steps_per_eval_per_worker = (
        steps_per_epoch_per_worker * flags.epochs_between_evals)

  # The reference performs the first evaluation on the fourth epoch (offset
  # eval by 3 epochs).
  success = False
  for i in range(flags.train_epochs // flags.epochs_between_evals):
    # Data for epochs_between_evals (i.e. 4 epochs between evals) worth of
    # epochs is concatenated and run as a single block inside a session. For
    # this reason we declare all of the epochs that will be run at the start.
    # Submitters may report in a way which is reasonable for their control
    # flow.
    mllogger.start(key=mllog.constants.BLOCK_START, value=i + 1)
    mllogger.event(key=mllog.constants.FIRST_EPOCH_NUM,
                   value=i * flags.epochs_between_evals)
    mllogger.event(key=mllog.constants.EPOCH_COUNT,
                   value=flags.epochs_between_evals)
    for j in range(flags.epochs_between_evals):
      mllogger.event(key=mllog.constants.EPOCH_NUM,
                     value=i * flags.epochs_between_evals + j)

    flags.hooks += ["examplespersecondhook"]
    if is_mpi:
      train_hooks = [hvd.BroadcastGlobalVariablesHook(0)]
      train_hooks = train_hooks + hooks_helper.get_train_hooks(
          flags.hooks,
          batch_size=flags.batch_size * hvd.size(),
          benchmark_log_dir=flags.benchmark_log_dir)
    else:
      train_hooks = hooks_helper.get_train_hooks(
          flags.hooks,
          batch_size=flags.batch_size,
          benchmark_log_dir=flags.benchmark_log_dir)

    _log_cache = []

    def formatter(x):
      """Abuse side effects to get tensors out of the model_fn."""
      if _log_cache:
        _log_cache.pop()
      _log_cache.append(x.copy())
      return str(x)

    compliance_hook = tf.estimator.LoggingTensorHook(
        tensors={_NUM_EXAMPLES_NAME: _NUM_EXAMPLES_NAME},
        every_n_iter=int(1e10),
        at_end=True,
        formatter=formatter)

    print('Starting a training cycle.')

    def input_fn_train():
      return input_function(
          is_training=True,
          data_dir=flags.data_dir,
          batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus),
          num_epochs=flags.epochs_between_evals,
          num_gpus=flags.num_gpus,
          dtype=flags.dtype)

    if is_mpi:
      # If max_train_steps is set, use it instead of steps_per_eval_per_worker,
      # assuming max_train_steps is smaller than steps_per_eval_per_worker.
      # Also, when --steps is specified, train_epochs should be set equal to
      # epochs_between_evals so that
      # range(flags.train_epochs // flags.epochs_between_evals) yields 1.
      if (flags.max_train_steps and
          flags.max_train_steps < steps_per_eval_per_worker):
        train_steps = flags.max_train_steps
      else:
        train_steps = steps_per_eval_per_worker
      classifier.train(input_fn=input_fn_train,
                       hooks=train_hooks + [compliance_hook],
                       steps=train_steps)
    else:
      classifier.train(input_fn=input_fn_train,
                       hooks=train_hooks + [compliance_hook],
                       max_steps=flags.max_train_steps)

    # train_examples = int(_log_cache.pop()[_NUM_EXAMPLES_NAME])
    # mlperf_log.resnet_print(key=mlperf_log.INPUT_SIZE, value=train_examples)
    mllogger.end(key=mllog.constants.BLOCK_STOP, value=i + 1)

    print('Starting to evaluate.')

    # Evaluate the model and print results.
    def input_fn_eval():
      return input_function(
          is_training=False,
          data_dir=flags.data_dir,
          batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus),
          num_epochs=1,
          dtype=flags.dtype)

    mllogger.start(key=mllog.constants.EVAL_START)
    # flags.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data,
    # which will iterate forever. Passing steps=flags.max_train_steps allows
    # the eval (which is generally unimportant in those circumstances) to
    # terminate. Note that eval will run for max_train_steps each loop,
    # regardless of the global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags.max_train_steps)
    mllogger.event(key=mllog.constants.EVAL_SAMPLES,
                   value=int(eval_results[_NUM_EXAMPLES_NAME]))
    mllogger.event(key=mllog.constants.EVAL_ACCURACY,
                   value=float(eval_results['accuracy']))
    mllogger.end(key=mllog.constants.EVAL_STOP)
    print(eval_results)

    if benchmark_logger:
      benchmark_logger.log_estimator_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags.stop_threshold, eval_results['accuracy']):
      success = True
      break

  mllogger.event(key=mllog.constants.RUN_STOP, value={"success": success})
  mllogger.end(key=mllog.constants.RUN_STOP)
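
# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original loop): the per-worker step
# arithmetic computed inside resnet_main, restated as a standalone helper. The
# default values (an ImageNet-sized training set, batch size 256, 8 Horovod
# workers, 4 epochs between evals) are assumptions chosen to make the numbers
# concrete, not settings mandated by this script.
def _example_steps_per_eval_per_worker(num_train_images=1281167,
                                       batch_size=256,
                                       num_workers=8,
                                       epochs_between_evals=4):
  """Returns how many training steps each worker runs between evaluations."""
  steps_per_epoch = num_train_images // batch_size              # 5004 steps
  steps_per_epoch_per_worker = steps_per_epoch // num_workers   # 625 steps
  return steps_per_epoch_per_worker * epochs_between_evals      # 2500 steps
# ---------------------------------------------------------------------------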