def get_callbacks(model_checkpoint: bool = True,
                  include_tensorboard: bool = True,
                  time_history: bool = True,
                  track_lr: bool = True,
                  write_model_weights: bool = True,
                  initial_step: int = 0,
                  batch_size: int = 0,
                  log_steps: int = 0,
                  model_dir: Optional[str] = None
                  ) -> List[tf.keras.callbacks.Callback]:
  """Get all callbacks.

  Args:
    model_checkpoint: whether to save per-epoch weight checkpoints.
    include_tensorboard: whether to add the CustomTensorBoard callback.
    time_history: whether to add a TimeHistory throughput callback.
    track_lr: forwarded to CustomTensorBoard; logs the learning rate.
    write_model_weights: forwarded as `write_images` to CustomTensorBoard.
    initial_step: initial step passed to CustomTensorBoard.
    batch_size: batch size used by TimeHistory to compute throughput.
    log_steps: interval (in steps) at which TimeHistory logs.
    model_dir: directory for checkpoints and logs; treated as '' when None.

  Returns:
    A list of tf.keras.callbacks.Callback instances.
  """
  # Annotation fixed: `model_dir` accepts None, so it is Optional[str].
  model_dir = model_dir or ''
  callbacks = []
  if model_checkpoint:
    # '{epoch:04d}' is expanded by Keras at save time.
    ckpt_full_path = os.path.join(model_dir, 'model.ckpt-{epoch:04d}')
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint(
            ckpt_full_path, save_weights_only=True, verbose=1))
  if include_tensorboard:
    callbacks.append(
        CustomTensorBoard(
            log_dir=model_dir,
            track_lr=track_lr,
            initial_step=initial_step,
            write_images=write_model_weights))
  if time_history:
    callbacks.append(
        keras_utils.TimeHistory(
            batch_size,
            log_steps,
            # TimeHistory only writes summaries when TensorBoard is on.
            logdir=model_dir if include_tensorboard else None))
  return callbacks
def get_callbacks(steps_per_epoch, current_rank, cluster_size,
                  learning_rate_schedule_fn):
  """Returns common callbacks."""
  # Throughput logging is always on; the rest is flag-gated.
  callbacks = [keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)]

  if not FLAGS.use_tensor_lr and learning_rate_schedule_fn:
    callbacks.append(
        LearningRateBatchScheduler(
            learning_rate_schedule_fn,
            batch_size=FLAGS.batch_size,
            steps_per_epoch=steps_per_epoch,
            cluster_size=cluster_size))

  # Only the rank-0 worker writes TensorBoard summaries.
  if FLAGS.enable_tensorboard and current_rank == 0:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.profile_steps:
    callbacks.append(
        keras_utils.get_profiler_callback(FLAGS.model_dir,
                                          FLAGS.profile_steps,
                                          FLAGS.enable_tensorboard,
                                          steps_per_epoch))
  return callbacks
def _run_and_report_benchmark(self,
                              params,
                              min_ap=0.325,
                              max_ap=0.35,
                              do_eval=True,
                              warmup=1):
  """Starts RetinaNet accuracy benchmark test.

  Args:
    params: dict of model parameters; serialized into FLAGS.params_override
      and read for 'train' sub-keys ('batch_size', 'total_steps').
    min_ap: lower bound on acceptable AP, forwarded to _report_benchmark.
    max_ap: upper bound on acceptable AP, forwarded to _report_benchmark.
    do_eval: if True, runs an eval pass after training and merges its
      metrics into the training summary.
    warmup: warmup value forwarded to _report_benchmark.
  """
  FLAGS.params_override = json.dumps(params)
  # Need timer callback to measure performance
  self.timer_callback = keras_utils.TimeHistory(
      batch_size=params['train']['batch_size'],
      log_steps=FLAGS.log_steps,
  )

  start_time_sec = time.time()
  # NOTE: FLAGS.mode is mutated in place; train must run before eval.
  FLAGS.mode = 'train'
  summary, _ = self._run_detection_main()
  wall_time_sec = time.time() - start_time_sec

  if do_eval:
    FLAGS.mode = 'eval'
    eval_metrics = self._run_detection_main()
    summary.update(eval_metrics)

  summary['total_steps'] = params['train']['total_steps']
  self._report_benchmark(summary, start_time_sec, wall_time_sec,
                         min_ap, max_ap, warmup)
def get_callbacks(pruning_method=None,
                  enable_checkpoint_and_export=False,
                  model_dir=None):
  """Returns common callbacks."""
  # TimeHistory writes summaries only when TensorBoard logging is enabled.
  history_logdir = FLAGS.model_dir if FLAGS.enable_tensorboard else None
  callbacks = [
      keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps,
                              logdir=history_logdir)
  ]

  if FLAGS.enable_tensorboard:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(
            log_dir=FLAGS.model_dir, profile_batch=FLAGS.profile_steps))

  if pruning_method is not None:
    # Pruning requires the update-step callback; summaries need a log dir.
    callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
    if model_dir is not None:
      callbacks.append(
          tfmot.sparsity.keras.PruningSummaries(
              log_dir=model_dir, profile_batch=0))

  if enable_checkpoint_and_export and model_dir is not None:
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint(
            os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
            save_weights_only=True))
  return callbacks
def get_callbacks(learning_rate_schedule_fn, num_images):
  """Returns common callbacks."""
  # Throughput logging is unconditional; everything else is flag-gated.
  callbacks = [keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)]

  if not FLAGS.use_tensor_lr:
    callbacks.append(
        LearningRateBatchScheduler(
            learning_rate_schedule_fn,
            batch_size=FLAGS.batch_size,
            num_images=num_images))

  if FLAGS.enable_tensorboard:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.profile_steps:
    callbacks.append(
        keras_utils.get_profiler_callback(FLAGS.model_dir,
                                          FLAGS.profile_steps,
                                          FLAGS.enable_tensorboard))
  return callbacks
def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    tf.logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)
  # NOTE(review): batch_size is captured *before* the eval_batch_size
  # override below, so TimeHistory sees the un-rounded value — confirm
  # this is intended.
  batch_size = params["batch_size"]

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well.
  params['batch_size'] = params['eval_batch_size']

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(
      producer, params)
  # It is required that for distributed training, the dataset must call
  # batch(). The parameter of batch() here is the number of replicas involed,
  # such that each replica evenly gets a slice of data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = ncf_common.get_optimizer(params)
    # TimeHistory measures examples/sec during fit().
    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)

    keras_model.compile(loss=_keras_loss,
                        metrics=[_get_metric_fn(params)],
                        optimizer=optimizer)

    history = keras_model.fit(
        train_input_dataset,
        epochs=FLAGS.train_epochs,
        callbacks=[IncrementEpochCallback(producer), time_callback],
        verbose=2)

    tf.logging.info("Training done. Start evaluating")

    eval_results = keras_model.evaluate(
        eval_input_dataset, steps=num_eval_steps, verbose=2)

  tf.logging.info("Keras evaluation is done.")

  stats = build_stats(history, eval_results, time_callback)
  return stats
def main(_):
  """Trains/predicts/evaluates BERT on SQuAD according to FLAGS.mode."""
  # Users should always run this script under TF 2.x
  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  if FLAGS.mode == 'export_only':
    export_squad(FLAGS.model_export_path, input_meta_data)
    return

  # Configures cluster spec for multi-worker distribution strategy.
  if FLAGS.num_gpus > 0:
    _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                             FLAGS.task_index)
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      all_reduce_alg=FLAGS.all_reduce_alg,
      tpu_address=FLAGS.tpu)

  if 'train' in FLAGS.mode:
    if FLAGS.log_steps:
      # TimeHistory reports throughput every `log_steps` steps.
      custom_callbacks = [
          keras_utils.TimeHistory(
              batch_size=FLAGS.train_batch_size,
              log_steps=FLAGS.log_steps,
              logdir=FLAGS.model_dir,
          )
      ]
    else:
      custom_callbacks = None

    train_squad(
        strategy,
        input_meta_data,
        custom_callbacks=custom_callbacks,
        run_eagerly=FLAGS.run_eagerly,
    )
  if 'predict' in FLAGS.mode:
    predict_squad(strategy, input_meta_data)
  if 'eval' in FLAGS.mode:
    if input_meta_data.get('version_2_with_negative', False):
      logging.error('SQuAD v2 eval is not supported. '
                    'Falling back to predict mode.')
      predict_squad(strategy, input_meta_data)
    else:
      eval_metrics = eval_squad(strategy, input_meta_data)
      f1_score = eval_metrics['f1']
      logging.info('SQuAD eval F1-score: %f', f1_score)
      # Only the chief (or a strategy-less run) writes real summaries;
      # other workers dump to a throwaway temp dir.
      if (not strategy) or strategy.extended.should_save_summary:
        summary_dir = os.path.join(FLAGS.model_dir, 'summaries')
      else:
        summary_dir = tempfile.mkdtemp()
      summary_writer = tf.summary.create_file_writer(
          os.path.join(summary_dir, 'eval'))
      with summary_writer.as_default():
        # TODO(lehou): write to the correct step number.
        tf.summary.scalar('F1-score', f1_score, step=0)
      summary_writer.flush()
def run_bert(strategy, input_meta_data, model_config, train_input_fn=None, eval_input_fn=None, init_checkpoint=None, custom_callbacks=None, custom_metrics=None): """Run BERT training.""" # Enables XLA in Session Config. Should not be set for TPU. keras_utils.set_session_config(FLAGS.enable_xla) performance.set_mixed_precision_policy(common_flags.dtype(), use_experimental_api=False) epochs = FLAGS.num_train_epochs * FLAGS.num_eval_per_epoch train_data_size = ( input_meta_data['train_data_size'] // FLAGS.num_eval_per_epoch) if FLAGS.train_data_size: train_data_size = min(train_data_size, FLAGS.train_data_size) logging.info('Updated train_data_size: %s', train_data_size) steps_per_epoch = int(train_data_size / FLAGS.train_batch_size) warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size) eval_steps = int( math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size)) if not strategy: raise ValueError('Distribution strategy has not been specified.') if not custom_callbacks: custom_callbacks = [] if FLAGS.log_steps: custom_callbacks.append( keras_utils.TimeHistory( batch_size=FLAGS.train_batch_size, log_steps=FLAGS.log_steps, logdir=FLAGS.model_dir)) trained_model, _ = run_bert_classifier( strategy, model_config, input_meta_data, FLAGS.model_dir, epochs, steps_per_epoch, FLAGS.steps_per_loop, eval_steps, warmup_steps, FLAGS.learning_rate, init_checkpoint or FLAGS.init_checkpoint, train_input_fn, eval_input_fn, custom_callbacks=custom_callbacks, custom_metrics=custom_metrics) if FLAGS.model_export_path: model_saving_utils.export_bert_model( FLAGS.model_export_path, model=trained_model) return trained_model
def train_and_eval(self):
  """Trains the model, evaluating between epochs; returns a stats dict."""
  lr_schedule = optimizer.LearningRateSchedule(
      self.params["learning_rate"], self.params["hidden_size"],
      self.params["learning_rate_warmup_steps"])
  opt = tf.keras.optimizers.Adam(
      lr_schedule,
      self.params["optimizer_adam_beta1"],
      self.params["optimizer_adam_beta2"],
      epsilon=self.params["optimizer_adam_epsilon"])
  self.train_model.compile(opt)
  self.train_model.summary()

  # create train dataset
  # NOTE(review): the dataset appears to be sharded manually by Tarantella
  # rank (num_ranks/rank arguments) — confirm against data_pipeline.
  train_ds = data_pipeline.train_input_fn(self.params,
                                          shuffle_seed=42,
                                          num_ranks=tnt.get_size(),
                                          rank=tnt.get_rank())

  # enable global callbacks
  callbacks = []
  if self.flags_obj.enable_tensorboard and self.flags_obj.model_dir:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=self.flags_obj.model_dir))

  # enable logging callbacks only on the master rank
  if self.flags_obj.enable_time_history:
    time_callback = keras_utils.TimeHistory(self.params["batch_size"],
                                            self.params["num_sentences"],
                                            logdir=None)
    tnt_time_callback = tnt.keras.callbacks.Callback(time_callback,
                                                     aggregate_logs=False,
                                                     run_on_all_ranks=False)
    callbacks.append(tnt_time_callback)

  # print messages only once
  if tnt.is_master_rank():
    logging.info("Start train")

  stats = {}
  # Train in chunks of `epochs_between_evals` epochs.
  for epoch in range(0, self.params["train_epochs"],
                     self.params["epochs_between_evals"]):
    # as our dataset is distributed manually, disable the automatic
    # Tarantella distribution
    history = self.train_model.fit(
        train_ds,
        callbacks=callbacks,
        tnt_distribute_dataset=False,
        initial_epoch=epoch,
        epochs=epoch + min(self.params["epochs_between_evals"],
                           self.params["train_epochs"] - epoch),
        verbose=2)
    if tnt.is_master_rank():
      logging.info("Train history: {}".format(history.history))
      stats = misc.build_stats(history, callbacks)

  if tnt.is_master_rank():
    eval_stats = self.eval()
    stats.update(eval_stats)
  return stats
def main(_):
  """Runs SQuAD train/predict/eval (or export) according to FLAGS.mode."""
  gin.parse_config_files_and_bindings(FLAGS.gin_file, FLAGS.gin_param)
  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  if FLAGS.mode == 'export_only':
    export_squad(FLAGS.model_export_path, input_meta_data)
    return

  # Configures cluster spec for multi-worker distribution strategy.
  if FLAGS.num_gpus > 0:
    _ = distribute_utils.configure_cluster(FLAGS.worker_hosts,
                                           FLAGS.task_index)
  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      all_reduce_alg=FLAGS.all_reduce_alg,
      tpu_address=FLAGS.tpu)

  if 'train' in FLAGS.mode:
    if FLAGS.log_steps:
      # TimeHistory reports throughput every `log_steps` steps.
      custom_callbacks = [
          keras_utils.TimeHistory(
              batch_size=FLAGS.train_batch_size,
              log_steps=FLAGS.log_steps,
              logdir=FLAGS.model_dir,
          )
      ]
    else:
      custom_callbacks = None

    train_squad(
        strategy,
        input_meta_data,
        custom_callbacks=custom_callbacks,
        run_eagerly=FLAGS.run_eagerly,
        sub_model_export_name=FLAGS.sub_model_export_name,
    )
  if 'predict' in FLAGS.mode:
    predict_squad(strategy, input_meta_data)
  if 'eval' in FLAGS.mode:
    eval_metrics = eval_squad(strategy, input_meta_data)
    f1_score = eval_metrics['final_f1']
    logging.info('SQuAD eval F1-score: %f', f1_score)
    summary_dir = os.path.join(FLAGS.model_dir, 'summaries', 'eval')
    summary_writer = tf.summary.create_file_writer(summary_dir)
    with summary_writer.as_default():
      # TODO(lehou): write to the correct step number.
      tf.summary.scalar('F1-score', f1_score, step=0)
    summary_writer.flush()
    # Also write eval_metrics to json file.
    squad_lib_wp.write_to_json_files(
        eval_metrics, os.path.join(summary_dir, 'eval_metrics.json'))
  # NOTE(review): presumably gives asynchronous writers time to finish
  # before the process exits — confirm whether this sleep is still needed.
  time.sleep(60)
def get_callbacks(learning_rate_schedule_fn, num_images):
  """Returns common callbacks.

  Returns:
    A 3-tuple of (TimeHistory, TensorBoard, LearningRateBatchScheduler)
    callbacks, in that order.
  """
  return (
      keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps),
      tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir),
      LearningRateBatchScheduler(learning_rate_schedule_fn,
                                 batch_size=FLAGS.batch_size,
                                 num_images=num_images),
  )
def train_model(flags_obj, dataset, vocab_size, strategy, checkpoint_dir=None):
  """Trains a Shakespeare model.

  Args:
    flags_obj: An object containing parsed flag values.
    dataset: the training data set.
    vocab_size: the number of unique character classes.
    strategy: distribution strategy to use.
    checkpoint_dir: if not None, the directory in which to make checkpoints.

  Returns:
    The training history and callbacks.
  """
  if flags_obj.train_steps:
    train_steps = flags_obj.train_steps
  else:
    train_steps = BATCHES_PER_EPOCH // flags_obj.batch_size
  strategy_scope = distribution_utils.get_strategy_scope(strategy)

  with strategy_scope:
    model = build_model(vocab_size=vocab_size,
                        batch_size=flags_obj.batch_size,
                        use_cudnn=flags_obj.cudnn)
    # When keras_use_ctl is False, Model.fit() automatically applies
    # loss scaling so we don't need to create a LossScaleOptimizer.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[
            tf.keras.metrics.Recall(top_k=1, name='RecallAt1'),
            tf.keras.metrics.Recall(top_k=5, name='RecallAt5')
        ],
        run_eagerly=flags_obj.run_eagerly,
        experimental_run_tf_function=flags_obj.force_v2_in_keras_compile)

  callbacks = []
  if checkpoint_dir:
    # '{epoch}' is filled in by Keras at save time.
    checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt_{epoch}')
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_prefix, save_weights_only=True)
    callbacks.append(checkpoint_callback)
  time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                          flags_obj.log_steps)
  callbacks.append(time_callback)
  history = model.fit(dataset,
                      epochs=flags_obj.train_epochs,
                      steps_per_epoch=train_steps,
                      callbacks=callbacks,
                      verbose=2)
  return history, callbacks
def get_callbacks(
    model_checkpoint: bool = True,
    include_tensorboard: bool = True,
    time_history: bool = True,
    track_lr: bool = True,
    write_model_weights: bool = True,
    apply_moving_average: bool = False,
    initial_step: int = 0,
    batch_size: int = 0,
    log_steps: int = 0,
    model_dir: Optional[str] = None,
    backup_and_restore: bool = False) -> List[tf.keras.callbacks.Callback]:
  """Assembles the training callbacks selected by the arguments.

  The returned list contains, in order and only when enabled: a weight
  checkpoint, BackupAndRestore, CustomTensorBoard, a TimeHistory throughput
  logger, and moving-average checkpointing.
  """
  base_dir = model_dir or ''
  selected = []

  if model_checkpoint:
    selected.append(
        tf.keras.callbacks.ModelCheckpoint(
            os.path.join(base_dir, 'model.ckpt-{epoch:04d}'),
            save_weights_only=True,
            verbose=1))

  if backup_and_restore:
    selected.append(
        tf.keras.callbacks.experimental.BackupAndRestore(
            os.path.join(base_dir, 'tmp')))

  if include_tensorboard:
    selected.append(
        CustomTensorBoard(
            log_dir=base_dir,
            track_lr=track_lr,
            initial_step=initial_step,
            write_images=write_model_weights,
            profile_batch=0))

  if time_history:
    selected.append(
        keras_utils.TimeHistory(
            batch_size,
            log_steps,
            logdir=base_dir if include_tensorboard else None))

  if apply_moving_average:
    # Save moving average model to a different file so that
    # we can resume training from a checkpoint
    selected.append(
        AverageModelCheckpoint(
            update_weights=False,
            filepath=os.path.join(base_dir, 'average',
                                  'model.ckpt-{epoch:04d}'),
            save_weights_only=True,
            verbose=1))
    selected.append(MovingAverageCallback())

  return selected
def get_callbacks():
  """Returns common callbacks."""
  result = []

  if FLAGS.enable_time_history:
    # TimeHistory only gets a log dir when TensorBoard is also enabled.
    history_logdir = FLAGS.model_dir if FLAGS.enable_tensorboard else None
    result.append(
        keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps,
                                history_logdir))

  if FLAGS.enable_tensorboard:
    result.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  return result
def get_callbacks(steps_per_epoch,
                  learning_rate_schedule_fn=None,
                  pruning_method=None,
                  enable_checkpoint_and_export=False,
                  model_dir=None):
  """Returns common callbacks."""
  # Throughput logging is always on; everything else is conditional.
  callbacks = [keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps)]

  if not FLAGS.use_tensor_lr and learning_rate_schedule_fn:
    callbacks.append(
        LearningRateBatchScheduler(
            learning_rate_schedule_fn,
            batch_size=FLAGS.batch_size,
            steps_per_epoch=steps_per_epoch))

  if FLAGS.enable_tensorboard:
    callbacks.append(
        tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.profile_steps:
    callbacks.append(
        keras_utils.get_profiler_callback(FLAGS.model_dir,
                                          FLAGS.profile_steps,
                                          FLAGS.enable_tensorboard,
                                          steps_per_epoch))

  if pruning_method is not None:
    # Pruning needs the step-update callback; summaries need a log dir.
    callbacks.append(tfmot.sparsity.keras.UpdatePruningStep())
    if model_dir is not None:
      callbacks.append(
          tfmot.sparsity.keras.PruningSummaries(
              log_dir=model_dir, profile_batch=0))

  if enable_checkpoint_and_export and model_dir is not None:
    callbacks.append(
        tf.keras.callbacks.ModelCheckpoint(
            os.path.join(model_dir, 'model.ckpt-{epoch:04d}'),
            save_weights_only=True))

  return callbacks
def get_callbacks(steps_per_epoch):
  """Returns common callbacks."""
  result = []

  if FLAGS.enable_time_history:
    result.append(keras_utils.TimeHistory(FLAGS.batch_size, FLAGS.log_steps))

  if FLAGS.enable_tensorboard:
    result.append(tf.keras.callbacks.TensorBoard(log_dir=FLAGS.model_dir))

  if FLAGS.profile_steps:
    result.append(
        keras_utils.get_profiler_callback(FLAGS.model_dir,
                                          FLAGS.profile_steps,
                                          FLAGS.enable_tensorboard,
                                          steps_per_epoch))

  return result
def test_time_history(self):
  """TimeHistory records one timestamp entry per `log_steps` batches.

  Drives 7 batches through the callback with log_steps=3 and checks the
  timestamp log length. (Replaces the original hand-unrolled sequence of
  14 on_batch_begin/on_batch_end calls with an equivalent loop.)
  """
  th = keras_utils.TimeHistory(batch_size=128, log_steps=3)
  th.on_train_begin()
  for batch in range(7):
    th.on_batch_begin(batch)
    th.on_batch_end(batch)
  th.on_train_end()
  self.assertEqual(3, len(th.timestamp_log))
def main(_):
  """Runs BERT SQuAD train and/or predict according to FLAGS.mode."""
  # Users should always run this script under TF 2.x
  assert tf.version.VERSION.startswith('2.')

  with tf.io.gfile.GFile(FLAGS.input_meta_data_path, 'rb') as reader:
    input_meta_data = json.loads(reader.read().decode('utf-8'))

  if FLAGS.mode == 'export_only':
    export_squad(FLAGS.model_export_path, input_meta_data)
    return

  # Configures cluster spec for multi-worker distribution strategy.
  if FLAGS.num_gpus > 0:
    _ = distribution_utils.configure_cluster(FLAGS.worker_hosts,
                                             FLAGS.task_index)
  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      all_reduce_alg=FLAGS.all_reduce_alg,
      tpu_address=FLAGS.tpu)

  if FLAGS.mode in ('train', 'train_and_predict'):
    if FLAGS.log_steps:
      # TimeHistory reports throughput every `log_steps` steps.
      custom_callbacks = [
          keras_utils.TimeHistory(
              batch_size=FLAGS.train_batch_size,
              log_steps=FLAGS.log_steps,
              logdir=FLAGS.model_dir,
          )
      ]
    else:
      custom_callbacks = None

    train_squad(
        strategy,
        input_meta_data,
        custom_callbacks=custom_callbacks,
        run_eagerly=FLAGS.run_eagerly,
    )
  if FLAGS.mode in ('predict', 'train_and_predict'):
    predict_squad(strategy, input_meta_data)
def test_build_stats(self):
  """build_stats merges history, eval output and TimeHistory logs."""
  history = self._build_history(1.145, cat_accuracy=.99988)
  eval_output = self._build_eval_output(.56432111, 5.990)
  th = keras_utils.TimeHistory(128, 100)

  # Inject a fake timestamp log instead of running actual training.
  th.timestamp_log = [
      keras_utils.BatchTimestamp(0, 1),
      keras_utils.BatchTimestamp(1, 2),
      keras_utils.BatchTimestamp(2, 3)
  ]
  th.train_finish_time = 12345
  stats = common.build_stats(history, eval_output, [th])

  self.assertEqual(1.145, stats['loss'])
  self.assertEqual(.99988, stats['training_accuracy_top_1'])
  self.assertEqual(.56432111, stats['accuracy_top_1'])
  self.assertEqual(5.990, stats['eval_loss'])
  self.assertEqual(3, stats['step_timestamp_log'][2].timestamp)
  self.assertEqual(12345, stats['train_finish_time'])
def run(args, strategy):
  """Pretrains model using TF2. Adapted from the tensorflow/models Github.

  Args:
    args: parsed arguments namespace with the run configuration
      (bucket/project names, model class, training hyperparameters, ...).
    strategy: tf.distribute strategy used by the custom training loop.

  Raises:
    ValueError: if args.model_class has no entry in PRETRAINED_MODELS.
  """
  # CONFIG
  # Use timestamp to generate a unique run name
  run_name = get_run_name(args)
  logger.info(f'*** Starting run {run_name} ***')
  output_dir = f'gs://{args.bucket_name}/{args.project_name}/pretrain/runs/{run_name}'

  # pretrained model path
  try:
    pretrained_model_path = PRETRAINED_MODELS[args.model_class]['bucket_location']
  except KeyError:
    raise ValueError(f'Could not find a pretrained model matching the model class {args.model_class}')
  pretrained_model_config_path = f'gs://{args.bucket_name}/{pretrained_model_path}/bert_config.json'
  if args.init_checkpoint is None:
    pretrained_model_checkpoint_path = f'gs://{args.bucket_name}/{pretrained_model_path}/bert_model.ckpt'
  else:
    pretrained_model_checkpoint_path = f'gs://{args.bucket_name}/{args.project_name}/pretrain/runs/{args.init_checkpoint}'

  # some logging
  logger.info(f'Running pretraining of model {args.model_class} on pretrain data {args.pretrain_data}')
  logger.info(f'Initializing model from checkpoint {pretrained_model_checkpoint_path}')

  # load model config based on model_class
  model_config = get_model_config(pretrained_model_config_path)

  # input data function
  train_input_fn = get_dataset_fn(args, _type='train')
  eval_input_fn = None
  eval_metric_fn = None
  if args.do_eval:
    logger.info(f'Setting up evaluation dataset')
    eval_metric_fn = get_eval_metric_fn
    eval_input_fn = get_dataset_fn(args, _type='dev')

  # model_fn
  def _get_pretrained_model(end_lr=0.0):
    """Gets a pretraining model."""
    # NOTE(review): the `end_lr` parameter is unused in this body;
    # args.end_lr is passed to the optimizer instead — confirm intent.
    pretrain_model, core_model = bert_models.pretrain_model(
        model_config, args.max_seq_length, args.max_predictions_per_seq)
    if args.warmup_proportion is None:
      warmup_steps = args.warmup_steps
      warmup_proportion_perc = 100 * args.warmup_steps/(args.num_epochs * args.num_steps_per_epoch)
    else:
      warmup_steps = int(args.num_epochs * args.num_steps_per_epoch * args.warmup_proportion)
      warmup_proportion_perc = args.warmup_proportion * 100
    logger.info(f'Running {warmup_steps:,} warmup steps ({warmup_proportion_perc:.2f}% warmup)')
    optimizer = utils.optimizer.create_optimizer(
        args.learning_rate, args.num_steps_per_epoch * args.num_epochs,
        warmup_steps, args.end_lr, args.optimizer_type)
    pretrain_model.optimizer = configure_optimizer(
        optimizer, use_float16=args.dtype == 'fp16', use_graph_rewrite=False)
    return pretrain_model, core_model

  # custom callbacks
  summary_dir = os.path.join(output_dir, 'summaries')
  time_history_callback = keras_utils.TimeHistory(
      batch_size=args.train_batch_size,
      log_steps=args.time_history_log_steps,
      logdir=summary_dir)
  custom_callbacks = [time_history_callback]

  # Save an initial version of the log file
  data = {
      'created_at': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
      'run_name': run_name,
      'num_train_steps': args.num_steps_per_epoch * args.num_epochs,
      'eval_steps': args.eval_steps,
      'model_dir': output_dir,
      'output_dir': output_dir,
      **vars(args),
  }
  # write initial training log
  f_path_training_log = os.path.join(output_dir, 'run_logs.json')
  logger.info(f'Writing training preliminary log to {f_path_training_log}...')
  save_to_json(data, f_path_training_log)

  # run training loop
  logger.info(f'Run training for {args.num_epochs:,} epochs, {args.num_steps_per_epoch:,} steps each, processing {args.num_epochs*args.num_steps_per_epoch*args.train_batch_size:,} training examples in total...')
  time_start = time.time()
  model_training_utils.run_customized_training_loop(
      strategy=strategy,
      model_fn=_get_pretrained_model,
      loss_fn=get_loss_fn(),
      scale_loss=True,
      model_dir=output_dir,
      train_input_fn=train_input_fn,
      steps_per_epoch=args.num_steps_per_epoch,
      steps_per_loop=args.steps_per_loop,
      epochs=args.num_epochs,
      eval_input_fn=eval_input_fn,
      eval_steps=args.eval_steps,
      metric_fn=eval_metric_fn,
      init_checkpoint=pretrained_model_checkpoint_path,
      load_mlm_nsp_weights=args.load_mlm_nsp_weights,
      set_trainstep=args.set_trainstep,
      custom_callbacks=custom_callbacks,
      run_eagerly=False,
      sub_model_export_name='pretrained/bert_model',
      explicit_allreduce=False,
      pre_allreduce_callbacks=None,
      post_allreduce_callbacks=None)
  time_end = time.time()
  training_time_min = (time_end-time_start)/60
  data['training_time_min'] = training_time_min
  logger.info(f'Finished training after {training_time_min:.1f} min')

  # Write to run directory
  logger.info(f'Writing final training log to {f_path_training_log}...')
  save_to_json(data, f_path_training_log)

  # Write bert config
  f_path_bert_config = os.path.join(output_dir, 'bert_config.json')
  logger.info(f'Writing BERT config to {f_path_bert_config}...')
  save_to_json(model_config.to_dict(), f_path_bert_config)
def run_ncf(_):
  """Run NCF training and eval with Keras."""
  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size)
    )
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)

  if params["keras_use_ctl"] and int(tf.__version__.split(".")[0]) == 1:
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and above.")
    return

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well. This is the
  # per device batch size
  params["batch_size"] = params["eval_batch_size"]
  batch_size = params["batch_size"]

  num_users, num_items, num_train_steps, num_eval_steps, producer = (
      ncf_common.get_inputs(params))

  params["num_users"], params["num_items"] = num_users, num_items
  producer.start()
  model_helpers.apply_clean(flags.FLAGS)

  batches_per_step = params["batches_per_step"]
  train_input_dataset, eval_input_dataset = _get_train_and_eval_data(producer,
                                                                     params)
  # It is required that for distributed training, the dataset must call
  # batch(). The parameter of batch() here is the number of replicas involed,
  # such that each replica evenly gets a slice of data.
  train_input_dataset = train_input_dataset.batch(batches_per_step)
  eval_input_dataset = eval_input_dataset.batch(batches_per_step)

  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  per_epoch_callback = IncrementEpochCallback(producer)
  callbacks = [per_epoch_callback, time_callback]

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_metric_fn", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  strategy = ncf_common.get_distribution_strategy(params)
  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

  if params["keras_use_ctl"]:
    # Custom training loop: drives train/eval step functions manually and
    # feeds the TimeHistory callback by hand.
    loss_object = tf.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.SUM,
        from_logits=True)
    train_input_iterator = strategy.make_dataset_iterator(train_input_dataset)
    eval_input_iterator = strategy.make_dataset_iterator(eval_input_dataset)

    @tf.function
    def train_step():
      """Called once per step to train the model."""

      def step_fn(inputs):
        """Computes loss and applied gradient per replica."""
        features, labels = inputs
        with tf.GradientTape() as tape:
          softmax_logits = keras_model(features)
          loss = loss_object(labels, softmax_logits,
                             sample_weight=features[rconst.VALID_POINT_MASK])
          # Normalize the SUM-reduced loss by the global batch size.
          loss *= (1.0 / (batch_size*strategy.num_replicas_in_sync))

        grads = tape.gradient(loss, keras_model.trainable_variables)
        optimizer.apply_gradients(
            list(zip(grads, keras_model.trainable_variables)))
        return loss

      per_replica_losses = strategy.experimental_run(step_fn,
                                                     train_input_iterator)
      mean_loss = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_losses, axis=None)
      return mean_loss

    @tf.function
    def eval_step():
      """Called once per eval step to compute eval metrics."""

      def step_fn(inputs):
        """Computes eval metrics per replica."""
        features, _ = inputs
        softmax_logits = keras_model(features)
        in_top_k, metric_weights = metric_fn(
            softmax_logits, features[rconst.DUPLICATE_MASK], params)
        hr_sum = tf.reduce_sum(in_top_k*metric_weights)
        hr_count = tf.reduce_sum(metric_weights)
        return hr_sum, hr_count

      per_replica_hr_sum, per_replica_hr_count = (
          strategy.experimental_run(step_fn, eval_input_iterator))
      hr_sum = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_sum, axis=None)
      hr_count = strategy.reduce(
          tf.distribute.ReduceOp.SUM, per_replica_hr_count, axis=None)
      return hr_sum, hr_count

    time_callback.on_train_begin()
    for epoch in range(FLAGS.train_epochs):
      per_epoch_callback.on_epoch_begin(epoch)
      train_input_iterator.initialize()
      train_loss = 0
      for step in range(num_train_steps):
        # Global step index feeds TimeHistory's throughput accounting.
        time_callback.on_batch_begin(step+epoch*num_train_steps)
        train_loss += train_step()
        time_callback.on_batch_end(step+epoch*num_train_steps)
      train_loss /= num_train_steps
      logging.info("Done training epoch %s, epoch loss=%s.",
                   epoch+1, train_loss)

      eval_input_iterator.initialize()
      hr_sum = 0
      hr_count = 0
      for _ in range(num_eval_steps):
        step_hr_sum, step_hr_count = eval_step()
        hr_sum += step_hr_sum
        hr_count += step_hr_count
      logging.info("Done eval epoch %s, hr=%s.", epoch+1, hr_sum/hr_count)

      if (FLAGS.early_stopping and
          float(hr_sum/hr_count) > params["hr_threshold"]):
        break

    time_callback.on_train_end()
    # NOTE(review): appears to mimic Keras evaluate() output [loss, metric]
    # with loss unset — confirm build_stats handles the None entry.
    eval_results = [None, hr_sum/hr_count]
  else:
    with distribution_utils.get_strategy_scope(strategy):
      keras_model.compile(optimizer=optimizer)

      history = keras_model.fit(train_input_dataset,
                                steps_per_epoch=num_train_steps,
                                epochs=FLAGS.train_epochs,
                                callbacks=callbacks,
                                validation_data=eval_input_dataset,
                                validation_steps=num_eval_steps,
                                verbose=2)

      logging.info("Training done. Start evaluating")

      eval_results = keras_model.evaluate(
          eval_input_dataset, steps=num_eval_steps, verbose=2)

      logging.info("Keras evaluation is done.")

    if history and history.history:
      train_history = history.history
      train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
def run_ncf(_):
  """Run NCF training and eval with Keras.

  Reads configuration from module-level FLAGS. Builds the input pipeline
  (online producer thread or pre-generated TFRecords), constructs the Keras
  NCF model under the distribution-strategy scope, then either runs a custom
  training loop (keras_use_ctl) or Keras compile/fit.

  Returns:
    Stats dict from build_stats, or None if an unsupported flag combination
    was detected (errors are logged rather than raised).
  """
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    # Use logging (consistent with the rest of this function) instead of a
    # bare print so the message lands in the same stream as other status.
    logging.info("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  params = ncf_common.parse_flags(FLAGS)
  model_helpers.apply_clean(flags.FLAGS)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)
  params["distribute_strategy"] = strategy

  # Guard against flag combinations this implementation does not support;
  # log-and-return rather than raise so the caller exits cleanly.
  if not keras_utils.is_v2_0() and strategy is not None:
    logging.error("NCF Keras only works with distribution strategy in TF 2.0")
    return
  if (params["keras_use_ctl"] and
      (not keras_utils.is_v2_0() or strategy is None)):
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and dist strat.")
    return
  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
    return

  batch_size = params["batch_size"]
  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    # Offline data: user/item counts come from the serialized metadata file.
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  use_remote_tpu = params["use_tpu"] and FLAGS.tpu
  primary_cpu_task = tpu_lib.get_primary_cpu_task(use_remote_tpu)

  # Pin input-pipeline/model construction to the primary CPU task so remote
  # TPU workers resolve devices correctly.
  with tf.device(primary_cpu_task):
    (train_input_dataset, eval_input_dataset, num_train_steps,
     num_eval_steps) = (
         ncf_input_pipeline.create_ncf_input_data(params, producer,
                                                  input_meta_data, strategy))
    # Online generation produces a fresh sample each epoch, so Keras must not
    # fix steps_per_epoch in that case.
    steps_per_epoch = None if generate_input_online else num_train_steps

    with distribution_utils.get_strategy_scope(strategy):
      keras_model = _get_keras_model(params)
      optimizer = tf.keras.optimizers.Adam(
          learning_rate=params["learning_rate"],
          beta_1=params["beta1"],
          beta_2=params["beta2"],
          epsilon=params["epsilon"])

      if FLAGS.dtype == "fp16":
        optimizer = \
          tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
              optimizer,
              loss_scale=flags_core.get_loss_scale(FLAGS,
                                                   default_for_fp16="dynamic"))

      if params["keras_use_ctl"]:
        train_loss, eval_results = run_ncf_custom_training(
            params,
            strategy,
            keras_model,
            optimizer,
            callbacks,
            train_input_dataset,
            eval_input_dataset,
            num_train_steps,
            num_eval_steps,
            generate_input_online=generate_input_online)
      else:
        # TODO(b/138957587): Remove when force_v2_in_keras_compile is on longer
        # a valid arg for this model. Also remove as a valid flag.
        if FLAGS.force_v2_in_keras_compile is not None:
          keras_model.compile(
              optimizer=optimizer,
              run_eagerly=FLAGS.run_eagerly,
              experimental_run_tf_function=FLAGS.force_v2_in_keras_compile)
        else:
          keras_model.compile(optimizer=optimizer,
                              run_eagerly=FLAGS.run_eagerly)

        history = keras_model.fit(train_input_dataset,
                                  epochs=FLAGS.train_epochs,
                                  steps_per_epoch=steps_per_epoch,
                                  callbacks=callbacks,
                                  validation_data=eval_input_dataset,
                                  validation_steps=num_eval_steps,
                                  verbose=2)

        logging.info("Training done. Start evaluating")

        eval_loss_and_metrics = keras_model.evaluate(eval_input_dataset,
                                                     steps=num_eval_steps,
                                                     verbose=2)

        logging.info("Keras evaluation is done.")

        # Keras evaluate() API returns scalar loss and metric values from
        # evaluation as a list. Here, the returned list would contain
        # [evaluation loss, hr sum, hr count].
        eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

        # Format evaluation result into [eval loss, eval hit accuracy].
        eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

        if history and history.history:
          train_history = history.history
          train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                 enable_xla=flags_obj.enable_xla)

  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    # channels_first is faster on CUDA GPUs; fall back to channels_last
    # elsewhere.
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      num_workers=distribution_utils.configure_cluster(),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs)

  train_ds, test_ds = get_input_dataset(flags_obj, strategy)
  train_steps, train_epochs, eval_steps = get_num_train_iterations(flags_obj)

  time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                          flags_obj.log_steps)

  # Model, optimizer and metrics must all be created under the strategy
  # scope so their variables are mirrored across replicas.
  strategy_scope = distribution_utils.get_strategy_scope(strategy)
  with strategy_scope:
    model = resnet_model.resnet50(
        num_classes=imagenet_preprocessing.NUM_CLASSES,
        batch_size=flags_obj.batch_size,
        use_l2_regularizer=not flags_obj.single_l2_loss_op)
    optimizer = tf.keras.optimizers.SGD(
        learning_rate=common.BASE_LEARNING_RATE, momentum=0.9, nesterov=True)

    if flags_obj.fp16_implementation == "graph_rewrite":
      if not flags_obj.use_tf_function:
        raise ValueError("--fp16_implementation=graph_rewrite requires "
                         "--use_tf_function to be true")
      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
          optimizer, loss_scale)

    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'training_accuracy', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)

    trainable_variables = model.trainable_variables

    def train_step(train_ds_inputs):
      """Training StepFn."""

      def step_fn(inputs):
        """Per-Replica StepFn."""
        images, labels = inputs
        with tf.GradientTape() as tape:
          logits = model(images, training=True)
          prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
              labels, logits)
          # Sum over the per-replica batch, scaled by 1/global batch size, so
          # the SUM reduction across replicas yields the global mean loss.
          loss = tf.reduce_sum(prediction_loss) * (1.0 / flags_obj.batch_size)
          num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
          if flags_obj.single_l2_loss_op:
            # Apply weight decay to all non-batch-norm variables in one
            # l2_loss op (matched by 'bn' in the variable name).
            filtered_variables = [
                tf.reshape(v, (-1, ))
                for v in trainable_variables
                if 'bn' not in v.name
            ]
            l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.nn.l2_loss(
                tf.concat(filtered_variables, axis=0))
            # Divide by num_replicas: the regularizer is added on every
            # replica and summed by the cross-replica reduction.
            loss += (l2_loss / num_replicas)
          else:
            loss += (tf.reduce_sum(model.losses) / num_replicas)

          # Scale the loss
          if flags_obj.dtype == "fp16":
            loss = optimizer.get_scaled_loss(loss)

        grads = tape.gradient(loss, trainable_variables)

        # Unscale the grads
        if flags_obj.dtype == "fp16":
          grads = optimizer.get_unscaled_gradients(grads)

        optimizer.apply_gradients(zip(grads, trainable_variables))
        training_accuracy.update_state(labels, logits)
        return loss

      if strategy:
        per_replica_losses = strategy.experimental_run_v2(
            step_fn, args=(train_ds_inputs, ))
        return strategy.reduce(tf.distribute.ReduceOp.SUM,
                               per_replica_losses,
                               axis=None)
      else:
        return step_fn(train_ds_inputs)

    def test_step(test_ds_inputs):
      """Evaluation StepFn."""

      def step_fn(inputs):
        images, labels = inputs
        logits = model(images, training=False)
        loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
        loss = tf.reduce_sum(loss) * (1.0 / flags_obj.batch_size)
        test_loss.update_state(loss)
        test_accuracy.update_state(labels, logits)

      if strategy:
        strategy.experimental_run_v2(step_fn, args=(test_ds_inputs, ))
      else:
        step_fn(test_ds_inputs)

    # Optionally compile the step functions into graphs for performance.
    if flags_obj.use_tf_function:
      train_step = tf.function(train_step)
      test_step = tf.function(test_step)

    time_callback.on_train_begin()
    for epoch in range(train_epochs):
      # A fresh iterator per epoch restarts the (finite) dataset.
      train_iter = iter(train_ds)
      total_loss = 0.0
      training_accuracy.reset_states()

      for step in range(train_steps):
        # Learning rate is recomputed per step from the schedule.
        optimizer.lr = common.learning_rate_schedule(epoch, step, train_steps,
                                                     flags_obj.batch_size)
        time_callback.on_batch_begin(step + epoch * train_steps)
        total_loss += train_step(next(train_iter))
        time_callback.on_batch_end(step + epoch * train_steps)

      train_loss = total_loss / train_steps
      logging.info('Training loss: %s, accuracy: %s%% at epoch: %d',
                   train_loss.numpy(),
                   training_accuracy.result().numpy(), epoch)

      if (not flags_obj.skip_eval and
          (epoch + 1) % flags_obj.epochs_between_evals == 0):
        test_loss.reset_states()
        test_accuracy.reset_states()

        test_iter = iter(test_ds)
        for _ in range(eval_steps):
          test_step(next(test_iter))

        logging.info('Test loss: %s, accuracy: %s%% at epoch: %d',
                     test_loss.result().numpy(),
                     test_accuracy.result().numpy(), epoch)

    time_callback.on_train_end()

    eval_result = None
    train_result = None
    if not flags_obj.skip_eval:
      # Results reflect the metric state after the final epoch's eval pass.
      eval_result = [
          test_loss.result().numpy(),
          test_accuracy.result().numpy()
      ]
      train_result = [
          train_loss.numpy(),
          training_accuracy.result().numpy()
      ]

    stats = build_stats(train_result, eval_result, time_callback)
    return stats
def run(callbacks=None):
  """Builds config and input pipelines from FLAGS and launches the executor.

  Args:
    callbacks: Optional list of Keras callbacks. When --log_steps is set, a
      TimeHistory callback is appended to it (the caller's list is mutated).

  Returns:
    The result of run_executor for the selected FLAGS.mode.

  Raises:
    ValueError: If neither a training nor an eval file pattern is available.
  """
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  # Layer the configuration: model template, then config file, then CLI
  # params_override, then the strategy/model_dir flags.
  params = config_factory.config_generator(FLAGS.model)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)
  params.override(
      {
          'strategy_type': FLAGS.strategy_type,
          'model_dir': FLAGS.model_dir,
          'strategy_config': executor.strategy_flags_dict(),
      },
      is_strict=False)

  # Make sure use_tpu and strategy_type are in sync.
  params.use_tpu = (params.strategy_type == 'tpu')
  if not params.use_tpu:
    # bfloat16 and sync batch norm are only enabled on TPU.
    params.override(
        {
            'architecture': {
                'use_bfloat16': False,
            },
            'norm_activation': {
                'use_sync_bn': False,
            },
        },
        is_strict=True)

  params.validate()
  params.lock()

  logging.info('Model Parameters: %s',
               pprint.PrettyPrinter().pformat(params.as_dict()))

  # Flags take precedence over the patterns baked into the config.
  train_pattern = (FLAGS.training_file_pattern or
                   params.train.train_file_pattern)
  eval_pattern = FLAGS.eval_file_pattern or params.eval.eval_file_pattern
  if not (train_pattern or eval_pattern):
    raise ValueError('Must provide at least one of training_file_pattern and '
                     'eval_file_pattern.')

  train_input_fn = None
  if train_pattern:
    # Use global batch size for single host.
    train_input_fn = input_reader.InputFn(
        file_pattern=train_pattern,
        params=params,
        mode=input_reader.ModeKeys.TRAIN,
        batch_size=params.train.batch_size)

  eval_input_fn = None
  if eval_pattern:
    eval_input_fn = input_reader.InputFn(
        file_pattern=eval_pattern,
        params=params,
        mode=input_reader.ModeKeys.PREDICT_WITH_GT,
        batch_size=params.eval.batch_size,
        num_examples=params.eval.eval_samples)

  if callbacks is None:
    callbacks = []
  if FLAGS.log_steps:
    callbacks.append(
        keras_utils.TimeHistory(
            batch_size=params.train.batch_size,
            log_steps=FLAGS.log_steps,
        ))

  return run_executor(params,
                      FLAGS.mode,
                      checkpoint_path=FLAGS.checkpoint_path,
                      train_input_fn=train_input_fn,
                      eval_input_fn=eval_input_fn,
                      callbacks=callbacks)
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                 enable_xla=flags_obj.enable_xla)

  dtype = flags_core.get_tf_dtype(flags_obj)
  if dtype == tf.bfloat16:
    # bfloat16 uses the Keras mixed-precision policy path (TPU-style).
    policy = tf.compat.v2.keras.mixed_precision.experimental.Policy(
        'mixed_bfloat16')
    tf.compat.v2.keras.mixed_precision.experimental.set_policy(policy)

  # TODO(anj-s): Set data_format without using Keras.
  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.test.is_built_with_cuda() else 'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      num_workers=distribution_utils.configure_cluster(),
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu)

  train_ds, test_ds = get_input_dataset(flags_obj, strategy)
  per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
      flags_obj)
  # Never run more steps per host loop than fit in one epoch.
  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)
  logging.info(
      "Training %d epochs, each epoch has %d steps, "
      "total steps: %d; Eval %d steps", train_epochs, per_epoch_steps,
      train_epochs * per_epoch_steps, eval_steps)

  time_callback = keras_utils.TimeHistory(flags_obj.batch_size,
                                          flags_obj.log_steps)

  # Model, optimizer and metrics are created under the strategy scope so
  # their variables are mirrored across replicas.
  with distribution_utils.get_strategy_scope(strategy):
    model = resnet_model.resnet50(
        num_classes=imagenet_preprocessing.NUM_CLASSES,
        batch_size=flags_obj.batch_size,
        use_l2_regularizer=not flags_obj.single_l2_loss_op)

    lr_schedule = common.PiecewiseConstantDecayWithWarmup(
        batch_size=flags_obj.batch_size,
        epoch_size=imagenet_preprocessing.NUM_IMAGES['train'],
        warmup_epochs=common.LR_SCHEDULE[0][1],
        boundaries=list(p[1] for p in common.LR_SCHEDULE[1:]),
        multipliers=list(p[0] for p in common.LR_SCHEDULE),
        compute_lr_on_cpu=True)
    optimizer = common.get_optimizer(lr_schedule)

    if flags_obj.fp16_implementation == 'graph_rewrite':
      if not flags_obj.use_tf_function:
        raise ValueError('--fp16_implementation=graph_rewrite requires '
                         '--use_tf_function to be true')
      loss_scale = flags_core.get_loss_scale(flags_obj, default_for_fp16=128)
      optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
          optimizer, loss_scale)

    train_loss = tf.keras.metrics.Mean('train_loss', dtype=tf.float32)
    training_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'training_accuracy', dtype=tf.float32)
    test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(
        'test_accuracy', dtype=tf.float32)

    trainable_variables = model.trainable_variables

    def step_fn(inputs):
      """Per-Replica StepFn."""
      images, labels = inputs
      with tf.GradientTape() as tape:
        logits = model(images, training=True)
        prediction_loss = tf.keras.losses.sparse_categorical_crossentropy(
            labels, logits)
        # Sum scaled by 1/global batch size so the cross-replica SUM
        # reduction yields the global mean.
        loss = tf.reduce_sum(prediction_loss) * (1.0 / flags_obj.batch_size)
        num_replicas = tf.distribute.get_strategy().num_replicas_in_sync
        if flags_obj.single_l2_loss_op:
          # Single fused L2 op over all non-batch-norm variables.
          filtered_variables = [
              tf.reshape(v, (-1, ))
              for v in trainable_variables
              if 'bn' not in v.name
          ]
          l2_loss = resnet_model.L2_WEIGHT_DECAY * 2 * tf.nn.l2_loss(
              tf.concat(filtered_variables, axis=0))
          loss += (l2_loss / num_replicas)
        else:
          loss += (tf.reduce_sum(model.losses) / num_replicas)

        # Scale the loss
        if flags_obj.dtype == "fp16":
          loss = optimizer.get_scaled_loss(loss)

      grads = tape.gradient(loss, trainable_variables)

      # Unscale the grads
      if flags_obj.dtype == "fp16":
        grads = optimizer.get_unscaled_gradients(grads)

      optimizer.apply_gradients(zip(grads, trainable_variables))
      train_loss.update_state(loss)
      training_accuracy.update_state(labels, logits)

    @tf.function
    def train_steps(iterator, steps):
      """Performs distributed training steps in a loop."""
      # tf.range keeps the loop inside the graph, amortizing launch overhead
      # over `steps` iterations.
      for _ in tf.range(steps):
        strategy.experimental_run_v2(step_fn, args=(next(iterator), ))

    def train_single_step(iterator):
      # One-step variant used when only a single step remains in the epoch.
      if strategy:
        strategy.experimental_run_v2(step_fn, args=(next(iterator), ))
      else:
        return step_fn(next(iterator))

    def test_step(iterator):
      """Evaluation StepFn."""

      def step_fn(inputs):
        images, labels = inputs
        logits = model(images, training=False)
        loss = tf.keras.losses.sparse_categorical_crossentropy(labels, logits)
        loss = tf.reduce_sum(loss) * (1.0 / flags_obj.batch_size)
        test_loss.update_state(loss)
        test_accuracy.update_state(labels, logits)

      if strategy:
        strategy.experimental_run_v2(step_fn, args=(next(iterator), ))
      else:
        step_fn(next(iterator))

    if flags_obj.use_tf_function:
      train_single_step = tf.function(train_single_step)
      test_step = tf.function(test_step)

    # The training iterator persists across epochs (dataset is assumed to
    # provide enough data); eval gets a fresh iterator each time.
    train_iter = iter(train_ds)
    time_callback.on_train_begin()
    for epoch in range(train_epochs):
      train_loss.reset_states()
      training_accuracy.reset_states()

      steps_in_current_epoch = 0
      while steps_in_current_epoch < per_epoch_steps:
        time_callback.on_batch_begin(steps_in_current_epoch +
                                     epoch * per_epoch_steps)
        steps = _steps_to_run(steps_in_current_epoch, per_epoch_steps,
                              steps_per_loop)

        if steps == 1:
          train_single_step(train_iter)
        else:
          # Converts steps to a Tensor to avoid tf.function retracing.
          train_steps(train_iter,
                      tf.convert_to_tensor(steps, dtype=tf.int32))
        time_callback.on_batch_end(steps_in_current_epoch +
                                   epoch * per_epoch_steps)
        steps_in_current_epoch += steps

      logging.info('Training loss: %s, accuracy: %s at epoch %d',
                   train_loss.result().numpy(),
                   training_accuracy.result().numpy(), epoch + 1)

      if (not flags_obj.skip_eval and
          (epoch + 1) % flags_obj.epochs_between_evals == 0):
        test_loss.reset_states()
        test_accuracy.reset_states()

        test_iter = iter(test_ds)
        for _ in range(eval_steps):
          test_step(test_iter)

        logging.info('Test loss: %s, accuracy: %s%% at epoch: %d',
                     test_loss.result().numpy(),
                     test_accuracy.result().numpy(), epoch + 1)

    time_callback.on_train_end()

    eval_result = None
    train_result = None
    if not flags_obj.skip_eval:
      eval_result = [
          test_loss.result().numpy(),
          test_accuracy.result().numpy()
      ]
      train_result = [
          train_loss.result().numpy(),
          training_accuracy.result().numpy()
      ]

    stats = build_stats(train_result, eval_result, time_callback)
    return stats
def run_bert(strategy,
             input_meta_data,
             model_config,
             train_input_fn=None,
             eval_input_fn=None):
  """Run BERT training.

  Args:
    strategy: Distribution strategy to train under; required unless
      FLAGS.mode == 'export_only'.
    input_meta_data: Dict with at least 'train_data_size' and
      'eval_data_size' entries.
    model_config: BERT model configuration passed through to the classifier.
    train_input_fn: Optional training input fn forwarded to the classifier.
    eval_input_fn: Optional eval input fn forwarded to the classifier.

  Returns:
    The trained Keras model (None return path only via 'export_only' mode).

  Raises:
    ValueError: If FLAGS.mode is unsupported or no strategy is given.
  """
  if FLAGS.mode == 'export_only':
    # As Keras ModelCheckpoint callback used with Keras compile/fit() API
    # internally uses model.save_weights() to save checkpoints, we must
    # use model.load_weights() when Keras compile/fit() is used.
    export_classifier(FLAGS.model_export_path, input_meta_data,
                      FLAGS.use_keras_compile_fit, model_config,
                      FLAGS.model_dir)
    return

  if FLAGS.mode != 'train_and_eval':
    raise ValueError('Unsupported mode is specified: %s' % FLAGS.mode)
  # Enables XLA in Session Config. Should not be set for TPU.
  keras_utils.set_config_v2(FLAGS.enable_xla)
  performance.set_mixed_precision_policy(common_flags.dtype())

  epochs = FLAGS.num_train_epochs
  train_data_size = input_meta_data['train_data_size']
  steps_per_epoch = int(train_data_size / FLAGS.train_batch_size)
  # Warm up over the first 10% of total training steps.
  warmup_steps = int(epochs * train_data_size * 0.1 / FLAGS.train_batch_size)
  eval_steps = int(
      math.ceil(input_meta_data['eval_data_size'] / FLAGS.eval_batch_size))

  if not strategy:
    raise ValueError('Distribution strategy has not been specified.')

  if FLAGS.log_steps:
    custom_callbacks = [
        keras_utils.TimeHistory(
            batch_size=FLAGS.train_batch_size,
            log_steps=FLAGS.log_steps,
            logdir=FLAGS.model_dir,
        )
    ]
  else:
    custom_callbacks = None

  trained_model = run_bert_classifier(strategy,
                                      model_config,
                                      input_meta_data,
                                      FLAGS.model_dir,
                                      epochs,
                                      steps_per_epoch,
                                      FLAGS.steps_per_loop,
                                      eval_steps,
                                      warmup_steps,
                                      FLAGS.learning_rate,
                                      FLAGS.init_checkpoint,
                                      train_input_fn,
                                      eval_input_fn,
                                      run_eagerly=FLAGS.run_eagerly,
                                      use_keras_compile_fit=FLAGS.
                                      use_keras_compile_fit,
                                      custom_callbacks=custom_callbacks)

  if FLAGS.model_export_path:
    # As Keras ModelCheckpoint callback used with Keras compile/fit() API
    # internally uses model.save_weights() to save checkpoints, we must
    # use model.load_weights() when Keras compile/fit() is used.
    model_saving_utils.export_bert_model(
        FLAGS.model_export_path,
        model=trained_model,
        restore_model_using_load_weights=FLAGS.use_keras_compile_fit)
  return trained_model
def run_ncf(_):
  """Run NCF training and eval with Keras.

  Reads configuration from module-level FLAGS. Depending on
  params["keras_use_ctl"], trains either with an inline custom training loop
  (tf.function train/eval steps over strategy iterators) or with Keras
  compile/fit.

  Returns:
    Stats dict from build_stats, or None if an unsupported flag combination
    was detected (errors are logged rather than raised).
  """
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    # NOTE(review): bare print; other status in this function uses logging —
    # consider logging.info for consistency.
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  # TODO(seemuch): Support different train and eval batch sizes
  if FLAGS.eval_batch_size != FLAGS.batch_size:
    logging.warning(
        "The Keras implementation of NCF currently does not support batch_size "
        "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
        "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
    FLAGS.eval_batch_size = FLAGS.batch_size

  params = ncf_common.parse_flags(FLAGS)
  model_helpers.apply_clean(flags.FLAGS)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus)
  params["distribute_strategy"] = strategy

  # Unsupported configurations are logged and cause an early return.
  if not keras_utils.is_v2_0() and strategy is not None:
    logging.error("NCF Keras only works with distribution strategy in TF 2.0")
    return

  if (params["keras_use_ctl"] and
      (not keras_utils.is_v2_0() or strategy is None)):
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and dist strat.")
    return

  # ncf_common rounds eval_batch_size (this is needed due to a reshape during
  # eval). This carries over that rounding to batch_size as well. This is the
  # per device batch size
  params["batch_size"] = params["eval_batch_size"]
  batch_size = params["batch_size"]

  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, num_train_steps, num_eval_steps, producer = (
        ncf_common.get_inputs(params))
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    # Offline data: counts come from the serialized metadata file.
    # NOTE(review): tf.gfile is the TF1 API; later revisions use
    # tf.io.gfile.GFile.
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items
  (train_input_dataset, eval_input_dataset, num_train_steps,
   num_eval_steps) = (
       ncf_input_pipeline.create_ncf_input_data(params, producer,
                                                input_meta_data))
  # Online generation produces fresh samples per epoch, so Keras must not fix
  # steps_per_epoch in that case.
  steps_per_epoch = None if generate_input_online else num_train_steps

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

    if params["keras_use_ctl"]:
      # Custom training loop: SUM-reduced sparse CE, averaged manually over
      # the global batch (per-replica batch * num replicas).
      loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
          reduction="sum", from_logits=True)
      train_input_iterator = strategy.make_dataset_iterator(
          train_input_dataset)
      eval_input_iterator = strategy.make_dataset_iterator(
          eval_input_dataset)

      @tf.function
      def train_step():
        """Called once per step to train the model."""

        def step_fn(features):
          """Computes loss and applied gradient per replica."""
          with tf.GradientTape() as tape:
            softmax_logits = keras_model(features)
            labels = features[rconst.TRAIN_LABEL_KEY]
            loss = loss_object(
                labels,
                softmax_logits,
                sample_weight=features[rconst.VALID_POINT_MASK])
            loss *= (1.0 / (batch_size * strategy.num_replicas_in_sync))

          grads = tape.gradient(loss, keras_model.trainable_variables)
          # Converting gradients to dense form helps in perf on GPU for NCF
          grads = neumf_model.sparse_to_dense_grads(
              list(zip(grads, keras_model.trainable_variables)))
          optimizer.apply_gradients(grads)
          return loss

        per_replica_losses = strategy.experimental_run(
            step_fn, train_input_iterator)
        mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                    per_replica_losses,
                                    axis=None)
        return mean_loss

      @tf.function
      def eval_step():
        """Called once per eval step to compute eval metrics."""

        def step_fn(features):
          """Computes eval metrics per replica."""
          softmax_logits = keras_model(features)
          in_top_k, metric_weights = metric_fn(
              softmax_logits, features[rconst.DUPLICATE_MASK], params)
          hr_sum = tf.reduce_sum(in_top_k * metric_weights)
          hr_count = tf.reduce_sum(metric_weights)
          return hr_sum, hr_count

        per_replica_hr_sum, per_replica_hr_count = (
            strategy.experimental_run(step_fn, eval_input_iterator))
        hr_sum = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                 per_replica_hr_sum,
                                 axis=None)
        hr_count = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                   per_replica_hr_count,
                                   axis=None)
        return hr_sum, hr_count

      time_callback.on_train_begin()
      for epoch in range(FLAGS.train_epochs):
        for cb in callbacks:
          cb.on_epoch_begin(epoch)

        # As NCF dataset is sampled with randomness, not repeating
        # data elements in each epoch has significant impact on
        # convergence. As so, offline-generated TF record files
        # contains all epoch worth of data. Thus we do not need
        # to initialize dataset when reading from tf record files.
        if generate_input_online:
          train_input_iterator.initialize()

        train_loss = 0
        for step in range(num_train_steps):
          time_callback.on_batch_begin(step + epoch * num_train_steps)
          train_loss += train_step()
          time_callback.on_batch_end(step + epoch * num_train_steps)
        train_loss /= num_train_steps
        logging.info("Done training epoch %s, epoch loss=%s.", epoch + 1,
                     train_loss)
        # Eval iterator is re-initialized every epoch; hit-rate is the
        # weighted fraction of in-top-k predictions (hr_sum / hr_count).
        eval_input_iterator.initialize()
        hr_sum = 0
        hr_count = 0
        for _ in range(num_eval_steps):
          step_hr_sum, step_hr_count = eval_step()
          hr_sum += step_hr_sum
          hr_count += step_hr_count
        logging.info("Done eval epoch %s, hr=%s.", epoch + 1,
                     hr_sum / hr_count)

        if (FLAGS.early_stopping and
            float(hr_sum / hr_count) > params["hr_threshold"]):
          break

      time_callback.on_train_end()
      eval_results = [None, hr_sum / hr_count]

    else:
      with distribution_utils.get_strategy_scope(strategy):

        keras_model.compile(optimizer=optimizer,
                            run_eagerly=FLAGS.run_eagerly,
                            run_distributed=FLAGS.force_v2_in_keras_compile)

        history = keras_model.fit(train_input_dataset,
                                  epochs=FLAGS.train_epochs,
                                  steps_per_epoch=steps_per_epoch,
                                  callbacks=callbacks,
                                  validation_data=eval_input_dataset,
                                  validation_steps=num_eval_steps,
                                  verbose=2)

        logging.info("Training done. Start evaluating")

        eval_results = keras_model.evaluate(eval_input_dataset,
                                            steps=num_eval_steps,
                                            verbose=2)

        logging.info("Keras evaluation is done.")

      if history and history.history:
        train_history = history.history
        train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
def run_ncf(_):
  """Run NCF training and eval with Keras.

  Reads configuration from module-level FLAGS. Dispatches to
  run_ncf_custom_training when keras_use_ctl is set; otherwise uses Keras
  compile/fit with optional TensorBoard/checkpoint callbacks.

  Returns:
    Stats dict from build_stats, or None if TPU is requested without the
    custom training loop (error is logged rather than raised).
  """
  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    # NOTE(review): bare print; other status in this function uses logging —
    # consider logging.info for consistency.
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  model_helpers.apply_clean(FLAGS)

  if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
    # Keras-native mixed precision: global policy makes layers compute in
    # float16 with float32 variables.
    tf.keras.mixed_precision.set_global_policy("mixed_float16")

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  params = ncf_common.parse_flags(FLAGS)
  params["distribute_strategy"] = strategy
  params["use_tpu"] = (FLAGS.distribution_strategy == "tpu")

  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
    return

  batch_size = params["batch_size"]
  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    # Offline data: user/item counts come from the serialized metadata file.
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  (train_input_dataset, eval_input_dataset, num_train_steps,
   num_eval_steps) = (
       ncf_input_pipeline.create_ncf_input_data(params, producer,
                                                input_meta_data, strategy))
  # Online generation produces fresh samples per epoch, so Keras must not fix
  # steps_per_epoch in that case.
  steps_per_epoch = None if generate_input_online else num_train_steps

  with distribute_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])

    if FLAGS.fp16_implementation == "graph_rewrite":
      optimizer = \
        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer,
            loss_scale=flags_core.get_loss_scale(FLAGS,
                                                 default_for_fp16="dynamic"))
    elif FLAGS.dtype == "fp16":
      loss_scale = flags_core.get_loss_scale(FLAGS,
                                             default_for_fp16="dynamic")
      # Note Model.compile automatically wraps the optimizer with a
      # LossScaleOptimizer using dynamic loss scaling. We explicitly wrap it
      # here for the case where a custom training loop or fixed loss scale is
      # used.
      if loss_scale == "dynamic":
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(optimizer)
      else:
        optimizer = tf.keras.mixed_precision.LossScaleOptimizer(
            optimizer, dynamic=False, initial_scale=loss_scale)

    if params["keras_use_ctl"]:
      train_loss, eval_results = run_ncf_custom_training(
          params,
          strategy,
          keras_model,
          optimizer,
          callbacks,
          train_input_dataset,
          eval_input_dataset,
          num_train_steps,
          num_eval_steps,
          generate_input_online=generate_input_online)
    else:
      keras_model.compile(optimizer=optimizer,
                          run_eagerly=FLAGS.run_eagerly)

      if not FLAGS.ml_perf:
        # Create Tensorboard summary and checkpoint callbacks.
        summary_dir = os.path.join(FLAGS.model_dir, "summaries")
        summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
        checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True)
        callbacks += [summary_callback, checkpoint_callback]

      history = keras_model.fit(train_input_dataset,
                                epochs=FLAGS.train_epochs,
                                steps_per_epoch=steps_per_epoch,
                                callbacks=callbacks,
                                validation_data=eval_input_dataset,
                                validation_steps=num_eval_steps,
                                verbose=2)

      logging.info("Training done. Start evaluating")

      eval_loss_and_metrics = keras_model.evaluate(eval_input_dataset,
                                                   steps=num_eval_steps,
                                                   verbose=2)

      logging.info("Keras evaluation is done.")

      # Keras evaluate() API returns scalar loss and metric values from
      # evaluation as a list. Here, the returned list would contain
      # [evaluation loss, hr sum, hr count].
      eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

      # Format evaluation result into [eval loss, eval hit accuracy].
      eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

      if history and history.history:
        train_history = history.history
        train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Builds a distribution strategy from the parsed flags, constructs a
  `ResnetRunnable` under the strategy scope, and drives training (and optional
  evaluation) through `controller.Controller`.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.
      NOTE(review): no `raise` appears in this body — presumably one of the
      called helpers (e.g. `performance.set_mixed_precision_policy`) raises;
      confirm or drop this clause.

  Returns:
    Dictionary of training and eval stats.
  """
  # Configure eager/XLA session behavior before any TF ops run.
  keras_utils.set_session_config(enable_eager=flags_obj.enable_eager,
                                 enable_xla=flags_obj.enable_xla)
  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))

  # This only affects GPU.
  common.set_cudnn_batchnorm_mode()

  # TODO(anj-s): Set data_format without using Keras.
  # Default to channels_first on GPU (faster with cuDNN), channels_last
  # otherwise, unless the flag pins it explicitly.
  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.config.list_physical_devices('GPU') else
                   'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu)

  per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
      flags_obj)
  # Never let a single inner loop cross an epoch boundary.
  steps_per_loop = min(flags_obj.steps_per_loop, per_epoch_steps)

  logging.info(
      'Training %d epochs, each epoch has %d steps, '
      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
      train_epochs * per_epoch_steps, eval_steps)

  # TimeHistory measures examples/sec; it only writes summaries when
  # TensorBoard is enabled (logdir=None disables summary output).
  time_callback = keras_utils.TimeHistory(
      flags_obj.batch_size,
      flags_obj.log_steps,
      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
  with distribution_utils.get_strategy_scope(strategy):
    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
                                              per_epoch_steps)

  eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
  # checkpoint_interval=None disables periodic checkpointing in the manager.
  checkpoint_interval = (
      per_epoch_steps if flags_obj.enable_checkpoint_and_export else None)
  summary_interval = per_epoch_steps if flags_obj.enable_tensorboard else None

  checkpoint_manager = tf.train.CheckpointManager(
      runnable.checkpoint,
      directory=flags_obj.model_dir,
      max_to_keep=10,
      step_counter=runnable.global_step,
      checkpoint_interval=checkpoint_interval)

  resnet_controller = controller.Controller(
      strategy,
      runnable.train,
      runnable.evaluate,
      global_step=runnable.global_step,
      steps_per_loop=steps_per_loop,
      train_steps=per_epoch_steps * train_epochs,
      checkpoint_manager=checkpoint_manager,
      summary_interval=summary_interval,
      eval_steps=eval_steps,
      eval_interval=eval_interval)

  # Bracket the whole controller run so TimeHistory can compute wall time
  # and throughput for build_stats.
  time_callback.on_train_begin()
  resnet_controller.train(evaluate=not flags_obj.skip_eval)
  time_callback.on_train_end()

  stats = build_stats(runnable, time_callback)
  return stats
def run(flags_obj):
  """Run ResNet ImageNet training and eval loop using custom training loops.

  Builds a distribution strategy from the parsed flags, constructs a
  `ResnetRunnable` under the strategy scope, and drives training (and optional
  evaluation) through `orbit.Controller`.

  Args:
    flags_obj: An object containing parsed flag values.

  Raises:
    ValueError: If fp16 is passed as it is not currently supported.
      NOTE(review): no `raise` appears in this body — presumably one of the
      called helpers (e.g. `performance.set_mixed_precision_policy`) raises;
      confirm or drop this clause.

  Returns:
    Dictionary of training and eval stats.
  """
  keras_utils.set_session_config()
  performance.set_mixed_precision_policy(flags_core.get_tf_dtype(flags_obj))

  if tf.config.list_physical_devices('GPU'):
    # Optional GPU thread tuning; only applied when the mode flag is set.
    if flags_obj.tf_gpu_thread_mode:
      keras_utils.set_gpu_thread_mode_and_count(
          per_gpu_thread_count=flags_obj.per_gpu_thread_count,
          gpu_thread_mode=flags_obj.tf_gpu_thread_mode,
          num_gpus=flags_obj.num_gpus,
          datasets_num_private_threads=flags_obj.datasets_num_private_threads)
    common.set_cudnn_batchnorm_mode()

  # Default to channels_first on GPU (faster with cuDNN), channels_last
  # otherwise, unless the flag pins it explicitly.
  data_format = flags_obj.data_format
  if data_format is None:
    data_format = ('channels_first'
                   if tf.config.list_physical_devices('GPU') else
                   'channels_last')
  tf.keras.backend.set_image_data_format(data_format)

  strategy = distribute_utils.get_distribution_strategy(
      distribution_strategy=flags_obj.distribution_strategy,
      num_gpus=flags_obj.num_gpus,
      all_reduce_alg=flags_obj.all_reduce_alg,
      num_packs=flags_obj.num_packs,
      tpu_address=flags_obj.tpu)

  per_epoch_steps, train_epochs, eval_steps = get_num_train_iterations(
      flags_obj)
  # Resolve steps_per_loop: default to one epoch, and clamp so a single
  # inner loop never crosses an epoch boundary (with a warning).
  if flags_obj.steps_per_loop is None:
    steps_per_loop = per_epoch_steps
  elif flags_obj.steps_per_loop > per_epoch_steps:
    steps_per_loop = per_epoch_steps
    # BUGFIX: logging.warn is a deprecated alias (removed from the stdlib in
    # Python 3.13); use logging.warning instead.
    logging.warning('Setting steps_per_loop to %d to respect epoch boundary.',
                    steps_per_loop)
  else:
    steps_per_loop = flags_obj.steps_per_loop

  logging.info(
      'Training %d epochs, each epoch has %d steps, '
      'total steps: %d; Eval %d steps', train_epochs, per_epoch_steps,
      train_epochs * per_epoch_steps, eval_steps)

  # TimeHistory measures examples/sec; it only writes summaries when
  # TensorBoard is enabled (logdir=None disables summary output).
  time_callback = keras_utils.TimeHistory(
      flags_obj.batch_size,
      flags_obj.log_steps,
      logdir=flags_obj.model_dir if flags_obj.enable_tensorboard else None)
  with distribute_utils.get_strategy_scope(strategy):
    runnable = resnet_runnable.ResnetRunnable(flags_obj, time_callback,
                                              per_epoch_steps)

  eval_interval = flags_obj.epochs_between_evals * per_epoch_steps
  # checkpoint_interval=None disables periodic checkpointing in the manager.
  checkpoint_interval = (
      steps_per_loop * 5 if flags_obj.enable_checkpoint_and_export else None)
  summary_interval = steps_per_loop if flags_obj.enable_tensorboard else None

  checkpoint_manager = tf.train.CheckpointManager(
      runnable.checkpoint,
      directory=flags_obj.model_dir,
      max_to_keep=10,
      step_counter=runnable.global_step,
      checkpoint_interval=checkpoint_interval)

  resnet_controller = orbit.Controller(
      strategy=strategy,
      trainer=runnable,
      # No evaluator at all when eval is skipped.
      evaluator=runnable if not flags_obj.skip_eval else None,
      global_step=runnable.global_step,
      steps_per_loop=steps_per_loop,
      checkpoint_manager=checkpoint_manager,
      summary_interval=summary_interval,
      summary_dir=flags_obj.model_dir,
      eval_summary_dir=os.path.join(flags_obj.model_dir, 'eval'))

  # Bracket the whole controller run so TimeHistory can compute wall time
  # and throughput for build_stats.
  time_callback.on_train_begin()
  if not flags_obj.skip_eval:
    resnet_controller.train_and_evaluate(
        train_steps=per_epoch_steps * train_epochs,
        eval_steps=eval_steps,
        eval_interval=eval_interval)
  else:
    resnet_controller.train(steps=per_epoch_steps * train_epochs)
  time_callback.on_train_end()

  stats = build_stats(runnable, time_callback)
  return stats