def input_fn_train():
  return input_function(
      is_training=True,
      data_dir=flags_obj.data_dir,
      batch_size=distribution_utils.per_device_batch_size(
          flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
      num_epochs=flags_obj.train_epochs,
      num_gpus=flags_core.get_num_gpus(flags_obj),
      dtype=flags_core.get_tf_dtype(flags_obj))
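# Every snippet in this file funnels the global batch size through
# distribution_utils.per_device_batch_size. A minimal sketch of what that
# helper is assumed to do (the name per_device_batch_size_sketch is ours):
# the global batch must divide evenly across GPUs, otherwise per-device
# batches would differ and the effective batch size would silently change.
def per_device_batch_size_sketch(batch_size, num_gpus):
  """Returns the per-GPU batch size for a given global batch size."""
  if num_gpus <= 1:
    return batch_size
  if batch_size % num_gpus:
    raise ValueError(
        'Batch size must be a multiple of the number of available GPUs; '
        'got batch_size={} for num_gpus={}.'.format(batch_size, num_gpus))
  return batch_size // num_gpus

# For example, batch_size=64 with num_gpus=4 yields 16 examples per device.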
def test(_):
  tf.enable_eager_execution()
  flag_obj = define_coco_flags()
  cocodataset = coco_dataset.CocoDataset()
  cocodataset.load_coco('/home/hume/Deep-learning/dataset/coco', 'train',
                        DEFAULT_DATASET_YEAR)
  cocodataset.prepare()
  augmentation = imgaug.augmenters.Fliplr(0.5)
  input_iter = input_fn(
      cocodataset,
      is_training=True,
      batch_size=distribution_utils.per_device_batch_size(
          flag_obj.batch_size, flags_core.get_num_gpus(flag_obj)),
      anchors_path=flag_obj.anchors_path,
      num_epochs=flag_obj.train_epochs,
      dtype=tf.float32,
      max_num_boxes_per_image=flag_obj.max_num_boxes_per_image,
      image_size=flag_obj.image_size,
      augmentation=augmentation,
      num_parallel_batches=flag_obj.datasets_num_parallel_batches,
      datasets_num_private_threads=multiprocessing.cpu_count() - 3)
  coco_iter = input_iter.make_one_shot_iterator()
  starttime = time()
  imgs, y_gt = coco_iter.get_next()
  print('cost {}ms\n'.format((time() - starttime) * 1000))
  print(imgs.shape)
  print(y_gt.shape)
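# test() above times only the first get_next() call, which includes one-off
# pipeline startup cost. A sketch (hypothetical helper, same `from time
# import time` assumed, eager mode as enabled in test()) that averages over
# several batches gives a steadier throughput number:
def benchmark_input_pipeline(iterator, num_batches=10):
  """Times `num_batches` get_next() calls and reports the mean latency."""
  start = time()
  for _ in range(num_batches):
    iterator.get_next()
  elapsed_ms = (time() - start) * 1000
  print('mean cost {:.1f}ms over {} batches'.format(
      elapsed_ms / num_batches, num_batches))

# Usage, continuing from test(): benchmark_input_pipeline(coco_iter)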
def input_fn_train(num_epochs):
  return input_function(
      data_set=dataset,
      is_training=True,
      batch_size=distribution_utils.per_device_batch_size(
          flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
      anchors_path=flags_obj.anchors_path,
      num_epochs=num_epochs,
      augmentation=augmentation,
      dtype=tf.float32,
      max_num_boxes_per_image=flags_obj.max_num_boxes_per_image,
      image_size=flags_obj.image_size,
      datasets_num_private_threads=flags_obj.datasets_num_private_threads,
      num_parallel_batches=flags_obj.datasets_num_parallel_batches)
def construct_estimator(flags_obj, params, schedule_manager):
  """Construct an estimator from either Estimator or TPUEstimator.

  Args:
    flags_obj: The FLAGS object parsed from command line.
    params: A dict of run specific parameters.
    schedule_manager: A schedule.Manager object containing the run schedule.

  Returns:
    An estimator object to be used for training and eval.
  """
  if not params["use_tpu"]:
    distribution_strategy = distribution_utils.get_distribution_strategy(
        flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)
    return tf.estimator.Estimator(
        model_fn=model_fn, model_dir=flags_obj.model_dir, params=params,
        config=tf.estimator.RunConfig(train_distribute=distribution_strategy))

  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
      tpu=flags_obj.tpu,
      zone=flags_obj.tpu_zone,
      project=flags_obj.tpu_gcp_project)

  tpu_config = tf.contrib.tpu.TPUConfig(
      iterations_per_loop=schedule_manager.single_iteration_train_steps,
      num_shards=flags_obj.num_tpu_shards)

  run_config = tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=flags_obj.model_dir,
      session_config=tf.ConfigProto(
          allow_soft_placement=True, log_device_placement=True),
      tpu_config=tpu_config)

  return tf.contrib.tpu.TPUEstimator(
      model_fn=model_fn,
      use_tpu=params["use_tpu"] and flags_obj.tpu != tpu_util.LOCAL,
      train_batch_size=schedule_manager.batch_size,
      eval_batch_size=schedule_manager.batch_size,
      params={
          # TPUEstimator needs to populate batch_size itself due to sharding.
          key: value for key, value in params.items() if key != "batch_size"},
      config=run_config)
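# TPUEstimator insists on owning `batch_size`, so construct_estimator strips
# it from params with a dict comprehension before passing them through. The
# same idiom in isolation (the example values here are arbitrary):
params = {'batch_size': 2048, 'hidden_size': 512, 'num_heads': 8}
tpu_params = {k: v for k, v in params.items() if k != 'batch_size'}
assert 'batch_size' not in tpu_params
assert tpu_params['hidden_size'] == 512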
def __init__(self):
  anchors = utils.get_anchors(flags.FLAGS.anchors_path)
  num_anchors = len(anchors)
  anchors = np.array(anchors, dtype=np.float32)
  super(CocoModel, self).__init__(
      image_size=flags.FLAGS.image_size,
      image_channels=flags.FLAGS.image_channels,
      num_classes=flags.FLAGS.num_classes,
      anchors=anchors,
      batch_size=distribution_utils.per_device_batch_size(
          flags.FLAGS.batch_size, flags_core.get_num_gpus(flags.FLAGS)),
      num_anchors=num_anchors,
      learning_rate=flags.FLAGS.learning_rate,
      backbone=flags.FLAGS.backbone,
      norm=flags.FLAGS.norm,
      threshold=flags.FLAGS.threshold,
      max_num_boxes_per_image=flags.FLAGS.max_num_boxes_per_image,
      confidence_score=flags.FLAGS.confidence_score,
      data_format=flags.FLAGS.data_format,
      dtype=flags_core.get_tf_dtype(flags.FLAGS))
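# utils.get_anchors is assumed here to parse a YOLO-style anchors file: one
# line of comma-separated values read as (width, height) pairs. A sketch of
# such a parser (the file format is an assumption, not verified against the
# actual utils module; get_anchors_sketch is our name):
import numpy as np

def get_anchors_sketch(anchors_path):
  """Reads 'w1,h1, w2,h2, ...' and returns an (N, 2) float32 array."""
  with open(anchors_path) as f:
    values = [float(x) for x in f.readline().split(',')]
  return np.array(values, dtype=np.float32).reshape(-1, 2)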
def run_transformer(flags_obj):
  """Create tf.Estimator to train and evaluate transformer model.

  Args:
    flags_obj: Object containing parsed flag values.
  """
  num_gpus = flags_core.get_num_gpus(flags_obj)

  # Add flag-defined parameters to params object
  params = PARAMS_MAP[flags_obj.param_set]
  if num_gpus > 1:
    if flags_obj.param_set == "big":
      params = model_params.BIG_MULTI_GPU_PARAMS
    elif flags_obj.param_set == "base":
      params = model_params.BASE_MULTI_GPU_PARAMS

  params["data_dir"] = flags_obj.data_dir
  params["model_dir"] = flags_obj.model_dir
  params["num_parallel_calls"] = flags_obj.num_parallel_calls

  params["tpu"] = flags_obj.tpu
  params["use_tpu"] = bool(flags_obj.tpu)  # was a tpu specified.
  params["static_batch"] = flags_obj.static_batch or params["use_tpu"]
  params["allow_ffn_pad"] = not params["use_tpu"]

  params["use_synthetic_data"] = flags_obj.use_synthetic_data

  # Set batch size parameter, which depends on the availability of
  # TPU and GPU, and distribution settings.
  params["batch_size"] = (flags_obj.batch_size or (
      params["default_batch_size_tpu"] if params["use_tpu"]
      else params["default_batch_size"]))

  if not params["use_tpu"]:
    params["batch_size"] = distribution_utils.per_device_batch_size(
        params["batch_size"], num_gpus)

  schedule_manager = schedule.Manager(
      train_steps=flags_obj.train_steps,
      steps_between_evals=flags_obj.steps_between_evals,
      train_epochs=flags_obj.train_epochs,
      epochs_between_evals=flags_obj.epochs_between_evals,
      default_train_epochs=DEFAULT_TRAIN_EPOCHS,
      batch_size=params["batch_size"],
      max_length=params["max_length"],
      use_tpu=params["use_tpu"],
      num_tpu_shards=flags_obj.num_tpu_shards)

  params["repeat_dataset"] = schedule_manager.repeat_dataset

  model_helpers.apply_clean(flags.FLAGS)

  # Create hooks that log information about the training and metric values
  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      tensors_to_log=TENSORS_TO_LOG,  # used for logging hooks
      batch_size=schedule_manager.batch_size,  # for ExamplesPerSecondHook
      use_tpu=params["use_tpu"])  # Not all hooks can run with TPUs

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info(
      model_name="transformer",
      dataset_name="wmt_translate_ende",
      run_params=params,
      test_id=flags_obj.benchmark_test_id)

  # Train and evaluate transformer model
  estimator = construct_estimator(flags_obj, params, schedule_manager)
  run_loop(
      estimator=estimator,
      # Training arguments
      schedule_manager=schedule_manager,
      train_hooks=train_hooks,
      benchmark_logger=benchmark_logger,
      # BLEU calculation arguments
      bleu_source=flags_obj.bleu_source,
      bleu_ref=flags_obj.bleu_ref,
      bleu_threshold=flags_obj.stop_threshold,
      vocab_file=flags_obj.vocab_file)

  if flags_obj.export_dir and not params["use_tpu"]:
    serving_input_fn = export.build_tensor_serving_input_receiver_fn(
        shape=[None], dtype=tf.int64, batch_size=None)
    # Export saved model, and save the vocab file as an extra asset. The vocab
    # file is saved to allow consistent input encoding and output decoding.
    # (See the "Export trained model" section in the README for an example of
    # how to use the vocab file.)
    # Since the model itself does not use the vocab file, this file is saved
    # as an extra asset rather than a core asset.
    estimator.export_savedmodel(
        flags_obj.export_dir, serving_input_fn,
        assets_extra={"vocab.txt": flags_obj.vocab_file})
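# The batch-size resolution in run_transformer, reduced to a pure function
# for clarity (our own sketch, same precedence: explicit flag, then the TPU
# or GPU default, then division across devices when not on TPU; the real
# per_device_batch_size additionally requires the batch to divide evenly):
def resolve_batch_size(flag_value, use_tpu, default_tpu, default_gpu,
                       num_gpus):
  batch_size = flag_value or (default_tpu if use_tpu else default_gpu)
  if not use_tpu and num_gpus > 1:
    batch_size //= num_gpus  # per-device share of the global batch
  return batch_size

# e.g. resolve_batch_size(None, False, 32768, 4096, 2) returns 2048.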
def resnet_main(flags_obj, model_function, input_function, dataset_name,
                shape=None):
  """Shared main loop for ResNet Models.

  Args:
    flags_obj: An object containing parsed flags. See define_resnet_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with all
      the relevant flags for running and passed to estimator.
    dataset_name: the name of the dataset for training and evaluation. This
      is used for logging purpose.
    shape: list of ints representing the shape of the images used for
      training. This is only used if flags_obj.export_dir is passed.
  """

  model_helpers.apply_clean(flags.FLAGS)

  # Using the Winograd non-fused algorithms provides a small performance
  # boost.
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Create session config based on values of inter_op_parallelism_threads and
  # intra_op_parallelism_threads. Note that we default to having
  # allow_soft_placement = True, which is required for multi-GPU and not
  # harmful for other modes.
  session_config = tf.ConfigProto(
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)

  distribution_strategy = None
  if flags_obj.distribution_strategy == 'ps':
    print('==> Using ParameterServerStrategy')
    distribution_strategy = tf.contrib.distribute.ParameterServerStrategy(
        num_gpus_per_worker=flags_core.get_num_gpus(flags_obj))
  elif flags_obj.distribution_strategy == 'allreduce':
    print('==> Using CollectiveAllReduceStrategy')
    distribution_strategy = tf.contrib.distribute.CollectiveAllReduceStrategy(
        num_gpus_per_worker=flags_core.get_num_gpus(flags_obj))
  elif flags_obj.distribution_strategy == 'mirror':
    print('==> Using MirroredStrategy')
    distribution_strategy = tf.contrib.distribute.MirroredStrategy(
        num_gpus_per_worker=flags_core.get_num_gpus(flags_obj))
  else:
    print("==> Distribution Strategy {} is not valid".format(
        flags_obj.distribution_strategy))

  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      protocol="grpc+verbs",
      log_step_count_steps=1000)

  tf.logging.info("num_worker={}, batch_size={}, train_epochs={}".format(
      run_config.num_worker_replicas, flags_obj.batch_size,
      flags_obj.train_epochs))

  # initialize our model with all but the dense layer from pretrained resnet
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  classifier = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir,
      config=run_config, warm_start_from=warm_start_settings,
      params={
          'resnet_size': int(flags_obj.resnet_size),
          'data_format': flags_obj.data_format,
          'batch_size': flags_obj.batch_size,
          'resnet_version': int(flags_obj.resnet_version),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'num_workers': run_config.num_worker_replicas,
      })

  run_params = {
      'batch_size': flags_obj.batch_size * run_config.num_worker_replicas,
      'dtype': flags_core.get_tf_dtype(flags_obj),
      'resnet_size': flags_obj.resnet_size,
      'resnet_version': flags_obj.resnet_version,
      'synthetic_data': flags_obj.use_synthetic_data,
      'train_epochs': flags_obj.train_epochs,
  }
  if flags_obj.use_synthetic_data:
    dataset_name = dataset_name + '-synthetic'

  benchmark_logger = logger.get_benchmark_logger()
  benchmark_logger.log_run_info('resnet', dataset_name, run_params,
                                test_id=flags_obj.benchmark_test_id)

  train_hooks = hooks_helper.get_train_hooks(
      flags_obj.hooks,
      model_dir=flags_obj.model_dir,
      batch_size=flags_obj.batch_size)

  def input_fn_train():
    return input_function(
        is_training=True,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=flags_obj.train_epochs,
        num_gpus=flags_core.get_num_gpus(flags_obj),
        dtype=flags_core.get_tf_dtype(flags_obj))

  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))

  train_spec = tf.estimator.TrainSpec(input_fn=input_fn_train,
                                      hooks=train_hooks)
  eval_spec = tf.estimator.EvalSpec(input_fn=input_fn_eval,
                                    throttle_secs=1800,
                                    steps=None,
                                    start_delay_secs=10)

  if flags_obj.eval == 0:
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
  else:
    while True:
      eval_results = classifier.evaluate(
          input_fn=input_fn_eval, steps=50000 // flags_obj.batch_size)
      time.sleep(flags_obj.eval)

  '''
  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  else:
    # Compute the number of times to loop while training. All but the last
    # pass will train for `epochs_between_evals` epochs, while the last will
    # train for the number needed to reach `training_epochs`. For instance if
    #   train_epochs = 25 and epochs_between_evals = 10
    # schedule will be set to [10, 10, 5]. That is to say, the loop will:
    #   Train for 10 epochs and then evaluate.
    #   Train for another 10 epochs and then evaluate.
    #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
    n_loops = math.ceil(flags_obj.train_epochs /
                        flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # over counting.

  for cycle_index, num_train_epochs in enumerate(schedule):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      classifier.train(input_fn=lambda: input_fn_train(num_train_epochs),
                       hooks=train_hooks, max_steps=flags_obj.max_train_steps)

    tf.logging.info('Starting to evaluate.')

    # flags_obj.max_train_steps is generally associated with testing and
    # profiling. As a result it is frequently called with synthetic data,
    # which will iterate forever. Passing steps=flags_obj.max_train_steps
    # allows the eval (which is generally unimportant in those circumstances)
    # to terminate. Note that eval will run for max_train_steps each loop,
    # regardless of the global_step count.
    eval_results = classifier.evaluate(input_fn=input_fn_eval,
                                       steps=flags_obj.max_train_steps)
    benchmark_logger.log_evaluation_result(eval_results)

    if model_helpers.past_stop_threshold(
        flags_obj.stop_threshold, eval_results['accuracy']):
      break
  '''

  if flags_obj.export_dir is not None:
    # Exports a saved model for the given classifier.
    if flags_obj.image_bytes_as_serving_input:
      input_receiver_fn = functools.partial(image_bytes_serving_input_fn,
                                            shape)
    else:
      input_receiver_fn = export.build_tensor_serving_input_receiver_fn(
          shape, batch_size=flags_obj.batch_size)
    classifier.export_savedmodel(flags_obj.export_dir, input_receiver_fn)
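# The warm-start regex '^(?!.*dense)' used above relies on a negative
# lookahead: it matches every variable whose name does not contain "dense",
# i.e. everything except the final classification layer. Checking the
# pattern in isolation with example variable names:
import re

pattern = re.compile(r'^(?!.*dense)')
assert pattern.match('resnet_model/conv2d/kernel')      # warm-started
assert not pattern.match('resnet_model/dense/kernel')   # trained from scratch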
def yolo_main(flags_obj, model_function, input_function, dataset,
              augmentation):
  """Shared main loop for yolo Models.

  Args:
    flags_obj: An object containing parsed flags. See define_yolo_flags()
      for details.
    model_function: the function that instantiates the Model and builds the
      ops for train/eval. This will be passed directly into the estimator.
    input_function: the function that processes the dataset and returns a
      dataset that the estimator can train on. This will be wrapped with all
      the relevant flags for running and passed to estimator.
    dataset: A dataset for training and evaluation.
    augmentation: Optional. An imgaug (https://github.com/aleju/imgaug)
      augmentation. For example, passing imgaug.augmenters.Fliplr(0.5)
      flips images right/left 50% of the time.
  """
  model_helpers.apply_clean(flags_obj)

  # Ensures flag override logic is only executed if explicitly triggered.
  if flags_obj.tf_gpu_thread_mode:
    override_flags_and_set_envars_for_gpu_thread_pool(flags_obj)

  # Creates session config. allow_soft_placement = True, is required for
  # multi-GPU and is not harmful for other modes.
  session_config = tf.ConfigProto(
      log_device_placement=True,
      inter_op_parallelism_threads=flags_obj.inter_op_parallelism_threads,
      intra_op_parallelism_threads=flags_obj.intra_op_parallelism_threads,
      allow_soft_placement=True)
  session_config.gpu_options.allow_growth = True

  distribution_strategy = distribution_utils.get_distribution_strategy(
      flags_core.get_num_gpus(flags_obj), flags_obj.all_reduce_alg)

  run_config = tf.estimator.RunConfig(
      train_distribute=distribution_strategy,
      session_config=session_config,
      save_checkpoints_secs=60 * 60 * 24)

  # Initializes model with all but the dense layer from pretrained ResNet.
  if flags_obj.pretrained_model_checkpoint_path is not None:
    warm_start_settings = tf.estimator.WarmStartSettings(
        flags_obj.pretrained_model_checkpoint_path,
        vars_to_warm_start='^(?!.*dense)')
  else:
    warm_start_settings = None

  anchors = np.array(utils.get_anchors(flags_obj.anchors_path))

  detector = tf.estimator.Estimator(
      model_fn=model_function, model_dir=flags_obj.model_dir,
      config=run_config, warm_start_from=warm_start_settings,
      params={
          'num_classes': flags_obj.num_classes,
          'data_format': flags_obj.data_format,
          'batch_size': distribution_utils.per_device_batch_size(
              flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
          'image_size': int(flags_obj.image_size),
          'loss_scale': flags_core.get_loss_scale(flags_obj),
          'dtype': flags_core.get_tf_dtype(flags_obj),
          'fine_tune': flags_obj.fine_tune,
          'anchors': anchors,
          'num_anchors': len(anchors),
          'max_num_boxes_per_image': flags_obj.max_num_boxes_per_image,
          'threshold': flags_obj.threshold,
          'train': dataset.num_images,
          'learning_rate': flags_obj.learning_rate,
      })

  # if flags_obj.use_synthetic_data:
  #   dataset_name = dataset_name + '-synthetic'

  def input_fn_train(num_epochs):
    return input_function(
        data_set=dataset,
        is_training=True,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        anchors_path=flags_obj.anchors_path,
        num_epochs=num_epochs,
        augmentation=augmentation,
        dtype=tf.float32,
        max_num_boxes_per_image=flags_obj.max_num_boxes_per_image,
        image_size=flags_obj.image_size,
        datasets_num_private_threads=flags_obj.datasets_num_private_threads,
        num_parallel_batches=flags_obj.datasets_num_parallel_batches)

  '''
  def input_fn_eval():
    return input_function(
        is_training=False,
        data_dir=flags_obj.data_dir,
        batch_size=distribution_utils.per_device_batch_size(
            flags_obj.batch_size, flags_core.get_num_gpus(flags_obj)),
        num_epochs=1,
        dtype=flags_core.get_tf_dtype(flags_obj))
  '''

  if flags_obj.eval_only or not flags_obj.train_epochs:
    # If --eval_only is set, perform a single loop with zero train epochs.
    schedule, n_loops = [0], 1
  else:
    # Compute the number of times to loop while training. All but the last
    # pass will train for `epochs_between_evals` epochs, while the last will
    # train for the number needed to reach `training_epochs`. For instance if
    #   train_epochs = 25 and epochs_between_evals = 10
    # schedule will be set to [10, 10, 5]. That is to say, the loop will:
    #   Train for 10 epochs and then evaluate.
    #   Train for another 10 epochs and then evaluate.
    #   Train for a final 5 epochs (to reach 25 epochs) and then evaluate.
    n_loops = math.ceil(flags_obj.train_epochs /
                        flags_obj.epochs_between_evals)
    schedule = [flags_obj.epochs_between_evals for _ in range(int(n_loops))]
    schedule[-1] = flags_obj.train_epochs - sum(schedule[:-1])  # over counting.

  for cycle_index, num_train_epochs in enumerate(schedule):
    tf.logging.info('Starting cycle: %d/%d', cycle_index, int(n_loops))

    if num_train_epochs:
      detector.train(input_fn=lambda: input_fn_train(num_train_epochs),
                     max_steps=flags_obj.max_train_steps)
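# The epoch schedule computed in yolo_main, extracted as a pure function so
# the [10, 10, 5] example from the comment can be checked directly (our own
# sketch, same arithmetic as the loop above):
import math

def epoch_schedule(train_epochs, epochs_between_evals):
  """Splits train_epochs into chunks of at most epochs_between_evals."""
  n_loops = math.ceil(train_epochs / epochs_between_evals)
  schedule = [epochs_between_evals] * int(n_loops)
  schedule[-1] = train_epochs - sum(schedule[:-1])  # trim the over counting.
  return schedule

assert epoch_schedule(25, 10) == [10, 10, 5]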