# Example no. 1
# 0
def main(_):
    """Build the AmoebaNet estimator and run the job selected by FLAGS.mode.

    Supported modes are 'eval', 'train_and_eval', 'predict' and 'train'; any
    other value only logs 'Mode not found.'. After the mode-specific work, a
    saved model is exported when FLAGS.export_dir is set, optionally followed
    by writing inference warmup requests.

    Args:
        _: Unused positional argument (presumably argv from the app runner).
    """
    mode = FLAGS.mode
    data_dir = FLAGS.data_dir
    model_dir = FLAGS.model_dir
    hparams = build_hparams()

    estimator_params = {}

    # Round up so a trailing partial batch still counts as a training step.
    train_steps_per_epoch = int(
        math.ceil(hparams.num_train_images / float(hparams.train_batch_size)))
    # Floor division: evaluation only covers whole batches.
    eval_steps = hparams.num_eval_images // hparams.eval_batch_size
    eval_batch_size = (None if mode == 'train' else hparams.eval_batch_size)

    model = model_lib.AmoebaNetEstimatorModel(hparams, model_dir)

    if hparams.use_tpu:
        run_config = build_run_config()
        # Temporary treatment until flags are released.
        image_classifier = contrib_tpu.TPUEstimator(
            model_fn=model.model_fn,
            use_tpu=True,
            config=run_config,
            params=estimator_params,
            predict_batch_size=eval_batch_size,
            train_batch_size=hparams.train_batch_size,
            eval_batch_size=eval_batch_size,
            export_to_tpu=FLAGS.export_to_tpu)
    else:
        # Fall back to iterations_per_loop when no explicit checkpoint
        # interval was given.
        save_checkpoints_steps = (FLAGS.save_checkpoints_steps
                                  or FLAGS.iterations_per_loop)
        run_config = tf.estimator.RunConfig(
            model_dir=FLAGS.model_dir,
            save_checkpoints_steps=save_checkpoints_steps)
        image_classifier = tf.estimator.Estimator(model_fn=model.model_fn,
                                                  config=run_config,
                                                  params=estimator_params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = model_lib.InputPipeline(is_training=True,
                                             data_dir=data_dir,
                                             hparams=hparams)
    imagenet_eval = model_lib.InputPipeline(is_training=False,
                                            data_dir=data_dir,
                                            hparams=hparams)

    # The EMA-loading hook is only installed when a decay below 1 is set.
    if hparams.moving_average_decay < 1:
        eval_hooks = [
            model_lib.LoadEMAHook(model_dir, hparams.moving_average_decay)
        ]
    else:
        eval_hooks = []

    if mode == 'eval':
        for checkpoint in _get_next_checkpoint():
            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = image_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                tf.logging.info('Evaluation results: %s', eval_results)
            except tf.errors.NotFoundError:
                # Skip the checkpoint if it gets deleted prior to evaluation.
                tf.logging.info(
                    'Checkpoint %s no longer exists ... skipping', checkpoint)
    elif mode == 'train_and_eval':
        current_step = _load_global_step_from_checkpoint_dir(model_dir)
        total_steps = hparams.num_epochs * train_steps_per_epoch
        tf.logging.info('Starting training at step=%d.', current_step)
        train_steps_per_eval = int(hparams.num_epochs_per_eval *
                                   train_steps_per_epoch)
        # Final evaluation if training is already finished.
        if current_step >= total_steps:
            eval_results = image_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s', eval_results)
        # NOTE: every round trains a full train_steps_per_eval steps, so the
        # last round may run past total_steps.
        while current_step < total_steps:
            image_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=train_steps_per_eval)
            current_step += train_steps_per_eval
            tf.logging.info('Starting evaluation at step=%d.', current_step)
            eval_results = image_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s', eval_results)
    elif mode == 'predict':
        for checkpoint in _get_next_checkpoint():
            tf.logging.info('Starting prediction ...')
            time_hook = model_lib.SessionTimingHook()
            # Build a per-checkpoint hook list instead of appending to
            # eval_hooks, so timing hooks from earlier checkpoints do not
            # accumulate across iterations.
            result_iter = image_classifier.predict(
                input_fn=imagenet_eval.input_fn,
                hooks=eval_hooks + [time_hook],
                checkpoint_path=checkpoint,
                yield_single_examples=False)
            # Consume at most eval_steps batches of predictions.
            results = list(itertools.islice(result_iter, eval_steps))
            tf.logging.info('Inference speed = {} images per second.'.format(
                time_hook.compute_speed(len(results) * eval_batch_size)))
    elif mode == 'train':
        current_step = _load_global_step_from_checkpoint_dir(model_dir)
        total_step = int(hparams.num_epochs * train_steps_per_epoch)
        # Only train for the steps that remain after the loaded global step.
        if current_step < total_step:
            tf.logging.info('Starting training ...')
            image_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=total_step - current_step)
    else:
        tf.logging.info('Mode not found.')

    if FLAGS.export_dir is not None:
        tf.logging.info('Starting exporting saved model ...')
        serving_shape = [hparams.image_size, hparams.image_size, 3]
        export_path = image_classifier.export_saved_model(
            export_dir_base=FLAGS.export_dir,
            serving_input_receiver_fn=build_image_serving_input_receiver_fn(
                serving_shape),
            as_text=True)
        if FLAGS.add_warmup_requests:
            inference_warmup.write_warmup_requests(
                export_path,
                FLAGS.model_name,
                hparams.image_size,
                batch_sizes=FLAGS.inference_batch_sizes)
# Example no. 2
# 0
def main(_):
    """Build a (non-TPU) AmoebaNet estimator and run the FLAGS.mode job.

    Supported modes are 'eval', 'train_and_eval', 'predict' and 'train'; any
    other value only logs 'Mode not found.'. In 'train' mode the number of
    training steps is additionally capped by FLAGS.max_steps when it is set.

    Args:
        _: Unused positional argument (presumably argv from the app runner).
    """
    mode = FLAGS.mode
    data_dir = FLAGS.data_dir
    model_dir = FLAGS.model_dir
    hparams = build_hparams()

    estimator_params = {}

    # Round up so a trailing partial batch still counts as a training step.
    train_steps_per_epoch = int(
        math.ceil(hparams.num_train_images / float(hparams.train_batch_size)))
    # Floor division: evaluation only covers whole batches.
    eval_steps = hparams.num_eval_images // hparams.eval_batch_size
    eval_batch_size = (None if mode == 'train' else hparams.eval_batch_size)

    model = slice_model_lib.AmoebaNetEstimatorModel(hparams, model_dir)

    # Fall back to iterations_per_loop when no explicit checkpoint interval
    # was given.
    save_checkpoints_steps = (FLAGS.save_checkpoints_steps
                              or FLAGS.iterations_per_loop)
    prepare_tf_config()
    session_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=tf.GPUOptions(allow_growth=True))
    if FLAGS.cross_pipeline:
        # The manager is not used below; the reference is kept in case the
        # call's result must stay alive -- TODO confirm.
        cluster_manager = cluster_utils.get_cluster_manager(
            config_proto=session_config)
    # NOTE(review): model_dir is not passed to RunConfig/Estimator here, while
    # the step counter below is read from model_dir -- confirm the estimator
    # picks up the intended directory (e.g. via prepare_tf_config()).
    run_config = tf.estimator.RunConfig(
        log_step_count_steps=100,
        session_config=session_config,
        save_checkpoints_steps=save_checkpoints_steps)
    image_classifier = tf.estimator.Estimator(model_fn=model.model_fn,
                                              config=run_config,
                                              params=estimator_params)

    # Input pipelines are slightly different (with regards to shuffling and
    # preprocessing) between training and evaluation.
    imagenet_train = model_lib.InputPipeline(is_training=True,
                                             data_dir=data_dir,
                                             hparams=hparams)
    imagenet_eval = model_lib.InputPipeline(is_training=False,
                                            data_dir=data_dir,
                                            hparams=hparams)

    # The EMA-loading hook is only installed when a decay below 1 is set.
    if hparams.moving_average_decay < 1:
        eval_hooks = [
            model_lib.LoadEMAHook(model_dir, hparams.moving_average_decay)
        ]
    else:
        eval_hooks = []

    if mode == 'eval':
        for checkpoint in _get_next_checkpoint():
            tf.logging.info('Starting to evaluate.')
            try:
                eval_results = image_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    hooks=eval_hooks,
                    checkpoint_path=checkpoint)
                tf.logging.info('Evaluation results: %s', eval_results)
            except tf.errors.NotFoundError:
                # Skip the checkpoint if it gets deleted prior to evaluation.
                tf.logging.info(
                    'Checkpoint %s no longer exists ... skipping', checkpoint)
    elif mode == 'train_and_eval':
        current_step = _load_global_step_from_checkpoint_dir(model_dir)
        total_steps = hparams.num_epochs * train_steps_per_epoch
        tf.logging.info('Starting training at step=%d.', current_step)
        train_steps_per_eval = int(hparams.num_epochs_per_eval *
                                   train_steps_per_epoch)
        # Final evaluation if training is already finished.
        if current_step >= total_steps:
            eval_results = image_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s', eval_results)
        # NOTE: every round trains a full train_steps_per_eval steps, so the
        # last round may run past total_steps.
        while current_step < total_steps:
            image_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=train_steps_per_eval)
            current_step += train_steps_per_eval
            tf.logging.info('Starting evaluation at step=%d.', current_step)
            eval_results = image_classifier.evaluate(
                input_fn=imagenet_eval.input_fn,
                steps=eval_steps,
                hooks=eval_hooks)
            tf.logging.info('Evaluation results: %s', eval_results)
    elif mode == 'predict':
        for checkpoint in _get_next_checkpoint():
            tf.logging.info('Starting prediction ...')
            time_hook = model_lib.SessionTimingHook()
            # Build a per-checkpoint hook list instead of appending to
            # eval_hooks, so timing hooks from earlier checkpoints do not
            # accumulate across iterations.
            result_iter = image_classifier.predict(
                input_fn=imagenet_eval.input_fn,
                hooks=eval_hooks + [time_hook],
                checkpoint_path=checkpoint,
                yield_single_examples=False)
            # Consume at most eval_steps batches of predictions.
            results = list(itertools.islice(result_iter, eval_steps))
            tf.logging.info('Inference speed = {} images per second.'.format(
                time_hook.compute_speed(len(results) * eval_batch_size)))
    elif mode == 'train':
        current_step = _load_global_step_from_checkpoint_dir(model_dir)
        total_step = int(hparams.num_epochs * train_steps_per_epoch)
        if current_step < total_step:
            tf.logging.info('Starting training ...')
            remaining_steps = total_step - current_step
            # Cap by FLAGS.max_steps only when it is set; min() against None
            # would raise TypeError on Python 3.
            if FLAGS.max_steps is not None:
                remaining_steps = min(remaining_steps, FLAGS.max_steps)
            image_classifier.train(input_fn=imagenet_train.input_fn,
                                   steps=remaining_steps)
    else:
        tf.logging.info('Mode not found.')