Example #1
def create_session_config(log_device_placement=False,
                          enable_graph_rewriter=False,
                          gpu_mem_fraction=0.95,
                          use_tpu=False,
                          xla_jit_level=tf.OptimizerOptions.OFF,
                          inter_op_parallelism_threads=0,
                          intra_op_parallelism_threads=0):
    """The TensorFlow Session config to use."""
    if use_tpu:
        graph_options = tf.GraphOptions()
    else:
        if enable_graph_rewriter:
            rewrite_options = rewriter_config_pb2.RewriterConfig()
            rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.ON
            graph_options = tf.GraphOptions(rewrite_options=rewrite_options)
        else:
            graph_options = tf.GraphOptions(
                optimizer_options=tf.OptimizerOptions(
                    opt_level=tf.OptimizerOptions.L1,
                    do_function_inlining=False,
                    global_jit_level=xla_jit_level))

    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=gpu_mem_fraction)

    config = tf.ConfigProto(
        allow_soft_placement=True,
        graph_options=graph_options,
        gpu_options=gpu_options,
        log_device_placement=log_device_placement,
        inter_op_parallelism_threads=inter_op_parallelism_threads,
        intra_op_parallelism_threads=intra_op_parallelism_threads,
        isolate_session_state=True)
    return config
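A minimal usage sketch for the helper above (hedged: assumes TF 1.x-style APIs; the two imports are what the snippet relies on but does not show):

import tensorflow.compat.v1 as tf
from tensorflow.core.protobuf import rewriter_config_pb2

config = create_session_config(gpu_mem_fraction=0.9,
                               enable_graph_rewriter=True)
with tf.Session(config=config) as sess:
    print(sess.run(tf.constant(1) + tf.constant(1)))  # sanity check: prints 2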
Example #2
def main():
    tf.disable_eager_execution()

    with tf.device('/gpu:0'):
        t1 = tf.random.uniform(shape=[32, 56, 56, 64], dtype=tf.half)
        t2 = tf.random.uniform(shape=[3, 3, 64, 64], dtype=tf.half)
        t = tf.nn.conv2d(input=t1,
                         filters=t2,
                         strides=[2, 2],
                         padding='SAME',
                         data_format='NHWC',
                         name='Conv2D')

    run_options = tf.RunOptions()
    run_options.trace_level = run_options.FULL_TRACE
    run_metadata = tf.RunMetadata()

    options = tf.GraphOptions(build_cost_model=1)
    cfg = tf.ConfigProto(graph_options=options)
    with tf.Session(config=cfg) as sess:
        sess.run(tf.global_variables_initializer())
        _ = sess.run([t], options=run_options, run_metadata=run_metadata)

    for node in run_metadata.cost_graph.node:
        if node.name == 'Conv2D':
            print(node.name, ':', node.compute_cost * 1000, 'ns.')
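The cost graph above is populated only because build_cost_model=1 is set in GraphOptions. A small follow-on sketch (hedged: reuses the run_metadata from the run above and treats compute_cost the same way the example does):

total_cost = sum(n.compute_cost for n in run_metadata.cost_graph.node)
print('estimated cost across all nodes:', total_cost)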
Example #3
def build_and_export_tpu(model_path, export_model_path, master):
    export_graph = tf.Graph()
    tpu_config = tf.ConfigProto(
        operation_timeout_in_ms=600 * 1000,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True)),
        isolate_session_state=True)

    export_sess = tf.Session(master, graph=export_graph, config=tpu_config)
    if FLAGS.enable_bf16:
        if tf.io.gfile.exists(export_model_path + '_pre_bf16'):
            tf.io.gfile.rmtree(export_model_path + '_pre_bf16')
        builder = tf.compat.v1.saved_model.Builder(export_model_path +
                                                   '_pre_bf16')
    else:
        if tf.io.gfile.exists(export_model_path):
            tf.io.gfile.rmtree(export_model_path)
        builder = tf.compat.v1.saved_model.Builder(export_model_path)

    with export_graph.as_default():
        features, _ = get_inference_input()
        policy_output, value_output = tpu_call(tf.reshape(features, [-1]))
        tf.train.init_from_checkpoint(model_path, {'/': '/'})
        export_sess.run(tf.initializers.global_variables())
        signature_def_map = {
            tf.saved_model.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                tf.saved_model.predict_signature_def(
                    inputs={'features': features},
                    outputs={
                        'policy_output': policy_output,
                        'value_output': value_output,
                    })
        }
        builder.add_meta_graph_and_variables(
            export_sess,
            tags=[
                tf.saved_model.tag_constants.SERVING,
                tf.saved_model.tag_constants.TPU
            ],
            signature_def_map=signature_def_map)
        tf.logging.info('graph saved.')
    builder.save()
    if FLAGS.enable_bf16:
        tf.logging.info('Convert to BF16')
        options = converter_options_pb2.ConverterOptions()
        options.disable_convert = True
        converter_cli.ConvertSavedModel(export_model_path + '_pre_bf16',
                                        export_model_path,
                                        overwrite=True,
                                        options=options)
        tf.logging.info('BF16 Conversion Complete')
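A hedged sketch of loading the SavedModel exported above (assumes TF 1.x; the tag set passed to the loader must match the [SERVING, TPU] tags used at export time):

import tensorflow.compat.v1 as tf

with tf.Session(graph=tf.Graph()) as sess:
    tf.saved_model.loader.load(
        sess,
        [tf.saved_model.tag_constants.SERVING,
         tf.saved_model.tag_constants.TPU],
        export_model_path)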
Example #4
def get_session(params, isolate_session_state=True):
    """Builds and returns a `tf.Session`."""
    config = tf.ConfigProto(
        isolate_session_state=isolate_session_state,
        allow_soft_placement=True,
        graph_options=tf.GraphOptions(optimizer_options=tf.OptimizerOptions(
            opt_level=tf.OptimizerOptions.L0,
            do_common_subexpression_elimination=False,
            do_function_inlining=False,
            do_constant_folding=False)))
    return tf.Session(target=params.master, config=config)
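A one-line usage sketch for the helper above (hedged: params.master is whatever session target the caller configures, e.g. a grpc:// address, or '' for in-process):

sess = get_session(params, isolate_session_state=False)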
Example #5
    def __init__(self,
                 hparams,
                 train_iterations,
                 eval_steps,
                 per_host_v1=False):
        tf.logging.info("TrainLowLevelRunner: constructor")

        self.feature_structure = {}
        self.eval_feature_structure = {}
        self.loss = None
        self.infeed_queue = []
        self.eval_infeed_queue = []
        self.enqueue_ops = []
        self.eval_enqueue_ops = []
        self.dataset_initializer = []
        self.eval_dataset_initializer = []
        self.is_local = ((hparams.master == "") and (hparams.tpu_name is None))
        self.per_host_v1 = per_host_v1
        self.iterations = train_iterations
        self.eval_steps = eval_steps
        self.outfeed_tensors = []
        self.outfeed_names = []
        self.dequeue_ops = []
        self.predictions = {}
        self.sess = None
        self.graph = tf.Graph()
        self.hparams = hparams
        self.num_hosts = hparams.num_shards // hparams.num_shards_per_host
        with self.graph.as_default():
            self.tpu_init = [tpu.initialize_system()]
            self.tpu_shutdown = tpu.shutdown_system()

        self.resolver = get_resolver(hparams)
        session_config = tf.ConfigProto(
            allow_soft_placement=True,
            isolate_session_state=True,
            operation_timeout_in_ms=600 * 60 * 1000,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)))

        if self.hparams.tpu_name is None:
            master = self.hparams.master
        else:
            cluster_spec = self.resolver.cluster_spec()
            if cluster_spec:
                session_config.cluster_def.CopyFrom(
                    cluster_spec.as_cluster_def())
            master = self.resolver.get_master()
        self.sess = tf.Session(master, graph=self.graph, config=session_config)
        self.sess.run(self.tpu_init)
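The constructor ends by initializing the TPU system; a matching teardown sketch (hedged: runner is a hypothetical instance of the class above, and tpu_shutdown is the op built in its constructor):

runner.sess.run(runner.tpu_shutdown)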
Example #6
def main(unused_argv):

    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        input_image_size = model_builder_factory.get_model_input_size(
            FLAGS.model_name)

    if FLAGS.holdout_shards:
        holdout_images = int(FLAGS.num_train_images * FLAGS.holdout_shards /
                             1024.0)
        FLAGS.num_train_images -= holdout_images
        FLAGS.num_eval_images = holdout_images

    # For the ImageNet dataset, include the background label if the number of
    # output classes is 1001.
    include_background_label = (FLAGS.num_label_classes == 1001)

    if FLAGS.tpu or FLAGS.use_tpu:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    else:
        tpu_cluster_resolver = None

    if FLAGS.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(100, FLAGS.iterations_per_loop)
    config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size,
                  use_bfloat16=FLAGS.use_bfloat16)
    est = tf.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu,
        params=params)

    if (FLAGS.model_name.startswith('efficientnet-lite')
            or FLAGS.model_name.startswith('efficientnet-edgetpu')):
        # lite or edgetpu models use bilinear resizing for easier
        # post-quantization.
        resize_method = tf.image.ResizeMethod.BILINEAR
    else:
        resize_method = None
    # Input pipelines are slightly different (with regard to shuffling and
    # preprocessing) between training and evaluation.
    def build_imagenet_input(is_training):
        """Generate ImageNetInput for training and eval."""
        if FLAGS.bigtable_instance:
            logging.info('Using Bigtable dataset, table %s',
                         FLAGS.bigtable_table)
            select_train, select_eval = _select_tables_from_flags()
            return imagenet_input.ImageNetBigtableInput(
                is_training=is_training,
                use_bfloat16=FLAGS.use_bfloat16,
                transpose_input=FLAGS.transpose_input,
                selection=select_train if is_training else select_eval,
                num_label_classes=FLAGS.num_label_classes,
                include_background_label=include_background_label,
                augment_name=FLAGS.augment_name,
                mixup_alpha=FLAGS.mixup_alpha,
                randaug_num_layers=FLAGS.randaug_num_layers,
                randaug_magnitude=FLAGS.randaug_magnitude,
                resize_method=resize_method)
        else:
            if FLAGS.data_dir == FAKE_DATA_DIR:
                logging.info('Using fake dataset.')
            else:
                logging.info('Using dataset: %s', FLAGS.data_dir)

            return imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=FLAGS.transpose_input,
                cache=FLAGS.use_cache and is_training,
                image_size=input_image_size,
                num_parallel_calls=FLAGS.num_parallel_calls,
                use_bfloat16=FLAGS.use_bfloat16,
                num_label_classes=FLAGS.num_label_classes,
                include_background_label=include_background_label,
                augment_name=FLAGS.augment_name,
                mixup_alpha=FLAGS.mixup_alpha,
                randaug_num_layers=FLAGS.randaug_num_layers,
                randaug_magnitude=FLAGS.randaug_magnitude,
                resize_method=resize_method,
                holdout_shards=FLAGS.holdout_shards)

    imagenet_train = build_imagenet_input(is_training=True)
    imagenet_eval = build_imagenet_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(FLAGS.model_dir,
                                                  timeout=FLAGS.eval_timeout):
            logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt,
                                            name=FLAGS.eval_name)
                elapsed_time = int(time.time() - start_timestamp)
                logging.info('Eval results: %s. Elapsed seconds: %d',
                             eval_results, elapsed_time)
                if FLAGS.archive_ckpt:
                    utils.archive_ckpt(eval_results,
                                       eval_results['top_1_accuracy'], ckpt)

                # Terminate eval job when final checkpoint is reached
                try:
                    current_step = int(os.path.basename(ckpt).split('-')[1])
                except IndexError:
                    logging.info('%s has no global step info: stop!', ckpt)
                    break

                if current_step >= FLAGS.train_steps:
                    logging.info('Evaluation finished after training step %d',
                                 current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if FLAGS.use_async_checkpointing:
                try:
                    from tensorflow.contrib.tpu.python.tpu import async_checkpoint  # pylint: disable=g-import-not-at-top
                except ImportError as e:
                    logging.exception(
                        'Async checkpointing is not supported in TensorFlow 2.x'
                    )
                    raise e

                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=FLAGS.model_dir,
                        save_steps=max(100, FLAGS.iterations_per_loop)))
            est.train(input_fn=imagenet_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=imagenet_train.input_fn,
                          max_steps=next_checkpoint)
                current_step = next_checkpoint

                logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=imagenet_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size,
                                            name=FLAGS.eval_name)
                logging.info('Eval results at step %d: %s', next_checkpoint,
                             eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                if FLAGS.archive_ckpt:
                    utils.archive_ckpt(eval_results,
                                       eval_results['top_1_accuracy'], ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
Example #7
def main(unused_argv):
  params = params_dict.ParamsDict(
      resnet_config.RESNET_CFG, resnet_config.RESNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)

  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  params.validate()
  params.lock()

  tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
      FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
      zone=FLAGS.tpu_zone,
      project=FLAGS.gcp_project)

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(5000, params.iterations_per_loop)
  config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.estimator.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          num_shards=params.num_cores,
          per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  resnet_classifier = tf.estimator.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=resnet_model_fn,
      config=config,
      params=params.as_dict(),
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu)

  assert (params.precision == 'bfloat16' or
          params.precision == 'float32'), (
              'Invalid value for precision parameter; '
              'must be bfloat16 or float32.')
  tf.logging.info('Precision: %s', params.precision)
  use_bfloat16 = params.precision == 'bfloat16'

  # Input pipelines are slightly different (with regard to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(  # pylint: disable=g-complex-comprehension
            is_training=is_training,
            use_bfloat16=use_bfloat16,
            transpose_input=params.transpose_input,
            selection=selection,
            augment_name=FLAGS.augment_name,
            randaug_num_layers=FLAGS.randaug_num_layers,
            randaug_magnitude=FLAGS.randaug_magnitude)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(  # pylint: disable=g-complex-comprehension
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.image_size,
            num_parallel_calls=params.num_parallel_calls,
            include_background_label=(params.num_label_classes == 1001),
            use_bfloat16=use_bfloat16,
            augment_name=FLAGS.augment_name,
            randaug_num_layers=FLAGS.randaug_num_layers,
            randaug_magnitude=FLAGS.randaug_magnitude)
        for is_training in [True, False]
    ]

  steps_per_epoch = params.num_train_images // params.train_batch_size
  eval_steps = params.num_eval_images // params.eval_batch_size

  if FLAGS.mode == 'eval':

    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                        eval_results, elapsed_time)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info(
              'Evaluation finished after training step %d', current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info(
            'Checkpoint %s no longer exists, skipping checkpoint', ckpt)

  else:   # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    try:
      current_step = tf.train.load_variable(FLAGS.model_dir,
                                            tf.GraphKeys.GLOBAL_STEP)
    except (TypeError, ValueError, tf.errors.NotFoundError):
      current_step = 0
    steps_per_epoch = params.num_train_images // params.train_batch_size
    tf.logging.info('Training for %d steps (%.2f epochs in total). Current'
                    ' step %d.',
                    params.train_steps,
                    params.train_steps / steps_per_epoch,
                    current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        try:
          from tensorflow.contrib.tpu.python.tpu import async_checkpoint  # pylint: disable=g-import-not-at-top
        except ImportError as e:
          logging.exception(
              'Async checkpointing is not supported in TensorFlow 2.x')
          raise e

        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(5000, params.iterations_per_loop)))
      if FLAGS.profile_every_n_steps > 0:
        hooks.append(
            tpu_profiler_hook.TPUProfilerHook(
                save_steps=FLAGS.profile_every_n_steps,
                output_dir=FLAGS.model_dir, tpu=FLAGS.tpu)
            )
      resnet_classifier.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)

    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        resnet_classifier.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = resnet_classifier.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s',
                        next_checkpoint, eval_results)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)

    if FLAGS.export_dir is not None:
      # The guide to serving an exported TensorFlow model is at:
      #    https://www.tensorflow.org/serving/serving_basic
      tf.logging.info('Starting to export model.')
      export_path = resnet_classifier.export_saved_model(
          export_dir_base=FLAGS.export_dir,
          serving_input_receiver_fn=imagenet_input.image_serving_input_fn)
      if FLAGS.add_warmup_requests:
        inference_warmup.write_warmup_requests(
            export_path,
            FLAGS.model_name,
            params.image_size,
            batch_sizes=FLAGS.inference_batch_sizes,
            image_format='JPEG')
Example #8
def main(unused_argv):
  params = params_dict.ParamsDict(
      mnasnet_config.MNASNET_CFG, mnasnet_config.MNASNET_RESTRICTIONS)
  params = params_dict.override_params_dict(
      params, FLAGS.config_file, is_strict=True)
  params = params_dict.override_params_dict(
      params, FLAGS.params_override, is_strict=True)

  params = flags_to_params.override_params_from_input_flags(params, FLAGS)

  additional_params = {
      'steps_per_epoch': params.num_train_images / params.train_batch_size,
      'quantized_training': FLAGS.quantized_training,
      'add_summaries': FLAGS.add_summaries,
  }

  params = params_dict.override_params_dict(
      params, additional_params, is_strict=False)

  params.validate()
  params.lock()

  if FLAGS.tpu or params.use_tpu:
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
  else:
    tpu_cluster_resolver = None

  if params.use_async_checkpointing:
    save_checkpoints_steps = None
  else:
    save_checkpoints_steps = max(100, params.iterations_per_loop)

  # Enables automatic outside compilation. Required in order to
  # automatically detect summary ops to run on CPU instead of TPU.
  tf.config.set_soft_device_placement(True)

  config = tf.estimator.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=FLAGS.model_dir,
      save_checkpoints_steps=save_checkpoints_steps,
      log_step_count_steps=FLAGS.log_step_count_steps,
      session_config=tf.ConfigProto(
          graph_options=tf.GraphOptions(
              rewrite_options=rewriter_config_pb2.RewriterConfig(
                  disable_meta_optimizer=True))),
      tpu_config=tf.estimator.tpu.TPUConfig(
          iterations_per_loop=params.iterations_per_loop,
          per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig
          .PER_HOST_V2))  # pylint: disable=line-too-long

  # Validates Flags.
  if params.precision == 'bfloat16' and params.use_keras:
    raise ValueError(
        'Keras layers do not fully support bfloat16 activation training.'
        ' You have set precision as %s and use_keras as %s' %
        (params.precision, params.use_keras))

  # Builds the TPUEstimator.
  mnasnet_est = tf.estimator.tpu.TPUEstimator(
      use_tpu=params.use_tpu,
      model_fn=build_model_fn,
      config=config,
      train_batch_size=params.train_batch_size,
      eval_batch_size=params.eval_batch_size,
      export_to_tpu=FLAGS.export_to_tpu,
      params=params.as_dict())

  if FLAGS.mode == 'export_only':
    export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
    return

  # Input pipelines are slightly different (with regard to shuffling and
  # preprocessing) between training and evaluation.
  if FLAGS.bigtable_instance:
    tf.logging.info('Using Bigtable dataset, table %s', FLAGS.bigtable_table)
    select_train, select_eval = _select_tables_from_flags()
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetBigtableInput(
            is_training=is_training,
            use_bfloat16=False,
            transpose_input=params.transpose_input,
            selection=selection)
        for (is_training, selection) in [(True, select_train),
                                         (False, select_eval)]
    ]
  else:
    if FLAGS.data_dir == FAKE_DATA_DIR:
      tf.logging.info('Using fake dataset.')
    else:
      tf.logging.info('Using dataset: %s', FLAGS.data_dir)
    imagenet_train, imagenet_eval = [
        imagenet_input.ImageNetInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.input_image_size,
            num_parallel_calls=params.num_parallel_calls,
            use_bfloat16=(params.precision == 'bfloat16'))
        for is_training in [True, False]
    ]

  if FLAGS.mode == 'eval':
    eval_steps = params.num_eval_images // params.eval_batch_size
    # Run evaluation when there's a new checkpoint
    for ckpt in tf.train.checkpoints_iterator(
        FLAGS.model_dir, timeout=FLAGS.eval_timeout):
      tf.logging.info('Starting to evaluate.')
      try:
        start_timestamp = time.time()  # This time will include compilation time
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=eval_steps,
            checkpoint_path=ckpt)
        elapsed_time = int(time.time() - start_timestamp)
        tf.logging.info('Eval results: %s. Elapsed seconds: %d', eval_results,
                        elapsed_time)
        mnas_utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

        # Terminate eval job when final checkpoint is reached
        current_step = int(os.path.basename(ckpt).split('-')[1])
        if current_step >= params.train_steps:
          tf.logging.info('Evaluation finished after training step %d',
                          current_step)
          break

      except tf.errors.NotFoundError:
        # Since the coordinator is on a different job than the TPU worker,
        # sometimes the TPU worker does not finish initializing until long after
        # the CPU job tells it to start evaluating. In this case, the checkpoint
        # file could have been deleted already.
        tf.logging.info('Checkpoint %s no longer exists, skipping checkpoint',
                        ckpt)

    if FLAGS.export_dir:
      export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
  else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
    try:
      current_step = tf.train.load_variable(FLAGS.model_dir,
                                            tf.GraphKeys.GLOBAL_STEP)
    except (TypeError, ValueError, tf.errors.NotFoundError):
      current_step = 0

    tf.logging.info(
        'Training for %d steps (%.2f epochs in total). Current'
        ' step %d.', params.train_steps,
        params.train_steps / params.steps_per_epoch, current_step)

    start_timestamp = time.time()  # This time will include compilation time

    if FLAGS.mode == 'train':
      hooks = []
      if params.use_async_checkpointing:
        try:
          from tensorflow.contrib.tpu.python.tpu import async_checkpoint  # pylint: disable=g-import-not-at-top
        except ImportError as e:
          logging.exception(
              'Async checkpointing is not supported in TensorFlow 2.x')
          raise e

        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(100, params.iterations_per_loop)))
      mnasnet_est.train(
          input_fn=imagenet_train.input_fn,
          max_steps=params.train_steps,
          hooks=hooks)

    else:
      assert FLAGS.mode == 'train_and_eval'
      while current_step < params.train_steps:
        # Train for up to steps_per_eval number of steps.
        # At the end of training, a checkpoint will be written to --model_dir.
        next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                              params.train_steps)
        mnasnet_est.train(
            input_fn=imagenet_train.input_fn, max_steps=next_checkpoint)
        current_step = next_checkpoint

        tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                        next_checkpoint, int(time.time() - start_timestamp))

        # Evaluate the model on the most recent model in --model_dir.
        # Since evaluation happens in batches of --eval_batch_size, some images
        # may be excluded modulo the batch size. As long as the batch size is
        # consistent, the evaluated images are also consistent.
        tf.logging.info('Starting to evaluate.')
        eval_results = mnasnet_est.evaluate(
            input_fn=imagenet_eval.input_fn,
            steps=params.num_eval_images // params.eval_batch_size)
        tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                        eval_results)
        ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
        mnas_utils.archive_ckpt(eval_results, eval_results['top_1_accuracy'], ckpt)

      elapsed_time = int(time.time() - start_timestamp)
      tf.logging.info('Finished training up to step %d. Elapsed seconds %d.',
                      params.train_steps, elapsed_time)
      if FLAGS.export_dir:
        export(mnasnet_est, FLAGS.export_dir, params, FLAGS.post_quantize)
Example #9
File: train.py Project: yichenj/tpu
def main(unused_argv):
    input_image_size = FLAGS.input_image_size
    if not input_image_size:
        if FLAGS.model_name.startswith('efficientnet'):
            _, _, input_image_size, _ = efficientnet_builder.efficientnet_params(
                FLAGS.model_name)
        else:
            raise ValueError(
                'input_image_size must be set except for EfficientNet')

    config = tf.estimator.RunConfig(
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=FLAGS.keep_checkpoint_max,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(graph_options=tf.GraphOptions(
            rewrite_options=rewriter_config_pb2.RewriterConfig(
                disable_meta_optimizer=True))))
    # Initializes model parameters.
    params = dict(steps_per_epoch=FLAGS.num_train_images /
                  FLAGS.train_batch_size)
    est = tf.estimator.Estimator(model_fn=model_fn,
                                 config=config,
                                 params=params)

    def build_input(is_training):
        """Input for training and eval."""
        tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        return egg_candler_input.EggCandlerInput(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            train_batch_size=FLAGS.train_batch_size,
            eval_batch_size=FLAGS.eval_batch_size,
            image_size=input_image_size)

    image_train = build_input(is_training=True)
    image_eval = build_input(is_training=False)

    if FLAGS.mode == 'eval':
        eval_steps = FLAGS.num_eval_images // FLAGS.eval_batch_size
        # Run evaluation when there's a new checkpoint
        for ckpt in tf.train.checkpoints_iterator(FLAGS.model_dir,
                                                  timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = est.evaluate(input_fn=image_eval.input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)
                utils.archive_ckpt(eval_results, eval_results['val_accuracy'],
                                   ckpt)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= FLAGS.train_steps:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)
    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long

        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', FLAGS.train_steps,
            FLAGS.train_steps / params['steps_per_epoch'], current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            est.train(input_fn=image_train.input_fn,
                      max_steps=FLAGS.train_steps,
                      hooks=[])
        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < FLAGS.train_steps:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      FLAGS.train_steps)
                est.train(input_fn=image_train.input_fn,
                          max_steps=next_checkpoint,
                          hooks=[])
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = est.evaluate(input_fn=image_eval.input_fn,
                                            steps=FLAGS.num_eval_images //
                                            FLAGS.eval_batch_size)
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)
                ckpt = tf.train.latest_checkpoint(FLAGS.model_dir)
                utils.archive_ckpt(eval_results, eval_results['val_accuracy'],
                                   ckpt)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                FLAGS.train_steps, elapsed_time)
    if FLAGS.export_dir:
        export(est, FLAGS.export_dir, input_image_size)
Example #10
def main(unused_argv):
    params = hyperparameters.get_hyperparameters(FLAGS.default_hparams_file,
                                                 FLAGS.hparams_file, FLAGS,
                                                 FLAGS.hparams)
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params['use_tpu']) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params['use_async_checkpointing']:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(2500, params['iterations_per_loop'])
    config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=get_model_dir(params),
        save_checkpoints_steps=save_checkpoints_steps,
        keep_checkpoint_max=None,  # Keep all checkpoints.
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=params['iterations_per_loop'],
            num_shards=params['num_cores'],
            # copybara:strip_begin
            tpu_job_name=FLAGS.tpu_job_name,
            # copybara:strip_end
            per_host_input_for_training=contrib_tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = contrib_tpu.TPUEstimator(
        use_tpu=params['use_tpu'],
        model_fn=resnet_model_fn,
        config=config,
        params=params,
        train_batch_size=params['train_batch_size'],
        eval_batch_size=params['eval_batch_size'],
        export_to_tpu=FLAGS.export_to_tpu)

    # copybara:strip_begin
    if FLAGS.xla_compile:
        resnet_classifier = contrib_tpu.TPUEstimator(
            use_tpu=params['use_tpu'],
            model_fn=xla.estimator_model_fn(resnet_model_fn),
            config=config,
            params=params,
            train_batch_size=params['train_batch_size'],
            eval_batch_size=params['eval_batch_size'],
            export_to_tpu=FLAGS.export_to_tpu)
    # copybara:strip_end
    assert (params['precision'] == 'bfloat16' or params['precision']
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params['precision'])
    use_bfloat16 = params['precision'] == 'bfloat16'

    # Input pipelines are slightly different (with regard to shuffling and
    # preprocessing) between training and evaluation.
    if FLAGS.bigtable_instance:
        tf.logging.info('Using Bigtable dataset, table %s',
                        FLAGS.bigtable_table)
        select_train, select_eval = _select_tables_from_flags()
        imagenet_train = imagenet_input.ImageNetBigtableInput(
            is_training=True,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_train)
        imagenet_eval = imagenet_input.ImageNetBigtableInput(
            is_training=False,
            use_bfloat16=use_bfloat16,
            transpose_input=params['transpose_input'],
            selection=select_eval)
    else:
        if FLAGS.data_dir == FAKE_DATA_DIR:
            tf.logging.info('Using fake dataset.')
        else:
            tf.logging.info('Using dataset: %s', FLAGS.data_dir)
        imagenet_train, imagenet_eval = [
            imagenet_input.ImageNetInput(
                is_training=is_training,
                data_dir=FLAGS.data_dir,
                transpose_input=params['transpose_input'],
                cache=params['use_cache'] and is_training,
                image_size=params['image_size'],
                num_parallel_calls=params['num_parallel_calls'],
                use_bfloat16=use_bfloat16) for is_training in [True, False]
        ]

    steps_per_epoch = params['num_train_images'] // params['train_batch_size']
    eval_steps = params['num_eval_images'] // params['eval_batch_size']

    if FLAGS.mode == 'eval':

        # Run evaluation when there's a new checkpoint
        for ckpt in evaluation.checkpoints_iterator(
                get_model_dir(params), timeout=FLAGS.eval_timeout):
            tf.logging.info('Starting to evaluate.')
            try:
                start_timestamp = time.time()  # This time will include compilation time
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=eval_steps,
                    checkpoint_path=ckpt)
                elapsed_time = int(time.time() - start_timestamp)
                tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                                eval_results, elapsed_time)

                # Terminate eval job when final checkpoint is reached
                current_step = int(os.path.basename(ckpt).split('-')[1])
                if current_step >= params['train_steps']:
                    tf.logging.info(
                        'Evaluation finished after training step %d',
                        current_step)
                    break

            except tf.errors.NotFoundError:
                # Since the coordinator is on a different job than the TPU worker,
                # sometimes the TPU worker does not finish initializing until long after
                # the CPU job tells it to start evaluating. In this case, the checkpoint
                # file could have been deleted already.
                tf.logging.info(
                    'Checkpoint %s no longer exists, skipping checkpoint',
                    ckpt)

    elif FLAGS.mode == 'eval_igt':
        # IGT evaluation mode. Evaluate metrics for the desired parameters
        # (true or shifted) on the desired dataset (train or eval). Note that
        # the train set still has data augmentation applied.

        # Get checkpoint file names.
        index_files = tf.gfile.Glob(
            os.path.join(get_model_dir(params), 'model.ckpt-*.index'))
        checkpoints = [fn[:-len('.index')] for fn in index_files]
        # Need to sort them to get proper tensorboard plotting (increasing event
        # timestamps correspond to increasing steps).
        checkpoint_steps = []
        for ckpt in checkpoints:
            tf.logging.info(ckpt)
            step_match = re.match(r'.*model.ckpt-([0-9]*)', ckpt)
            checkpoint_steps.append(int(step_match.group(1)))
        checkpoints = [
            ckpt for _, ckpt in sorted(zip(checkpoint_steps, checkpoints))
        ]
        tf.logging.info('There are {} checkpoints'.format(len(checkpoints)))
        tf.logging.info(', '.join(checkpoints))

        # Keep track of the last processed checkpoint (fault tolerance).
        analysis_state_path = os.path.join(
            get_model_dir(params),
            'analysis_state_' + FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
        next_analysis_index = 0
        if tf.gfile.Exists(analysis_state_path):
            with tf.gfile.Open(analysis_state_path) as fd:
                next_analysis_index = int(fd.read())

        # Process each checkpoint.
        while next_analysis_index < len(checkpoints):
            tf.logging.info(
                'Next analysis index: {}'.format(next_analysis_index))
            ckpt_path = checkpoints[next_analysis_index]
            tf.logging.info('Starting to evaluate: {}.'.format(ckpt_path))
            start_timestamp = time.time()  # This time will include compilation time

            if FLAGS.igt_eval_set == 'train':
                the_input_fn = imagenet_train.input_fn
                the_steps = steps_per_epoch
            elif FLAGS.igt_eval_set == 'eval':
                the_input_fn = imagenet_eval.input_fn
                the_steps = eval_steps
            else:
                raise ValueError('Unsupported igt_eval_set')

            eval_results = resnet_classifier.evaluate(
                input_fn=the_input_fn,
                steps=the_steps,
                checkpoint_path=ckpt_path,
                name=FLAGS.igt_eval_set + '_' + FLAGS.igt_eval_mode)
            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info('Eval results: %s. Elapsed seconds: %d',
                            eval_results, elapsed_time)

            next_analysis_index += 1
            file_io.atomic_write_string_to_file(analysis_state_path,
                                                str(next_analysis_index))

    else:  # FLAGS.mode == 'train' or FLAGS.mode == 'train_and_eval'
        current_step = estimator._load_global_step_from_checkpoint_dir(
            get_model_dir(params))  # pylint:disable=protected-access,g-line-too-long
        steps_per_epoch = params['num_train_images'] // params[
            'train_batch_size']
        tf.logging.info(
            'Training for %d steps (%.2f epochs in total). Current'
            ' step %d.', params['train_steps'],
            params['train_steps'] / steps_per_epoch, current_step)

        start_timestamp = time.time()  # This time will include compilation time

        if FLAGS.mode == 'train':
            hooks = []
            if params['use_async_checkpointing']:
                hooks.append(
                    async_checkpoint.AsyncCheckpointSaverHook(
                        checkpoint_dir=get_model_dir(params),
                        save_steps=max(2500, params['iterations_per_loop'])))
            resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                    max_steps=params['train_steps'],
                                    hooks=hooks)

        else:
            assert FLAGS.mode == 'train_and_eval'
            while current_step < params['train_steps']:
                # Train for up to steps_per_eval number of steps.
                # At the end of training, a checkpoint will be written to --model_dir.
                next_checkpoint = min(current_step + FLAGS.steps_per_eval,
                                      params['train_steps'])
                resnet_classifier.train(input_fn=imagenet_train.input_fn,
                                        max_steps=next_checkpoint)
                current_step = next_checkpoint

                tf.logging.info(
                    'Finished training up to step %d. Elapsed seconds %d.',
                    next_checkpoint, int(time.time() - start_timestamp))

                # Evaluate the model on the most recent model in --model_dir.
                # Since evaluation happens in batches of --eval_batch_size, some images
                # may be excluded modulo the batch size. As long as the batch size is
                # consistent, the evaluated images are also consistent.
                tf.logging.info('Starting to evaluate.')
                eval_results = resnet_classifier.evaluate(
                    input_fn=imagenet_eval.input_fn,
                    steps=params['num_eval_images'] //
                    params['eval_batch_size'])
                tf.logging.info('Eval results at step %d: %s', next_checkpoint,
                                eval_results)

            elapsed_time = int(time.time() - start_timestamp)
            tf.logging.info(
                'Finished training up to step %d. Elapsed seconds %d.',
                params['train_steps'], elapsed_time)

        if FLAGS.export_dir is not None:
            # The guide to serving an exported TensorFlow model is at:
            #    https://www.tensorflow.org/serving/serving_basic
            tf.logging.info('Starting to export model.')
            unused_export_path = resnet_classifier.export_saved_model(
                export_dir_base=FLAGS.export_dir,
                serving_input_receiver_fn=imagenet_input.image_serving_input_fn
            )
Example #11
    def test_sample_auxiliary_op(self):
        p_fn, q_fn = sampling.mean_field_fn()
        p = p_fn(tf.float32, (), 'test_prior', True,
                 tf.get_variable).distribution
        q = q_fn(tf.float32, (), 'test_posterior', True,
                 tf.get_variable).distribution

        # Test benign auxiliary variable
        sample_op, _ = sampling.sample_auxiliary_op(p, q, 1e-10)
        session_config = tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    arithmetic_optimization=(
                        rewriter_config_pb2.RewriterConfig.AGGRESSIVE))))

        sess = tf.Session(config=session_config)
        sess.run(tf.global_variables_initializer())
        p.loc.load(1., session=sess)
        p.untransformed_scale.load(self._softplus_inverse_np(1.), session=sess)
        q.loc.load(1.1, session=sess)
        q.untransformed_scale.load(self._softplus_inverse_np(0.5),
                                   session=sess)
        print(sess.run(q.scale))

        sess.run(sample_op)

        tolerance = 0.0001
        self.assertLess(np.abs(sess.run(p.scale) - 1.), tolerance)
        self.assertLess(np.abs(sess.run(p.loc) - 1.), tolerance)
        self.assertLess(np.abs(sess.run(q.scale) - 0.5), tolerance)
        self.assertLess(np.abs(sess.run(q.loc) - 1.1), tolerance)

        # Test fully determining auxiliary variable
        sample_op, _ = sampling.sample_auxiliary_op(p, q, 1. - 1e-10)
        sess.run(tf.global_variables_initializer())
        p.loc.load(1., session=sess)
        p.untransformed_scale.load(self._softplus_inverse_np(1.), session=sess)
        q.loc.load(1.1, session=sess)
        q.untransformed_scale.load(self._softplus_inverse_np(.5), session=sess)

        sess.run(sample_op)

        self.assertLess(np.abs(sess.run(q.loc) - sess.run(p.loc)), tolerance)
        self.assertLess(sess.run(p.scale), tolerance)
        self.assertLess(sess.run(q.scale), tolerance)

        # Test delta posterior
        sample_op, _ = sampling.sample_auxiliary_op(p, q, 0.5)
        sess.run(tf.global_variables_initializer())
        p.loc.load(1., session=sess)
        p.untransformed_scale.load(self._softplus_inverse_np(1.), session=sess)
        q.loc.load(1.1, session=sess)
        q.untransformed_scale.load(self._softplus_inverse_np(1e-10),
                                   session=sess)

        sess.run(sample_op)

        self.assertLess(np.abs(sess.run(q.loc) - 1.1), tolerance)
        self.assertLess(sess.run(q.scale), tolerance)

        # Test prior is posterior
        sample_op, _ = sampling.sample_auxiliary_op(p, q, 0.5)
        sess.run(tf.global_variables_initializer())
        p.loc.load(1., session=sess)
        p.untransformed_scale.load(self._softplus_inverse_np(1.), session=sess)
        q.loc.load(1., session=sess)
        q.untransformed_scale.load(self._softplus_inverse_np(1.), session=sess)

        sess.run(sample_op)

        self.assertLess(np.abs(sess.run(q.loc - p.loc)), tolerance)
        self.assertLess(np.abs(sess.run(q.scale - p.scale)), tolerance)
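The test relies on a _softplus_inverse_np helper it does not show. A hedged reconstruction (assumption, not the original helper: the scale parameters are stored pre-softplus, so the inverse maps a desired scale back to its untransformed value):

import numpy as np

def _softplus_inverse_np(x):
    # softplus(y) = log(1 + exp(y)); its inverse is log(exp(x) - 1)
    return np.log(np.expm1(x))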
Example #12
    def __init__(self,
                 iterations_per_loop,
                 train_steps,
                 eval_steps,
                 num_replicas,
                 eval_dataset_repeats=True,
                 do_initialize=True):
        self.feature_structure = {}
        self.infeed_op = {}
        self.num_replicas = num_replicas
        self.eval_dataset_repeats = eval_dataset_repeats
        # Set number of input graphs to number of hosts up to a maximum of 32.
        self.num_input_graphs = min(
            32, self.num_replicas // FLAGS.replicas_per_host)
        # The following structures keep separate copies for training and
        # eval, represented as maps from is_train (boolean) to the data.
        self.dataset_initializer = {True: [], False: []}
        self.input_graph = {True: [], False: []}
        self.input_sess = {True: [], False: []}
        self.enqueue_ops = {True: [], False: []}
        for _ in range(self.num_input_graphs):
            self.input_graph[True].append(tf.Graph())
            self.input_graph[False].append(tf.Graph())
            self.dataset_initializer[True].append([])
            self.dataset_initializer[False].append([])
            self.enqueue_ops[True].append([])
            self.enqueue_ops[False].append([])
            self.input_sess[True].append([])
            self.input_sess[False].append([])
        # dequeue_ops is only for eval
        self.dequeue_ops = []
        self.iterations_per_loop = iterations_per_loop
        self.sess = None
        self.output_sess = None
        self.train_eval_thread = None
        self.graph = tf.Graph()
        if iterations_per_loop != 0 and train_steps % iterations_per_loop != 0:
            train_steps = iterations_per_loop * int(
                math.ceil(train_steps / iterations_per_loop))
        self.train_steps = train_steps
        if iterations_per_loop == 0:
            self.max_train_iterations = 1
        else:
            self.max_train_iterations = train_steps // iterations_per_loop
        self.eval_steps = int(eval_steps)
        self.train_batch_size = 0
        self.eval_batch_size = 0
        self.eval_has_labels = 0
        self.model_fn = None
        self.num_outfeeds = self.eval_steps
        self.config = tf.ConfigProto(
            operation_timeout_in_ms=600 * 60 * 1000,
            allow_soft_placement=True,
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True)),
            isolate_session_state=True)

        if FLAGS.enable_mlir_bridge:
            self.config.experimental.enable_mlir_bridge = True

        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.master,
            zone=FLAGS.tpu_zone,
            project=FLAGS.gcp_project,
            job_name="tpu_worker")
        self.master = tpu_cluster_resolver.get_master()
        self.job_name = tpu_cluster_resolver.get_job_name() or "tpu_worker"
        self.embedding_config = None
        self.device_topology = None
        if do_initialize:
            self.device_topology = tf.Session(
                self.master, config=self.config).run(tpu.initialize_system())
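
The constructor rounds train_steps up to the next multiple of iterations_per_loop. A small standalone check of that logic (the helper name is ours; the body is taken verbatim from the code above):

import math

def round_up_steps(train_steps, iterations_per_loop):
    # iterations_per_loop == 0 leaves the step count untouched (and avoids
    # a modulo-by-zero); otherwise round up to a whole number of loops.
    if iterations_per_loop != 0 and train_steps % iterations_per_loop != 0:
        train_steps = iterations_per_loop * int(
            math.ceil(train_steps / iterations_per_loop))
    return train_steps

assert round_up_steps(250, 100) == 300  # rounded up to a full loop
assert round_up_steps(300, 100) == 300  # already a multiple, unchanged
assert round_up_steps(42, 0) == 42      # loop size 0: left as-is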
Example No. 13
def main():
    # current camera frame
    global frame, annotatedFrame, frameQueue, currentFps, selectedIdx, selectedClassName, objectDistance, boxes, scores, stats
    global currentMode, M_AUTOMANEUVER, M_AUTONAV, M_MANUAL

    # print(cv2.getBuildInformation())
    print("Loading model")
    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(CHKPT_PATH, 'rb') as fid:
            serialized_graph = fid.read()
            od_graph_def.ParseFromString(serialized_graph)
            tf.import_graph_def(od_graph_def, name='')

    label_map = label_map_util.load_labelmap(LABELS_PATH)
    categories = label_map_util.convert_label_map_to_categories(
        label_map, max_num_classes=2, use_display_name=True)
    category_index = label_map_util.create_category_index(categories)
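    # category_index maps class ids to dicts such as
    # {1: {'id': 1, 'name': 'obstacle'}}; the actual names come from
    # LABELS_PATH (the entry shown here is hypothetical).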

    print("Starting main python module")
    if not DEBUG_DISABLE_FLIGHT:
        flightData = Drone(updateFlightInfo)
        process = Thread(target=flight.flightMain, args=(flightData, ))
        process.start()
    ip = '0.0.0.0'
    server = ThreadedHTTPServer((ip, 9090), CamHandler)
    target = Thread(target=server.serve_forever, args=())
    i = 0

    # To flip the image, modify the flip_method parameter (0 and 2 are the most common)
    #print(gstreamer_pipeline(flip_method=0))
    cap = cv2.VideoCapture(gstreamer_pipeline(flip_method=2),
                           cv2.CAP_GSTREAMER)
    fpsSmoothing = 70
    lastUpdate = time.time()
    try:
        if cap.isOpened():
            print("CSI Camera opened")
            graph_options = tf.GraphOptions(
                optimizer_options=tf.OptimizerOptions(
                    opt_level=tf.OptimizerOptions.L1, ))
            OptConfig = tf.ConfigProto(graph_options=graph_options)
            with detection_graph.as_default():
                with tf.Session(graph=detection_graph,
                                config=OptConfig) as sess:
                    # Define input and output tensors for detection_graph
                    image_tensor = detection_graph.get_tensor_by_name(
                        'image_tensor:0')
                    # Each box represents a part of the image where a particular object
                    # was detected.
                    detection_boxes = detection_graph.get_tensor_by_name(
                        'detection_boxes:0')
                    # Each score represents the confidence level for each
                    # detected object and is shown on the result image
                    # together with the class label.
                    detection_scores = detection_graph.get_tensor_by_name(
                        'detection_scores:0')
                    detection_classes = detection_graph.get_tensor_by_name(
                        'detection_classes:0')
                    num_detections = detection_graph.get_tensor_by_name(
                        'num_detections:0')
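                    # Per the object detection API, detection_boxes is
                    # [1, N, 4] in normalized [ymin, xmin, ymax, xmax] order;
                    # detection_scores and detection_classes are [1, N].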
                    i = 0
                    print("TensorFlow session loaded.")
                    while mainThreadRunning:
                        ret_val, img = cap.read()
                        frame = img
                        # convert OpenCV's BGR to RGB as the model
                        # was trained on RGB images
                        color_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        # resize image to model size of 360x270
                        color_frame = cv2.resize(color_frame, (360, 270),
                                                 interpolation=cv2.INTER_CUBIC)
                        image_np_expanded = np.expand_dims(color_frame, axis=0)
                        # Actual detection
                        (boxes, scores, classes, num) = sess.run(
                            [
                                detection_boxes, detection_scores,
                                detection_classes, num_detections
                            ],
                            feed_dict={image_tensor: image_np_expanded})

                        # Draw boxes using the TF visualization library; should be off during competition
                        if useBoxVisualization:
                            vis_util.visualize_boxes_and_labels_on_image_array(
                                frame,
                                np.squeeze(boxes),
                                np.squeeze(classes).astype(np.int32),
                                np.squeeze(scores),
                                category_index,
                                use_normalized_coordinates=True,
                                line_thickness=4,
                                min_score_thresh=MIN_CONFIDENCE)

                        # Now that we have the detected bounding boxes, determine the
                        # current obstacle. First, gather stats about the boxes;
                        # squeezing lets us access box[i] directly instead of box[0][i].
                        boxes = np.squeeze(boxes)
                        classes = np.squeeze(classes)
                        scores = np.squeeze(scores)
                        stats = []
                        j = 0
                        # 15 ft: any object farther than this is treated as a misidentification
                        lowestDistance = 15

                        if DEBUG_DUMP_DETECTIONS:
                            print("Boxes // Classes // Scores")
                            print(boxes)
                            print(classes)
                            print(scores)
                        # Reset selections
                        selectedIdx = None
                        if len(boxes) > 0:
                            for j in range(0, len(boxes)):
                                if scores[j] >= MIN_CONFIDENCE:
                                    stats.insert(
                                        j, getBoxStats(boxes[j], classes[j]))
                                    # print("box[%d] distance is %f" % (j, stats[j]['distance']))
                                    if stats[j]['distance'] < lowestDistance:
                                        selectedIdx = j
                                        selectedClassName = classToString(
                                            classes[j])
                                        objectDistance = stats[j]['distance']
                                        lowestDistance = objectDistance
                                        #print("Selected box[%d]: distance %f class %s conf %f" % (j, objectDistance, selectedClassName, scores[j]))
                                else:
                                    # Skip calculations on this box if it does not meet
                                    # the confidence threshold
                                    stats.insert(j, 0)
                        if not DEBUG_DISABLE_FLIGHT:
                            if selectedIdx is not None:
                                flightData.upData(stats[selectedIdx],
                                                  selectedClassName)
                            else:
                                flightData.upData(None, "None")

                        # add the HUD to the current image
                        annotatedFrame = applyHud()
                        # currentFrameTime = time.time()
                        #if frameQueue.full():
                        #    with frameQueue.mutex:
                        #        frameQueue.queue.clear()
                        frameQueue.put(annotatedFrame.copy())
                        if i == 0:
                            target.start()
                            print("Starting MJPEG stream")
                        i += 1
                        # FPS smoothing algorithm
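                        # An exponential moving average: each frame pulls
                        # currentFps toward the instantaneous FPS by a
                        # factor of 1/fpsSmoothing.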
                        frameTime = time.time() - lastUpdate
                        frameFps = 1 / frameTime
                        currentFps += (frameFps - currentFps) / fpsSmoothing
                        lastUpdate = time.time()

                    cap.release()
        else:
            print("FATAL: Unable to open camera")

    except KeyboardInterrupt:
        sys.exit()