    def test_checkpoint_max_to_keep(self):
        """Test that only the most recent checkpoints are kept."""

        with mock.patch.object(model_builder, 'build',
                               autospec=True) as mock_builder:
            mock_builder.return_value = SimpleModel()

            hparams = model_hparams.create_hparams(
                hparams_overrides='load_pretrained=false')
            pipeline_config_path = get_pipeline_config_path(
                MODEL_NAME_FOR_TEST)
            config_kwarg_overrides = _get_config_kwarg_overrides()
            model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())

            model_lib_v2.train_loop(hparams,
                                    pipeline_config_path,
                                    model_dir=model_dir,
                                    train_steps=20,
                                    checkpoint_every_n=2,
                                    checkpoint_max_to_keep=3,
                                    **config_kwarg_overrides)
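            # With checkpoint_every_n=2 over 20 steps, a series of checkpoints
            # is written; checkpoint_max_to_keep=3 should leave only the three
            # most recent ckpt-*.index files on disk.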
            ckpt_files = tf.io.gfile.glob(
                os.path.join(model_dir, 'ckpt-*.index'))
            self.assertEqual(len(ckpt_files), 3,
                             '{} not of length 3.'.format(ckpt_files))
Example 2
def main(unused_argv):
  flags.mark_flag_as_required('model_dir')
  flags.mark_flag_as_required('pipeline_config_path')
  tf.config.set_soft_device_placement(True)

  if FLAGS.checkpoint_dir:
    model_lib_v2.eval_continuously(
        pipeline_config_path=FLAGS.pipeline_config_path,
        model_dir=FLAGS.model_dir,
        train_steps=FLAGS.num_train_steps,
        sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
        sample_1_of_n_eval_on_train_examples=(
            FLAGS.sample_1_of_n_eval_on_train_examples),
        checkpoint_dir=FLAGS.checkpoint_dir,
        wait_interval=300, timeout=FLAGS.eval_timeout)
  else:
    if FLAGS.use_tpu:
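      # Connect to and initialize the TPU system before building the strategy.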
      resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
      tf.config.experimental_connect_to_cluster(resolver)
      tf.tpu.experimental.initialize_tpu_system(resolver)
      strategy = tf.distribute.experimental.TPUStrategy(resolver)
    elif FLAGS.num_workers > 1:
      strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
    else:
      strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
      model_lib_v2.train_loop(
          pipeline_config_path=FLAGS.pipeline_config_path,
          model_dir=FLAGS.model_dir,
          train_steps=FLAGS.num_train_steps,
          use_tpu=FLAGS.use_tpu)
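
For reference, a script like the one above assumes absl flags defined at module level. A minimal sketch of those definitions, covering only the flag names the FLAGS references above imply (defaults and help strings here are illustrative, not canonical):

from absl import flags

flags.DEFINE_string('pipeline_config_path', None,
                    'Path to the pipeline config file.')
flags.DEFINE_string('model_dir', None,
                    'Directory to write checkpoints and summaries to.')
flags.DEFINE_string('checkpoint_dir', None,
                    'If set, evaluate checkpoints from this directory instead '
                    'of training.')
flags.DEFINE_integer('num_train_steps', None, 'Number of training steps.')
flags.DEFINE_integer('sample_1_of_n_eval_examples', None,
                     'Evaluate one of every n eval input examples.')
flags.DEFINE_integer('sample_1_of_n_eval_on_train_examples', 5,
                     'Evaluate one of every n train input examples when '
                     'evaluating on the training set.')
flags.DEFINE_integer('eval_timeout', 3600,
                     'Seconds to wait for a new checkpoint before giving up.')
flags.DEFINE_bool('use_tpu', False, 'Whether the job runs on a TPU.')
flags.DEFINE_integer('num_workers', 1,
                     'When above 1, use MultiWorkerMirroredStrategy.')

FLAGS = flags.FLAGS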
  def test_train_loop_then_eval_loop(self):
    """Tests that Estimator and input function are constructed correctly."""
    hparams = model_hparams.create_hparams(
        hparams_overrides='load_pretrained=false')
    pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
    config_kwarg_overrides = _get_config_kwarg_overrides()
    model_dir = tf.test.get_temp_dir()

    train_steps = 2
    model_lib_v2.train_loop(
        hparams,
        pipeline_config_path,
        model_dir=model_dir,
        train_steps=train_steps,
        checkpoint_every_n=1,
        **config_kwarg_overrides)

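    # eval_continuously polls checkpoint_dir for new checkpoints every
    # wait_interval seconds, evaluating each one as it appears.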
    model_lib_v2.eval_continuously(
        hparams,
        pipeline_config_path,
        model_dir=model_dir,
        checkpoint_dir=model_dir,
        train_steps=train_steps,
        wait_interval=10,
        **config_kwarg_overrides)
    def test_checkpoint_max_to_keep(self):
        """Test that only the most recent checkpoints are kept."""

        strategy = tf2.distribute.OneDeviceStrategy(device='/cpu:0')
        with mock.patch.object(model_builder, 'build',
                               autospec=True) as mock_builder:
            with strategy.scope():
                mock_builder.return_value = SimpleModel()
            model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
            pipeline_config_path = get_pipeline_config_path(
                MODEL_NAME_FOR_TEST)
            new_pipeline_config_path = os.path.join(model_dir,
                                                    'new_pipeline.config')
            config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                                   new_pipeline_config_path)
            config_kwarg_overrides = _get_config_kwarg_overrides()

            with strategy.scope():
                model_lib_v2.train_loop(new_pipeline_config_path,
                                        model_dir=model_dir,
                                        train_steps=20,
                                        checkpoint_every_n=2,
                                        checkpoint_max_to_keep=3,
                                        **config_kwarg_overrides)
            ckpt_files = tf.io.gfile.glob(
                os.path.join(model_dir, 'ckpt-*.index'))
            self.assertEqual(len(ckpt_files), 3,
                             '{} not of length 3.'.format(ckpt_files))
    def test_train_loop_then_eval_loop(self):
        """Tests that Estimator and input function are constructed correctly."""
        model_dir = tf.test.get_temp_dir()
        pipeline_config_path = get_pipeline_config_path(MODEL_NAME_FOR_TEST)
        new_pipeline_config_path = os.path.join(model_dir,
                                                'new_pipeline.config')
        config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                               new_pipeline_config_path)
        config_kwarg_overrides = _get_config_kwarg_overrides()

        train_steps = 2
        strategy = tf2.distribute.MirroredStrategy(['/cpu:0', '/cpu:1'])
        with strategy.scope():
            model_lib_v2.train_loop(new_pipeline_config_path,
                                    model_dir=model_dir,
                                    train_steps=train_steps,
                                    checkpoint_every_n=1,
                                    **config_kwarg_overrides)

        model_lib_v2.eval_continuously(new_pipeline_config_path,
                                       model_dir=model_dir,
                                       checkpoint_dir=model_dir,
                                       train_steps=train_steps,
                                       wait_interval=1,
                                       timeout=10,
                                       **config_kwarg_overrides)
Example 6
    def test_export_metrics_json_serializable(self):
        """Tests that Estimator and input function are constructed correctly."""

        strategy = tf2.distribute.OneDeviceStrategy(device='/cpu:0')

        def export(data, _):
            json.dumps(data)

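        # patch.dict temporarily swaps the real input-builder utilities for
        # test fakes for the duration of the block.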
        with mock.patch.dict(exporter_lib_v2.INPUT_BUILDER_UTIL_MAP,
                             FAKE_BUILDER_MAP):
            with strategy.scope():
                model_dir = tf.test.get_temp_dir()
                new_pipeline_config_path = os.path.join(
                    model_dir, 'new_pipeline.config')
                pipeline_config_path = get_pipeline_config_path(
                    MODEL_NAME_FOR_TEST)
                config_util.clear_fine_tune_checkpoint(
                    pipeline_config_path, new_pipeline_config_path)
                train_steps = 2
                with strategy.scope():
                    model_lib_v2.train_loop(
                        new_pipeline_config_path,
                        model_dir=model_dir,
                        train_steps=train_steps,
                        checkpoint_every_n=100,
                        performance_summary_exporter=export,
                        **_get_config_kwarg_overrides())
Example 7
    def test_checkpoint_max_to_keep(self):
        """Test that only the most recent checkpoints are kept."""

        strategy = tf2.distribute.OneDeviceStrategy(device='/cpu:0')
        with mock.patch.dict(exporter_lib_v2.INPUT_BUILDER_UTIL_MAP,
                             FAKE_BUILDER_MAP):

            model_dir = tempfile.mkdtemp(dir=self.get_temp_dir())
            pipeline_config_path = get_pipeline_config_path(
                MODEL_NAME_FOR_TEST)
            new_pipeline_config_path = os.path.join(model_dir,
                                                    'new_pipeline.config')
            config_util.clear_fine_tune_checkpoint(pipeline_config_path,
                                                   new_pipeline_config_path)
            config_kwarg_overrides = _get_config_kwarg_overrides()

            with strategy.scope():
                model_lib_v2.train_loop(new_pipeline_config_path,
                                        model_dir=model_dir,
                                        train_steps=20,
                                        checkpoint_every_n=2,
                                        checkpoint_max_to_keep=3,
                                        **config_kwarg_overrides)
            ckpt_files = tf.io.gfile.glob(
                os.path.join(model_dir, 'ckpt-*.index'))
            self.assertEqual(len(ckpt_files), 3,
                             '{} not of length 3.'.format(ckpt_files))
def main(unused_argv):

    # Set the GPU (device:GPU:0).
    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices('GPU')))
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        # Restrict TensorFlow to only use the first GPU
        try:
            tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
            tf.config.experimental.set_memory_growth(gpus[0], True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus),
                  "Logical GPU")
        except RuntimeError as e:
            # Visible devices must be set before GPUs have been initialized
            print(e)

    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')
    tf.config.set_soft_device_placement(True)

    if FLAGS.checkpoint_dir:
        model_lib_v2.eval_continuously(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
            sample_1_of_n_eval_on_train_examples=(
                FLAGS.sample_1_of_n_eval_on_train_examples),
            checkpoint_dir=FLAGS.checkpoint_dir,
            wait_interval=300,
            timeout=FLAGS.eval_timeout)
    else:
        if FLAGS.use_tpu:
            # TPU is automatically inferred if tpu_name is None and
            # we are running under cloud ai-platform.
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name)
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            strategy = tf.distribute.experimental.TPUStrategy(resolver)
        # elif FLAGS.num_workers > 1:
        #   strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
        # else:
        #   strategy = tf.compat.v2.distribute.MirroredStrategy()

        # with strategy.scope():
        model_lib_v2.train_loop(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            use_tpu=FLAGS.use_tpu,
            checkpoint_every_n=FLAGS.checkpoint_every_n,
            record_summaries=FLAGS.record_summaries)
Example 9
def main(unused_argv):
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')
    tf.config.set_soft_device_placement(True)

    if FLAGS.checkpoint_dir:
        if FLAGS.eval_all_checkpoints:
            model_lib_v2.eval_all_checkpoints(
                pipeline_config_path=FLAGS.pipeline_config_path,
                model_dir=FLAGS.model_dir,
                train_steps=FLAGS.num_train_steps,
                sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
                sample_1_of_n_eval_on_train_examples=(
                    FLAGS.sample_1_of_n_eval_on_train_examples),
                checkpoint_dir=FLAGS.checkpoint_dir)
        else:
            model_lib_v2.eval_continuously(
                pipeline_config_path=FLAGS.pipeline_config_path,
                model_dir=FLAGS.model_dir,
                train_steps=FLAGS.num_train_steps,
                sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
                sample_1_of_n_eval_on_train_examples=(
                    FLAGS.sample_1_of_n_eval_on_train_examples),
                checkpoint_dir=FLAGS.checkpoint_dir,
                wait_interval=300,
                timeout=FLAGS.eval_timeout)
    else:
        if FLAGS.use_tpu:
            # TPU is automatically inferred if tpu_name is None and
            # we are running under cloud ai-platform.
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name)
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            strategy = tf.distribute.experimental.TPUStrategy(resolver)
        elif FLAGS.num_workers > 1:
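            # MultiWorkerMirroredStrategy reads the cluster layout from the
            # TF_CONFIG environment variable.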
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
        else:
            strategy = tf.compat.v2.distribute.MirroredStrategy()

        with strategy.scope():
            model_lib_v2.train_loop(
                pipeline_config_path=FLAGS.pipeline_config_path,
                model_dir=FLAGS.model_dir,
                train_steps=FLAGS.num_train_steps,
                use_tpu=FLAGS.use_tpu,
                checkpoint_every_n=FLAGS.checkpoint_every_n,
                record_summaries=FLAGS.record_summaries)
Example 10
    def train(self, steps_per_epoch, checkpoint_every_n_epochs=10):
        """
            Trains the model.
        Args:
            steps_per_epoch: Number of steps that are to be trained for one epoch
            checkpoint_every_n_epochs: Epoch interval in which to save a checkpoint while training
        """
        checkpoints_every_n_steps = steps_per_epoch * checkpoint_every_n_epochs

        strategy = tf.compat.v2.distribute.MirroredStrategy()
        with strategy.scope():
            model_lib_v2.train_loop(
                pipeline_config_path=self.config_path,
                model_dir=self.checkpoint_path,
                checkpoint_every_n=checkpoints_every_n_steps,
                checkpoint_max_to_keep=150,
                record_summaries=True)
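
A hypothetical call site for this method, assuming an enclosing class (the name DetectionTrainer is invented here) that exposes the config_path and checkpoint_path attributes referenced above:

trainer = DetectionTrainer(config_path='pipeline.config',  # hypothetical wrapper
                           checkpoint_path='training/')
# 500 steps per epoch with the default interval saves a checkpoint
# every 5000 steps.
trainer.train(steps_per_epoch=500, checkpoint_every_n_epochs=10)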
def main(_):
    with open('system_dict.json') as json_file:
        args = json.load(json_file)

    tf.config.set_soft_device_placement(True)

    if args["checkpoint_dir"]:
        model_lib_v2.eval_continuously(
            pipeline_config_path=args["pipeline_config_path"],
            model_dir=args["model_dir"],
            train_steps=args["num_train_steps"],
            sample_1_of_n_eval_examples=args["sample_1_of_n_eval_examples"],
            sample_1_of_n_eval_on_train_examples=(
                args["sample_1_of_n_eval_on_train_examples"]),
            checkpoint_dir=args["checkpoint_dir"],
            wait_interval=300,
            timeout=args["eval_timeout"])

    else:
        if args["use_tpu"]:
            # TPU is automatically inferred if tpu_name is None and
            # we are running under cloud ai-platform.
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                args["tpu_name"])
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            strategy = tf.distribute.experimental.TPUStrategy(resolver)
        elif args["num_workers"] > 1:
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
        else:
            strategy = tf.compat.v2.distribute.MirroredStrategy()

        with strategy.scope():
            model_lib_v2.train_loop(
                pipeline_config_path=args["pipeline_config_path"],
                model_dir=args["model_dir"],
                train_steps=args["num_train_steps"],
                use_tpu=args["use_tpu"],
                checkpoint_every_n=args["checkpoint_every_n"],
                record_summaries=args["record_summaries"])
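For context, a minimal sketch of the system_dict.json this script consumes. The keys mirror the args[...] lookups above; the values are illustrative only:

import json

# Keys mirror the args[...] lookups above; values are illustrative.
system_dict = {
    "pipeline_config_path": "pipeline.config",
    "model_dir": "training/",
    "num_train_steps": 10000,
    "sample_1_of_n_eval_examples": 1,
    "sample_1_of_n_eval_on_train_examples": 5,
    "checkpoint_dir": "",  # falsy -> take the training branch
    "eval_timeout": 3600,
    "use_tpu": False,
    "tpu_name": None,  # only read when use_tpu is true
    "num_workers": 1,
    "checkpoint_every_n": 1000,
    "record_summaries": True,
}

with open('system_dict.json', 'w') as json_file:
    json.dump(system_dict, json_file, indent=2)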
Example 12
    def run(self):
        self._validate_pipeline_config()

        if self._memory_growth:
            gpus = tf.config.experimental.list_physical_devices("GPU")
            if gpus:
                try:
                    # Currently, memory growth needs to be the same across GPUs
                    for gpu in gpus:
                        tf.config.experimental.set_memory_growth(gpu, True)
                except RuntimeError as e:
                    # Memory growth must be set before GPUs have been initialized
                    print(e)

        print("Running train loop...")
        strategy = tf.distribute.MirroredStrategy()
        with strategy.scope():
            model_lib_v2.train_loop(
                pipeline_config_path=self._pipeline_config_path,
                model_dir=self._training_loop_path,
                checkpoint_max_to_keep=None  # keep all checkpoints
            )
    testboundingbox = decoded_tensors['groundtruth_boxes'].numpy()
    # show_oneimage_category(testimage, testlabel, testboundingbox, IMAGE_SIZE)
    # cv2.imwrite('result.jpg', resultimage)

    cwd = os.getcwd()

    # Print the current working directory
    print("Current working directory: {0}".format(cwd))

    # Start the training, ref: https://github.com/tensorflow/models/blob/master/research/object_detection/model_main_tf2.py
    pipeline_config_path = '/Developer/MyRepo/WaymoObjectDetection/2DObject/tfobjectdetection/tf_ssdresnet50_1024_pipeline_P100.config'
    model_dir = '/Developer/MyRepo/mymodels/tf_ssdresnet50_output'
    
    num_train_steps = 150000
    steps_per_sec_list = []
    checkpoint_every_n = 1000

    tf.config.set_soft_device_placement(True)
    strategy = tf.compat.v2.distribute.MirroredStrategy()

    with strategy.scope():
        # train_loop is defined in: https://github.com/tensorflow/models/blob/master/research/object_detection/model_lib_v2.py
        model_lib_v2.train_loop(
            pipeline_config_path=pipeline_config_path,
            model_dir=model_dir,
            train_steps=num_train_steps,
            use_tpu=False,
            checkpoint_every_n=checkpoint_every_n,
            record_summaries=True)
def main(unused_argv):
    if FLAGS.checkpoint_dir:
        print("\n-------Running evaluation")
    else:
        print("\n-------Running traingin!")
    flags.mark_flag_as_required('model_dir')
    flags.mark_flag_as_required('pipeline_config_path')
    tf.config.set_soft_device_placement(True)

    print(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )

    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        try:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus),
                  "Logical GPUs")
        except RuntimeError as e:
            # Memory growth must be set before GPUs have been initialized
            print(e)

    print(
        "++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )

    if FLAGS.checkpoint_dir:
        model_lib_v2.eval_continuously(
            pipeline_config_path=FLAGS.pipeline_config_path,
            model_dir=FLAGS.model_dir,
            train_steps=FLAGS.num_train_steps,
            sample_1_of_n_eval_examples=FLAGS.sample_1_of_n_eval_examples,
            sample_1_of_n_eval_on_train_examples=(
                FLAGS.sample_1_of_n_eval_on_train_examples),
            checkpoint_dir=FLAGS.checkpoint_dir,
            wait_interval=300,
            timeout=FLAGS.eval_timeout)
    else:
        if FLAGS.use_tpu:
            # TPU is automatically inferred if tpu_name is None and
            # we are running under cloud ai-platform.
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                FLAGS.tpu_name)
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            strategy = tf.distribute.experimental.TPUStrategy(resolver)
        elif FLAGS.num_workers > 1:
            strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy()
        else:
            strategy = tf.compat.v2.distribute.MirroredStrategy()

        with strategy.scope():
            model_lib_v2.train_loop(
                pipeline_config_path=FLAGS.pipeline_config_path,
                model_dir=FLAGS.model_dir,
                save_final_config=True,
                train_steps=FLAGS.num_train_steps,
                use_tpu=FLAGS.use_tpu,
                checkpoint_every_n=FLAGS.checkpoint_every_n,
                record_summaries=FLAGS.record_summaries)