Example #1
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
    # The env variable is on a deprecation path; its default is set to off.
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'
    print('hparams:', hparams)

    # Session configuration.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=log_device_placement,
        intra_op_parallelism_threads=num_intra_threads,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    config = cifar10_utils.RunConfig(
        session_config=sess_config,
        model_dir=job_dir,
        save_checkpoints_steps=hparams["eval_steps"])
    tf.contrib.learn.learn_runner.run(
        get_experiment_fn(data_dir, num_gpus, variable_strategy,
                          use_distortion_for_training),
        run_config=config,
        schedule="train_and_evaluate",
        hparams=tf.contrib.training.HParams(is_chief=config.is_chief,
                                            **hparams))
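
A caller typically builds these arguments with a command-line flag parser; any flags beyond the named parameters end up in **hparams (Example #1 reads hparams["eval_steps"], for instance). A minimal sketch of such a caller, assuming argparse-style flags whose names and defaults are illustrative:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--job-dir', type=str, default='/tmp/cifar10')
parser.add_argument('--data-dir', type=str, required=True)
parser.add_argument('--num-gpus', type=int, default=1)
parser.add_argument('--variable-strategy', type=str, default='CPU')
parser.add_argument('--use-distortion-for-training', action='store_true')
parser.add_argument('--log-device-placement', action='store_true')
parser.add_argument('--num-intra-threads', type=int, default=0)
# Flags not named in main()'s signature are collected into **hparams.
parser.add_argument('--train-steps', type=int, default=80000)
parser.add_argument('--eval-steps', type=int, default=100)
parser.add_argument('--train-batch-size', type=int, default=128)
parser.add_argument('--eval-batch-size', type=int, default=100)

args = parser.parse_args()
main(**vars(args))
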
Example #2
def main(job_dir, data_dir, num_gpus, variable_strategy, data_format,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on a deprecation path; its default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # channels_first (NCHW) is normally optimal on GPU and channels_last (NHWC)
  # on CPU. The exception is CPU with Intel MKL, which is optimal with
  # channels_first.
  if not data_format:
    if num_gpus == 0:
      data_format = 'channels_last'
    else:
      data_format = 'channels_first'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy, data_format,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(**hparams))
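
The data_format comment above is about memory layout: channels_last (NHWC) keeps the channel dimension last, while channels_first (NCHW) places it right after the batch dimension. A minimal sketch of converting a batch between the two layouts (shapes are illustrative):

import tensorflow as tf

# A CIFAR-10-sized batch in channels_last (NHWC) layout: [batch, height, width, channels].
images_nhwc = tf.zeros([128, 32, 32, 3])

# Reorder to channels_first (NCHW), i.e. [batch, channels, height, width],
# the layout GPU convolution kernels generally prefer.
images_nchw = tf.transpose(images_nhwc, [0, 3, 1, 2])
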
Example #3
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
    # The env variable is on a deprecation path; its default is set to off.
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Session configuration.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=log_device_placement,
        intra_op_parallelism_threads=num_intra_threads,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    sess_config.gpu_options.allow_growth = True
    # run_config = tf.estimator.RunConfig().replace(session_config=session_config)

    config = cifar10_utils.RunConfig(session_config=sess_config,
                                     model_dir=job_dir)
    tf.contrib.learn.learn_runner.run(
        get_experiment_fn(data_dir, num_gpus, variable_strategy,
                          use_distortion_for_training),
        run_config=config,
        hparams=tf.contrib.training.HParams(is_chief=config.is_chief,
                                            **hparams))
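
Example #3 differs from Example #1 mainly by setting sess_config.gpu_options.allow_growth = True, which makes TensorFlow claim GPU memory on demand instead of reserving most of it up front. The commented-out line hints at the stock Estimator RunConfig as an alternative to cifar10_utils.RunConfig; a minimal sketch of that route, reusing the sess_config and job_dir from the example:

import tensorflow as tf

# Stock tf.estimator.RunConfig carrying the same session options (TF 1.x API).
run_config = tf.estimator.RunConfig(
    model_dir=job_dir,
    session_config=sess_config)
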
Example #4
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
    # The env variable is on a deprecation path; its default is set to off.
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Session configuration.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=log_device_placement,
        intra_op_parallelism_threads=num_intra_threads,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    config = cifar10_utils.RunConfig(session_config=sess_config,
                                     model_dir=job_dir)
    run_config = config
    '''
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(
          is_chief=config.is_chief,
          **hparams))
  '''
    hparams = tf.contrib.training.HParams(is_chief=config.is_chief, **hparams)
    train_input_fn = functools.partial(
        input_fn,
        data_dir,
        subset='train',
        num_shards=num_gpus,
        batch_size=hparams.train_batch_size,
        use_distortion_for_training=use_distortion_for_training)
    eval_input_fn = functools.partial(input_fn,
                                      data_dir,
                                      subset='eval',
                                      batch_size=hparams.eval_batch_size,
                                      num_shards=num_gpus)
    num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
    if num_eval_examples % hparams.eval_batch_size != 0:
        raise ValueError(
            'validation set size must be multiple of eval_batch_size')

    train_steps = hparams.train_steps
    eval_steps = num_eval_examples // hparams.eval_batch_size

    # Comment the below line when training locally

    classifier = tf.estimator.Estimator(model_fn=get_model_fn(
        num_gpus, variable_strategy, run_config.num_worker_replicas or 1),
                                        config=config,
                                        params=hparams)
    train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn,
                                        max_steps=train_steps)
    #exporter=tf.estimator.LatestExporter('exporter', serving_input_fn)
    eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn, steps=eval_steps)
    tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)
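
The divisibility check above is easy to trip over. A quick worked example, assuming the standard 10,000-image CIFAR-10 eval split:

num_eval_examples = 10000                          # size of the CIFAR-10 eval split
eval_batch_size = 100
assert num_eval_examples % eval_batch_size == 0    # passes
eval_steps = num_eval_examples // eval_batch_size  # 100 evaluation steps
# eval_batch_size = 128 would raise the ValueError, since 10000 % 128 == 16.
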
Example #5
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
    # The env variable is on a deprecation path; its default is set to off.
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Session configuration.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=log_device_placement,
        intra_op_parallelism_threads=num_intra_threads,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    # override checkpoint saver to not do anything
    #  from tensorflow.python.training import training
    #  def dummy(*args, **kwargs): return
    #  training.CheckpointSaverHook._save = dummy

    np.random.seed(1)
    tf.set_random_seed(1)

    # change event flush seconds to 1
    from tensorflow.python.summary.writer.writer import FileWriter
    old_init = FileWriter.__init__

    def newinit(*args, **kwargs):
        print("Overriding FileWriter flush_secs to 1")
        kwargs['flush_secs'] = 1
        old_init(*args, **kwargs)

    FileWriter.__init__ = newinit

    config = cifar10_utils.RunConfig(session_config=sess_config,
                                     model_dir=job_dir,
                                     tf_random_seed=1)
    tf.contrib.learn.learn_runner.run(
        get_experiment_fn(data_dir, num_gpus, variable_strategy,
                          use_distortion_for_training),
        run_config=config,
        hparams=tf.contrib.training.HParams(is_chief=config.is_chief,
                                            **hparams))

    print(vals)
Example #6
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
    # The env variable is on a deprecation path; its default is set to off.
    os.environ['TF_SYNC_ON_FINISH'] = '0'
    os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

    # Session configuration.
    sess_config = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=log_device_placement,
        intra_op_parallelism_threads=num_intra_threads,
        gpu_options=tf.GPUOptions(force_gpu_compatible=True))

    # override default 100 steps. 122 e/sec = 4 steps/second
    config = cifar10_utils.RunConfig(session_config=sess_config,
                                     model_dir=job_dir,
                                     save_summary_steps=10)

    # change event flush seconds to hparams['event_flush_secs']
    from tensorflow.python.summary.writer.writer import FileWriter
    old_init = FileWriter.__init__

    def newinit(*args, **kwargs):
        new_flush_secs = hparams['event_flush_secs']
        print("Overriding FileWriter flush_secs to " + str(new_flush_secs))
        kwargs['flush_secs'] = new_flush_secs
        #    kwargs['flush_secs']=1
        old_init(*args, **kwargs)

    FileWriter.__init__ = newinit

    tf.contrib.learn.learn_runner.run(
        get_experiment_fn(data_dir, num_gpus, variable_strategy,
                          use_distortion_for_training),
        run_config=config,
        hparams=tf.contrib.training.HParams(is_chief=config.is_chief,
                                            **hparams))
Example #7
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on a deprecation path; its default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)

  hparams = tf.contrib.training.HParams(
              is_chief=config.is_chief,
              **hparams)

  # # Edit this for steps you want to log
  # for i in range(41, 51):
  #   hparams.train_steps = i * 2000
  #   tf.contrib.learn.learn_runner.run(
  #       get_experiment_fn(data_dir, num_gpus, variable_strategy,
  #                         use_distortion_for_training),
  #       run_config=config,
  #       hparams=hparams)

  def evaluate_with_censor():
    """Evaluate model with censored image

    Goal: Examine one image only and evaluate it with different
    sections censored (i.e., blacked out). Create a heat-map of
    most important/distinguishing pixels for analysis.
    """

    # Create estimator.
    eval_input_fn = functools.partial(
        input_fn,
        data_dir,
        subset='eval',
        batch_size=hparams.eval_batch_size,
        num_shards=num_gpus)

    classifier = tf.estimator.Estimator(
        model_fn=get_model_fn(num_gpus, variable_strategy,
                              config.num_worker_replicas or 1),
        config=config,
        params=hparams)

    num_eval_examples = cifar10.Cifar10DataSet.num_examples_per_epoch('eval')
    if num_eval_examples % hparams.eval_batch_size != 0:
      raise ValueError(
          'validation set size must be multiple of eval_batch_size')

    eval_steps = num_eval_examples // hparams.eval_batch_size

    experiment = tf.contrib.learn.Experiment(
        classifier,
        train_input_fn=None,
        eval_input_fn=eval_input_fn,
        train_steps=1,
        eval_steps=eval_steps)

    tf.contrib.learn.learn_runner.run(
        lambda x, y: experiment,
        run_config=config,
        hparams=hparams)

  evaluate_with_censor()
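
The evaluate_with_censor docstring describes an occlusion study: black out one region of the image at a time and see how much the prediction drops. A minimal NumPy sketch of that idea, assuming a hypothetical predict_prob(image) that returns the model's confidence for the true class:

import numpy as np

def occlusion_heatmap(image, predict_prob, patch=4):
    # Slide a black square over the image and record the confidence drop per patch.
    height, width, _ = image.shape
    baseline = predict_prob(image)
    heatmap = np.zeros((height // patch, width // patch))
    for i in range(0, height, patch):
        for j in range(0, width, patch):
            censored = image.copy()
            censored[i:i + patch, j:j + patch, :] = 0.0   # censor one patch
            # Large drops mark the pixels the model relies on most.
            heatmap[i // patch, j // patch] = baseline - predict_prob(censored)
    return heatmap
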
Example #8
    def _setup(self, config):

        # The env variable is on a deprecation path; its default is set to off.
        os.environ["TF_SYNC_ON_FINISH"] = "0"
        os.environ["TF_ENABLE_WINOGRAD_NONFUSED"] = "1"

        # Session configuration.
        sess_config = tf.ConfigProto(
            allow_soft_placement=True,
            log_device_placement=args.log_device_placement,
            intra_op_parallelism_threads=args.num_intra_threads,
            gpu_options=tf.GPUOptions(force_gpu_compatible=True,
                                      allow_growth=True),
        )

        # Convert to actual hyperparameter values here using the grid (discrete) input
        hparams["train_batch_size"] = 2**(int(config["batch_size"]) + 5)
        hparams["momentum"] = 0.4 + (0.55 * int(config["momentum"]) /
                                     args.precision)
        hparams["weight_decay"] = 1e-4 + (1e-4 * int(config["weight_decay"]) /
                                          args.precision)
        hparams["batch_norm_decay"] = 0.8 + (
            0.199 * int(config["batch_norm_decay"]) / args.precision)
        hparams["batch_norm_epsilon"] = 1e-5 + (
            0.00099 * int(config["batch_norm_epsilon"]) / args.precision)
        hparams["learning_rate"] = 0.01 + (0.1 * int(config["learning_rate"]) /
                                           args.precision)
        opt = int(config["optimizer"])
        if opt == 0:
            hparams["optimizer"] = "momentum"
        elif opt == 1:
            hparams["optimizer"] = "adam"
        elif opt == 2:
            hparams["optimizer"] = "adagrad"
        elif opt == 3:
            hparams["optimizer"] = "adadelta"
        elif opt == 4:
            hparams["optimizer"] = "sgd"
        else:
            hparams["optimizer"] = "rmsprop"

        # Calculate the number of steps in one epoch
        self.train_steps = cifar10.Cifar10DataSet.num_examples_per_epoch(
            "train") // (hparams["train_batch_size"])

        # TODO: Fix checkpoint dir
        run_config = cifar10_utils.RunConfig(
            session_config=sess_config,
            model_dir=None,
            save_checkpoints_secs=None,
            save_checkpoints_steps=self.train_steps,
            keep_checkpoint_max=None,
            keep_checkpoint_every_n_hours=None,
        )
        self.run_config = run_config

        self.train_input_fn, self.eval_input_fn, self.estimator = build_estimator(
            data_dir=args.data_dir,
            num_gpus=args.num_gpus,
            variable_strategy=args.variable_strategy,
            use_distortion_for_training=args.use_distortion_for_training,
            run_config=run_config,
            hparams=tf.contrib.training.HParams(is_chief=run_config.is_chief,
                                                **hparams),
        )

        self.logger = logging.getLogger("metrics")
        self.logger.setLevel(logging.INFO)
        file_handler = logging.FileHandler(args.log_path)
        self.logger.addHandler(file_handler)

        self.logger.info(f"[CONFIG] ID={self._experiment_id} config={hparams}")
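
The grid-to-value mapping in _setup is easier to read with concrete numbers. For the batch-size formula 2 ** (index + 5), the first few grid indices expand as follows:

for index in range(4):
    print(index, 2 ** (index + 5))   # 0 -> 32, 1 -> 64, 2 -> 128, 3 -> 256

Similarly, momentum sweeps linearly from 0.4 at index 0 up to 0.95 when the index equals args.precision.
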
Example #9
def main(job_dir, data_dir, num_gpus, variable_strategy,
         use_distortion_for_training, log_device_placement, num_intra_threads,
         **hparams):
  # The env variable is on a deprecation path; its default is set to off.
  os.environ['TF_SYNC_ON_FINISH'] = '0'
  os.environ['TF_ENABLE_WINOGRAD_NONFUSED'] = '1'

  # Session configuration.
  sess_config = tf.ConfigProto(
      allow_soft_placement=True,
      log_device_placement=log_device_placement,
      intra_op_parallelism_threads=num_intra_threads,
      gpu_options=tf.GPUOptions(force_gpu_compatible=True))

  config = cifar10_utils.RunConfig(
      session_config=sess_config, model_dir=job_dir)
  tf.contrib.learn.learn_runner.run(
      get_experiment_fn(data_dir, num_gpus, variable_strategy,
                        use_distortion_for_training),
      run_config=config,
      hparams=tf.contrib.training.HParams(**hparams))
  if FLAGS.num_gpus < 0:
    raise ValueError(
        'Invalid GPU count: "num_gpus" must be 0 or a positive integer.')
  if FLAGS.num_gpus == 0 and not FLAGS.is_cpu_ps:
    raise ValueError(
        'No GPU available for use, must use CPU as parameter server.')
  if (FLAGS.num_layers - 2) % 6 != 0:
    raise ValueError('Invalid num_layers parameter.')
  if FLAGS.num_gpus != 0 and FLAGS.train_batch_size % FLAGS.num_gpus != 0: