Example #1
def _call_exception_handlers(exception):
  """Calls any installed exception handlers."""
  for handler in EXCEPTION_HANDLERS:
    try:
      if handler.wants(exception):
        handler.handle(exception)
    except:  # pylint: disable=bare-except
      try:
        # We don't want to stop for exceptions in the exception handlers but
        # we shouldn't hide them either.
        logging.error(traceback.format_exc())
      except:  # pylint: disable=bare-except
        # In case even the logging statement fails, ignore.
        pass
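A minimal usage sketch for the handler protocol above, assuming EXCEPTION_HANDLERS is a module-level list as in the snippet; the LogToStderrHandler class is hypothetical:

class LogToStderrHandler:
  """Hypothetical handler implementing the wants()/handle() protocol."""

  def wants(self, exception):
    # Only claim ValueErrors in this sketch.
    return isinstance(exception, ValueError)

  def handle(self, exception):
    print('Handled: %r' % exception)

EXCEPTION_HANDLERS.append(LogToStderrHandler())
try:
  raise ValueError('boom')
except ValueError as e:
  _call_exception_handlers(e)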
Example #2
def main(_):
  """Run and handle retryable errors."""
  proto_utils.uses_fast_cpp_protos_or_die()

  logging_level.set_from_flag()
  for _ in range(FLAGS.num_retries + 1):
    try:
      parse_and_run()
      return
    except tf.errors.UnavailableError as e:
      # An UnavailableError indicates a gRPC error, typically this is
      # retryable.
      logging.error('Caught UnavailableError %s; will retry.', e)
    except tf.errors.InternalError as e:
      # Retry on an InternalError.
      logging.error('Caught InternalError %s; will retry.', e)
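The same retry shape can be factored into a helper; a sketch under the assumption that the caller supplies the retryable exception types (run_with_retries is not part of the original code). Unlike main() above, this variant raises once the retries are exhausted instead of returning silently:

def run_with_retries(fn, retryable_exceptions, num_retries):
  """Calls fn(), retrying on the given exception types."""
  for attempt in range(num_retries + 1):
    try:
      return fn()
    except retryable_exceptions as e:
      logging.error('Attempt %d failed with %s; will retry.', attempt + 1, e)
  raise RuntimeError('All %d attempts failed.' % (num_retries + 1))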
Example #3
def main(_):
  proto_utils.uses_fast_cpp_protos_or_die()

  if not FLAGS.dataset_config_pbtxt:
    logging.error('Need to specify --dataset_config_pbtxt')
  logging_level.set_from_flag()
  eval_loop(
      master=FLAGS.master,
      dataset_config_pbtxt=FLAGS.dataset_config_pbtxt,
      checkpoint_dir=FLAGS.checkpoint_dir,
      model_name=FLAGS.model_name,
      batch_size=FLAGS.batch_size,
      moving_average_decay=FLAGS.moving_average_decay,
      max_examples=FLAGS.max_examples,
      eval_dir=FLAGS.eval_dir,
      max_evaluations=FLAGS.max_evaluations,
  )
Example #4
def evaluate_tfhub_module(module_spec, eval_tasks, use_tpu, num_averaging_runs,
                          update_bn_accumulators=True, use_tags=True):
  """Evaluate model at given checkpoint_path.

  Args:
    module_spec: string, path to a TF hub module.
    eval_tasks: List of objects that inherit from EvalTask.
    use_tpu: Whether to use TPUs.
    num_averaging_runs: Determines how many times each metric is computed.

  Returns:
    Dict[Text, float] with all the computed results.

  Raises:
    NanFoundError: If generator output has any NaNs.
  """
  # Make sure that the same latent variables are used for each evaluation.
  np.random.seed(42)
  dataset = datasets.get_dataset()
  num_test_examples = dataset.eval_test_samples

  batch_size = 64
  num_batches = int(np.ceil(num_test_examples / batch_size))

  # Load and update the generator.
  fake_dsets = []
  with tf.Graph().as_default():
    tf.set_random_seed(42)
    with tf.Session() as sess:
      if use_tpu:
        sess.run(tf.contrib.tpu.initialize_system())
      def sample_from_generator():
        """Create graph for sampling images."""
        generator = hub.Module(
            module_spec,
            name="gen_module",
            tags={"gen", "bs{}".format(batch_size)} if use_tags else None)
        logging.info("Generator inputs: %s", generator.get_input_info_dict())
        z_dim = generator.get_input_info_dict()["z"].get_shape()[1].value
        z = z_generator(shape=[batch_size, z_dim])
        if "labels" in generator.get_input_info_dict():
          # Conditional GAN.
          assert dataset.num_classes
          labels = tf.random.uniform(
              [batch_size], maxval=dataset.num_classes, dtype=tf.int32)
          inputs = dict(z=z, labels=labels)
        else:
          # Unconditional GAN.
          assert "labels" not in generator.get_input_info_dict()
          inputs = dict(z=z)
        return generator(inputs=inputs, as_dict=True)["generated"]
      if use_tpu:
        generated = tf.contrib.tpu.rewrite(sample_from_generator)
      else:
        generated = sample_from_generator()

      tf.global_variables_initializer().run()

      if update_bn_accumulators and _update_bn_accumulators(
          sess, generated, num_accu_examples=204800):
        saver = tf.train.Saver()
        save_path = os.path.join(module_spec, "model-with-accu.ckpt")
        checkpoint_path = saver.save(
            sess,
            save_path=save_path)
        logging.info("Exported generator with accumulated batch stats to "
                     "%s.", checkpoint_path)
      if not eval_tasks:
        logging.error("Task list is empty, returning.")
        return
      for i in range(num_averaging_runs):
        logging.info("Generating fake data set %d/%d.", i+1, num_averaging_runs)
        fake_dset = eval_utils.EvalDataSample(
            eval_utils.sample_fake_dataset(sess, generated, num_batches, batch_size))
        fake_dsets.append(fake_dset)
        logging.info("Computing inception features for generated data %d/%d.",
                     i+1, num_averaging_runs)
        activations, logits = eval_utils.inception_transform_np(
            fake_dset.images, batch_size)
        fake_dset.set_inception_features(
            activations=activations, logits=logits)
        fake_dset.set_num_examples(num_test_examples)
        # Free up some memory by releasing additional fake data samples.
        # For ImageNet128 50k images are ~9 GiB. This will blow up metrics
        # (such as fractal dimension) if num_averaging_runs > 1.
        fake_dset.discard_images()

  real_dset = eval_utils.EvalDataSample(
      eval_utils.get_real_images(
          dataset=dataset, num_examples=num_test_examples))
  logging.info("Getting Inception features for real images.")
  real_dset.activations, _ = eval_utils.inception_transform_np(
      real_dset.images, batch_size)
  real_dset.set_num_examples(num_test_examples)

  # Run all the tasks and update the result dictionary with the task statistics.
  result_dict = {}
  for task in eval_tasks:
    task_results_dicts = [
        task.run_after_session(fake_dset, real_dset)
        for fake_dset in fake_dsets
    ]
    # Average the score for each key.
    result_statistics = {}
    for key in task_results_dicts[0].keys():
      scores_for_key = np.array([d[key] for d in task_results_dicts])
      mean, std = np.mean(scores_for_key), np.std(scores_for_key)
      scores_as_string = "_".join([str(x) for x in scores_for_key])
      result_statistics[key + "_mean"] = mean
      result_statistics[key + "_std"] = std
      result_statistics[key + "_list"] = scores_as_string
    logging.info("Computed results for task %s: %s", task, result_statistics)

    result_dict.update(result_statistics)
  return result_dict
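A call sketch for this evaluator, with a trivial task implementing the run_after_session() interface the loop relies on; NoOpTask and the module path are hypothetical:

class NoOpTask:
  """Hypothetical EvalTask stand-in returning a constant score."""

  def run_after_session(self, fake_dset, real_dset):
    return {"noop_score": 0.0}

results = evaluate_tfhub_module(
    module_spec="/path/to/hub_module",
    eval_tasks=[NoOpTask()],
    use_tpu=False,
    num_averaging_runs=1)
# results contains noop_score_mean, noop_score_std and noop_score_list.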
Example #5
# Copyright 2017 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Loads icp op."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import logging
import tensorflow as tf

try:
  icp_op_module = tf.load_op_library('./ops/icp_op.so')
  icp = icp_op_module.icp
except Exception:  # pylint: disable=broad-except
  logging.error('Could not load object file for ICP op.')
  icp = None
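Since icp is left as None when the shared object fails to load, call sites need a guard; a minimal sketch (run_icp and its arguments are hypothetical, the real op signature may differ):

def run_icp(source_points, target_points):
  """Runs the ICP op, failing loudly if the library was not loaded."""
  if icp is None:
    raise NotImplementedError(
        'icp_op.so could not be loaded; rebuild the op library.')
  return icp(source_points, target_points)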
Example #6
def get_dataset_feature_statistics(builder, split):
    """Calculate statistics for the specified split."""
    statistics = statistics_pb2.DatasetFeatureStatistics()

    # Fill in the schema to the best of our abilities.
    schema = schema_pb2.Schema()

    dataset = builder.as_dataset(split=split)

    # Just computing the number of examples for now.
    statistics.num_examples = 0

    # Feature dictionaries.
    feature_to_num_examples = collections.defaultdict(int)
    feature_to_min = {}
    feature_to_max = {}

    np_dataset = dataset_utils.dataset_as_numpy(dataset)
    for example in tqdm.tqdm(np_dataset, unit=" examples"):
        statistics.num_examples += 1

        assert isinstance(example, dict)

        feature_names = sorted(example.keys())
        for feature_name in feature_names:

            # Update the number of examples this feature appears in.
            feature_to_num_examples[feature_name] += 1

            feature_np = example[feature_name]

            # For compatibility between graph and eager mode, we may get plain
            # Python scalars here rather than values wrapped in numpy ndarrays.

            feature_dtype = type(feature_np)

            if isinstance(feature_np, np.ndarray):
                feature_dtype = feature_np.dtype.type

            feature_min, feature_max = None, None
            is_numeric = (np.issubdtype(feature_dtype, np.number)
                          or feature_dtype == np.bool_)
            if is_numeric:
                feature_min = np.min(feature_np)
                feature_max = np.max(feature_np)

            # TODO(afrozm): What if shapes don't match? Populate ValueCount? Add
            # logic for that.

            # Set or update the min, max.
            if is_numeric:
                if ((feature_name not in feature_to_min)
                        or (feature_to_min[feature_name] > feature_min)):
                    feature_to_min[feature_name] = feature_min

                if ((feature_name not in feature_to_max)
                        or (feature_to_max[feature_name] < feature_max)):
                    feature_to_max[feature_name] = feature_max

    # At this point, we've processed all examples.

    output_shapes_dict = dataset.output_shapes
    output_types_dict = dataset.output_types

    for feature_name in sorted(feature_to_num_examples.keys()):
        # Try to fill in the schema.
        feature = schema.feature.add()
        feature.name = feature_name

        # TODO(afrozm): Make this work with nested structures, currently the Schema
        # proto has no support for it.
        maybe_feature_shape = output_shapes_dict[feature_name]
        if not isinstance(maybe_feature_shape, tf.TensorShape):
            logging.error(
                "Statistics generation doesn't work for nested structures yet")
            continue

        for dim in maybe_feature_shape.as_list():
            # We denote `None`s as -1 in the shape proto.
            feature.shape.dim.add().size = dim if dim is not None else -1
        feature_type = output_types_dict[feature_name]
        feature.type = _FEATURE_TYPE_MAP.get(feature_type, schema_pb2.BYTES)

        common_statistics = statistics_pb2.CommonStatistics()
        common_statistics.num_non_missing = feature_to_num_examples[
            feature_name]
        common_statistics.num_missing = (statistics.num_examples -
                                         common_statistics.num_non_missing)

        feature_name_statistics = statistics.features.add()
        feature_name_statistics.name = feature_name

        # TODO(afrozm): This can be skipped, since type information was added to
        # the Schema.
        feature_name_statistics.type = _SCHEMA_TYPE_MAP.get(
            feature.type, statistics_pb2.FeatureNameStatistics.BYTES)

        if feature.type == schema_pb2.INT or feature.type == schema_pb2.FLOAT:
            numeric_statistics = statistics_pb2.NumericStatistics()
            numeric_statistics.min = feature_to_min[feature_name]
            numeric_statistics.max = feature_to_max[feature_name]
            numeric_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.num_stats.CopyFrom(numeric_statistics)
        else:
            # Let's shove it into BytesStatistics for now.
            bytes_statistics = statistics_pb2.BytesStatistics()
            bytes_statistics.common_stats.CopyFrom(common_statistics)
            feature_name_statistics.bytes_stats.CopyFrom(bytes_statistics)

    return statistics, schema
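A usage sketch, assuming a prepared tensorflow_datasets builder; the dataset name is arbitrary:

import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare()
statistics, schema = get_dataset_feature_statistics(builder, "train")
print("num_examples:", statistics.num_examples)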
Example #7
def _load_csv_files(manual_dir, dictionary_of_csv_files):
    """Load the ground-truth data from the given dictionary of CSV files.

  Args:
    manual_dir: Path of the directory containing the images.
    dictionary_of_csv_files: Dictionary containing the key and filepath of each
      CSV file to load.

  Returns:
    A dictionary containing the ground-truth loaded from the CSV files.
  """
    # Data maps patients -> examples -> list of abnormalities
    data = {}
    for csv_key, csv_path in sorted(dictionary_of_csv_files.items()):
        with tf.io.gfile.GFile(csv_path, 'r') as f:
            csv_reader = csv.DictReader(f)
            for i, row in enumerate(csv_reader, 2):  # Line 1 is the header.
                row = {k: v.strip()
                       for k, v in row.items()}  # Strip all cells.
                # Construct example ID from the study and series IDs.
                example_id = _DCIM_REGEX.sub(r'\g<study>/\g<series>',
                                             row['image file path'])
                # Convert the DICOM file paths to PNG paths under manual_dir.
                for key in [
                        'image file path', 'ROI mask file path',
                        'cropped image file path'
                ]:
                    row[key] = row[key].replace('.dcm', '.png')
                    row[key] = os.path.join(manual_dir, *row[key].split('/'))
                    if not tf.io.gfile.exists(row[key]):
                        raise ValueError(
                            'Error processing line %d from csv file %s: '
                            'Image %r does not exist!' %
                            (i, csv_path, row[key]))

                mask_file_path = row['ROI mask file path']
                crop_file_path = row['cropped image file path']
                full_image = _read_image(row['image file path'])
                mask_image = _read_image(mask_file_path)
                crop_image = _read_image(crop_file_path)
                if full_image.shape == crop_image.shape:
                    # TODO(jpuigcerver): THIS ASSUMES THAT THE CROP/MASK COLUMNS ARE JUST
                    # REVERSED. I've checked that this is the case for a couple of rows,
                    # but this issue happens a lot across all CSV files. Contact the
                    # owners of the dataset to ask about this problem.
                    mask_file_path, crop_file_path = crop_file_path, mask_file_path
                elif full_image.shape != mask_image.shape:
                    # TODO(jpuigcerver): Contact the owners of the dataset to ask about
                    # this problem.
                    logging.error(
                        'Error processing line %d from csv file %s: No suitable mask for '
                        'the given image (expected size: %r, candidate sizes: %r). '
                        'This abnormality will NOT be included in the dataset.',
                        i, csv_path, full_image.shape,
                        [mask_image.shape, crop_image.shape])
                    continue

                abnormality = {
                    'id': int(row['abnormality id']),
                    'mask': mask_file_path,
                    'assessment': row['assessment'],
                    'pathology': row['pathology'],
                    'subtlety': row['subtlety'],
                }
                if 'calc type' in row and 'calc distribution' in row:
                    abnormality['type'] = 'calc'
                    abnormality['calc_type'] = row['calc type']
                    abnormality['calc_distribution'] = row['calc distribution']
                elif 'mass shape' in row and 'mass margins' in row:
                    abnormality['type'] = 'mass'
                    abnormality['mass_shape'] = row['mass shape']
                    abnormality['mass_margins'] = row['mass margins']
                else:
                    raise ValueError('CSV file is missing required columns.')

                example = {
                    'id': example_id,
                    'breast': row['left or right breast'],
                    'patient': row['patient_id'],
                    'image': row['image file path'],
                    'view': row['image view'],
                    'abnormalities': [abnormality],
                    # Note: Useful to know whether the example is from train or test.
                    'csv_key': csv_key,
                }
                _append_example_to_data(data, example)

    return data
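A hedged call sketch; the keys and CSV file names follow the CBIS-DDSM naming this loader appears to target, but both paths are placeholders:

data = _load_csv_files(
    manual_dir='/data/cbis_ddsm',
    dictionary_of_csv_files={
        'calc-train': '/data/cbis_ddsm/calc_case_description_train_set.csv',
        'mass-train': '/data/cbis_ddsm/mass_case_description_train_set.csv',
    })
print('Loaded %d patients.' % len(data))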
Example #8
def create_optimizer_from_flags(
    prefix: Text,
    overrides: Optional[Mapping[Text, Union[Text, float, int, bool]]] = None
) -> tf.keras.optimizers.Optimizer:
    """Returns an optimizer based on prefixed flags.

  This method is intended to be paired with `define_optimizer_flags` using the
  same `prefix`, to allow Python binaries to construct TensorFlow optimizers
  parameterized by commandline flags.

  This method expects at least two flags to have been defined:
    * `--<prefix>_optimizer=<optimizer name>`
    * `--<prefix>_learning_rate`

  In addition to suites of flags for each optimizer:
    * `--<prefix>_<optimizer name>_<constructor_argument>`

  For example, if `prefix='client'` this method first reads the flags:
    * `--client_optimizer`
    * `--client_learning_rate`

  If the optimizer flag is `'sgd'`, then a `tf.keras.optimizers.SGD` optimizer
  is constructed using the values in the flags prefixed with `--client_sgd_`.

  NOTE: `kwargs` can be set using the `overrides` parameter.

  Args:
    prefix: The same string prefix passed to `define_optimizer_flags`.
    overrides: A mapping of `(string, value)` pairs that should override default
      flag values (but not user specified values from the commandline).

  Returns:
    A `tf.keras.optimizers.Optimizer`.
  """
    if overrides is not None:
        if not isinstance(overrides, collections.abc.Mapping):
            raise TypeError(
                '`overrides` must be a value of type `collections.abc.Mapping`, '
                'found type: {!s}'.format(type(overrides)))
    else:
        overrides = {}

    def prefixed(basename):
        return '{}_{}'.format(prefix, basename) if prefix else basename

    optimizer_flag_name = prefixed('optimizer')
    if flags.FLAGS[optimizer_flag_name] is None:
        raise ValueError(
            'Must specify flag --{!s}'.format(optimizer_flag_name))
    optimizer_name = flags.FLAGS[optimizer_flag_name].value
    optimizer_cls = _SUPPORTED_OPTIMIZERS.get(optimizer_name)
    if optimizer_cls is None:
        # To support additional optimizers, implement it as a
        # `tf.keras.optimizers.Optimizer` and add to the `_SUPPORTED_OPTIMIZERS`
        # dict.
        logging.error(
            'Unknown optimizer [%s], known optimizers are [%s]. To add '
            'support for an optimizer, add the optimizer class to the '
            'utils_impl._SUPPORTED_OPTIMIZERS list.', optimizer_name,
            list(_SUPPORTED_OPTIMIZERS.keys()))
        raise ValueError(
            '`{!s}` is not a valid optimizer for flag --{!s}, must be '
            'one of {!s}. See error log for details.'.format(
                optimizer_name, optimizer_flag_name,
                list(_SUPPORTED_OPTIMIZERS.keys())))

    def _has_user_value(flag):
        """Check if a commandline flag has a user set value."""
        return flag.present or flag.value != flag.default

    # Validate that the optimizers that weren't picked don't have flag values set.
    # Settings that won't be used likely means there is an expectation gap between
    # the user and the system and we should notify them.
    unused_flag_prefixes = [
        prefixed(k) for k in _SUPPORTED_OPTIMIZERS.keys()
        if k != optimizer_name
    ]
    mistakenly_set_flags = []
    for flag_name in flags.FLAGS:
        if not _has_user_value(flags.FLAGS[flag_name]):
            # Flag was not set by the user, skip it.
            continue
        # Otherwise the flag has a value set by the user.
        for unused_prefix in unused_flag_prefixes:
            if flag_name.startswith(unused_prefix):
                mistakenly_set_flags.append(flag_name)
                break
    if mistakenly_set_flags:
        raise ValueError('Commandline flags for optimizers other than [{!s}] '
                         '(value of --{!s}) are set. These would be ignored, '
                         'were the flags set by mistake? Flags: {!s}'.format(
                             optimizer_name, optimizer_flag_name,
                             mistakenly_set_flags))

    flag_prefix = prefixed(optimizer_name)
    prefix_len = len(flag_prefix) + 1
    kwargs = dict(overrides)  # overrides was normalized to a mapping above.
    learning_rate_flag = flags.FLAGS[prefixed('learning_rate')]
    if _has_user_value(learning_rate_flag):
        kwargs['learning_rate'] = learning_rate_flag.value
    for flag_name in flags.FLAGS:
        if not flag_name.startswith(flag_prefix):
            continue
        arg_name = flag_name[prefix_len:]
        kwargs[arg_name] = flags.FLAGS[flag_name].value
    return optimizer_cls(**kwargs)
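A sketch of the intended pairing with define_optimizer_flags, assuming that helper registers the flags described in the docstring; the flag values below are illustrative:

define_optimizer_flags('client')

# Equivalent to passing these on the command line.
flags.FLAGS(['train.py',
             '--client_optimizer=sgd',
             '--client_learning_rate=0.1'])
optimizer = create_optimizer_from_flags('client')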
Example #9
    def train(self,
              train_input_fn: Callable[[params_dict.ParamsDict],
                                       tf.data.Dataset],
              eval_input_fn: Callable[[params_dict.ParamsDict],
                                      tf.data.Dataset] = None,
              model_dir: Text = None,
              total_steps: int = 1,
              iterations_per_loop: int = 1,
              train_metric_fn: Callable[[], Any] = None,
              eval_metric_fn: Callable[[], Any] = None,
              summary_writer_fn: Callable[[Text, Text],
                                          SummaryWriter] = SummaryWriter,
              init_checkpoint: Callable[[tf.keras.Model], Any] = None,
              custom_callbacks: List[tf.keras.callbacks.Callback] = None,
              save_config: bool = True):
        """Runs distributed training.

    Args:
      train_input_fn: (params: dict) -> tf.data.Dataset training data input
        function.
      eval_input_fn: (Optional) same type as train_input_fn. If not None, will
        trigger evaluating metrics on eval data. If None, will not run eval step.
      model_dir: the folder path for model checkpoints.
      total_steps: total training steps.
      iterations_per_loop: train steps per loop. After each loop, this job will
        update metrics like loss and save checkpoint.
      train_metric_fn: metric_fn for evaluation in train_step.
      eval_metric_fn: metric_fn for evaluation in test_step.
      summary_writer_fn: function to create summary writer.
      init_checkpoint: function to load checkpoint.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, `on_batch_begin()`, `on_batch_end()`,
        methods are invoked during training.
      save_config: bool. Whether to save params to model_dir.

    Returns:
      The training loss and eval metrics.
    """
        assert train_input_fn is not None
        if train_metric_fn and not callable(train_metric_fn):
            raise ValueError('if `train_metric_fn` is specified, '
                             'train_metric_fn must be a callable.')
        if eval_metric_fn and not callable(eval_metric_fn):
            raise ValueError('if `eval_metric_fn` is specified, '
                             'eval_metric_fn must be a callable.')
        train_metric_fn = train_metric_fn or _no_metric
        eval_metric_fn = eval_metric_fn or _no_metric

        if custom_callbacks and iterations_per_loop != 1:
            logging.error(
                'It is semantically wrong to run callbacks when '
                'iterations_per_loop is not one (%s)', iterations_per_loop)

        def _run_callbacks_on_batch_begin(batch):
            """Runs custom callbacks at the start of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                if callback:
                    callback.on_batch_begin(batch)

        def _run_callbacks_on_batch_end(batch):
            """Runs custom callbacks at the end of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                if callback:
                    callback.on_batch_end(batch)

        if save_config:
            self._save_config(model_dir)

        if FLAGS.save_checkpoint_freq:
            save_freq = FLAGS.save_checkpoint_freq
        else:
            save_freq = iterations_per_loop

        params = self._params
        strategy = self._strategy
        # To reduce unnecessary send/receive input pipeline operation, we place
        # input pipeline ops in worker task.
        train_iterator = self._get_input_iterator(train_input_fn, strategy)
        train_loss = None
        eval_metric_result = None
        with strategy.scope():
            # To correctly place the model weights on accelerators,
            # model and optimizer should be created in scope.
            model = self.model_fn(params.as_dict())
            if not hasattr(model, 'optimizer'):
                raise ValueError(
                    'User should set optimizer attribute to model '
                    'inside `model_fn`.')
            optimizer = model.optimizer

            # Training loop starts here.
            checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
            latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
            initial_step = 0
            if latest_checkpoint_file:
                logging.info(
                    'Checkpoint file %s found and restoring from '
                    'checkpoint', latest_checkpoint_file)
                checkpoint.restore(latest_checkpoint_file)
                initial_step = optimizer.iterations.numpy()
                logging.info(
                    'Loading from checkpoint file completed. Init step %d',
                    initial_step)
            elif init_checkpoint:
                logging.info('Restoring from init checkpoint function')
                init_checkpoint(model)
                logging.info('Loading from init checkpoint file completed')

            current_step = optimizer.iterations.numpy()
            checkpoint_name = self.checkpoint_name

            eval_metric = eval_metric_fn()
            train_metric = train_metric_fn()
            train_summary_writer = summary_writer_fn(model_dir, 'eval_train')
            test_summary_writer = summary_writer_fn(model_dir, 'eval_test')

        # Continue training loop.
        train_step = self._create_train_step(strategy=strategy,
                                             model=model,
                                             loss_fn=self.loss_fn(),
                                             optimizer=optimizer,
                                             metric=train_metric)
        test_step = None
        if eval_input_fn and eval_metric:
            test_step = self._create_test_step(strategy,
                                               model,
                                               metric=eval_metric)

        logging.info('Training started')
        last_save_checkpoint_step = current_step
        while current_step < total_steps:

            num_steps = _steps_to_run(current_step, total_steps,
                                      iterations_per_loop)
            _run_callbacks_on_batch_begin(current_step)
            train_loss = train_step(
                train_iterator, tf.convert_to_tensor(num_steps,
                                                     dtype=tf.int32))
            _run_callbacks_on_batch_end(current_step)
            current_step += num_steps

            train_loss = tf.nest.map_structure(
                lambda x: x.numpy().astype(float), train_loss)
            if not isinstance(train_loss, dict):
                train_loss = {'total_loss': train_loss}
            if np.isnan(train_loss['total_loss']):
                raise ValueError('total loss is NaN.')

            if train_metric:
                train_metric_result = train_metric.result()
                if isinstance(train_metric, tf.keras.metrics.Metric):
                    train_metric_result = tf.nest.map_structure(
                        lambda x: x.numpy().astype(float), train_metric_result)
                if not isinstance(train_metric_result, dict):
                    train_metric_result = {'metric': train_metric_result}
                train_metric_result.update(train_loss)
            else:
                train_metric_result = train_loss
            if callable(optimizer.lr):
                train_metric_result.update(
                    {'learning_rate': optimizer.lr(current_step).numpy()})
            else:
                train_metric_result.update(
                    {'learning_rate': optimizer.lr.numpy()})
            logging.info(
                'Train Step: %d/%d  / loss = %s / training metric = %s',
                current_step, total_steps, train_loss, train_metric_result)

            train_summary_writer(metrics=train_metric_result,
                                 step=optimizer.iterations)

            # Saves model checkpoints and run validation steps at every
            # iterations_per_loop steps.
            # To avoid repeated model saving, we do not save after the last
            # step of training.
            if save_freq > 0 and current_step < total_steps and (
                    current_step - last_save_checkpoint_step) >= save_freq:
                _save_checkpoint(checkpoint, model_dir,
                                 checkpoint_name.format(step=current_step))
                last_save_checkpoint_step = current_step

            if test_step:
                eval_iterator = self._get_input_iterator(
                    eval_input_fn, strategy)
                eval_metric_result = self._run_evaluation(
                    test_step, current_step, eval_metric, eval_iterator)
                logging.info('Step: %s evaluation metric = %s.', current_step,
                             eval_metric_result)
                test_summary_writer(metrics=eval_metric_result,
                                    step=optimizer.iterations)

            # Re-initialize evaluation metric, except the last step.
            if eval_metric and current_step < total_steps:
                eval_metric.reset_states()
            if train_metric and current_step < total_steps:
                train_metric.reset_states()

        # Reaches the end of training and saves the last checkpoint.
        if last_save_checkpoint_step < total_steps:
            _save_checkpoint(checkpoint, model_dir,
                             checkpoint_name.format(step=current_step))

        if test_step:
            logging.info(
                'Running final evaluation after training is complete.')
            eval_iterator = self._get_input_iterator(eval_input_fn, strategy)
            eval_metric_result = self._run_evaluation(test_step, current_step,
                                                      eval_metric,
                                                      eval_iterator)
            logging.info('Final evaluation metric = %s.', eval_metric_result)
            test_summary_writer(metrics=eval_metric_result,
                                step=optimizer.iterations)

        return train_loss, eval_metric_result
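A call sketch for this method; the trainer class name and its constructor wiring are assumptions, only the train() keyword arguments come from the signature above:

trainer = DistributedExecutor(params=params, model_fn=model_fn,
                              loss_fn=loss_fn, strategy=strategy)
train_loss, eval_metrics = trainer.train(
    train_input_fn=train_input_fn,
    eval_input_fn=eval_input_fn,
    model_dir='/tmp/model_dir',
    total_steps=10000,
    iterations_per_loop=100)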
Example #10
  mask = mask_factory.create_mask(FLAGS.mask_type, base_model, mask_rng,
                                  FLAGS.mask_sparsity)

  if jax.host_id() == 0:
    mask_stats = symmetry.get_mask_stats(mask)
    logging.info('Mask stats: %s', str(mask_stats))

    for label, value in mask_stats.items():
      try:
        summary_writer.scalar(f'mask/{label}', value, 0)
      # This is needed because permutations (long int) can't be cast to float32.
      except (OverflowError, ValueError):
        summary_writer.text(f'mask/{label}', str(value), 0)
        logging.error('Could not write mask/%s to tensorflow summary as float32'
                      ', writing as string instead.', label)

    if FLAGS.dump_json:
      mask_stats['permutations'] = str(mask_stats['permutations'])
      utils.dump_dict_json(
          mask_stats, path.join(experiment_dir, 'mask_stats.json'))

  mask = masked.propagate_masks(mask)

  if jax.host_id() == 0:
    mask_stats = symmetry.get_mask_stats(mask)
    logging.info('Propagated mask stats: %s', str(mask_stats))

    for label, value in mask_stats.items():
      try:
Example #11
def _launch_aip_training(
        job_id: Text,
        project: Text,
        training_input: Dict[Text, Any],
        job_labels: Optional[Dict[Text, Text]] = None) -> None:
    """Launches and monitors a AIP custom training job.

  Args:
    job_id: the job ID of the AI Platform training job.
    project: the GCP project under which the training job will be executed.
    training_input: Training input argument for AI Platform training job. See
      https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput
      for the detailed schema.
    job_labels: the dict of labels that will be attached to this job.

  Raises:
    RuntimeError: if the Google Cloud AI Platform training job failed/cancelled.
    ConnectionError: if the status polling of the training job failed due to
      connection issue.
  """
    # Configure AI Platform training job
    api_client = discovery.build('ml', 'v1')
    project_id = 'projects/{}'.format(project)
    job_spec = {
        'jobId': job_id,
        'trainingInput': training_input,
        'labels': job_labels,
    }

    # Submit job to AIP Training
    logging.info('TrainingInput=%s', training_input)
    logging.info('Submitting job=\'%s\', project=\'%s\' to AI Platform.',
                 job_id, project)
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for AIP Training job to finish
    job_name = '{}/jobs/{}'.format(project_id, job_id)
    request = api_client.projects().jobs().get(name=job_name)
    response = request.execute()
    retry_count = 0

    # Monitors the long-running operation by polling the job state periodically,
    # and retries the polling when a transient connectivity issue is encountered.
    #
    # Long-running operation monitoring:
    #   The possible states of "get job" response can be found at
    #   https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#State
    #   where SUCCEEDED/FAILED/CANCELLED are considered to be final states.
    #   The following logic will keep polling the state of the job until the job
    #   enters a final state.
    #
    # During the polling, if a connection error was encountered, the GET request
    # will be retried by recreating the Python API client to refresh the lifecycle
    # of the connection being used. See
    # https://github.com/googleapis/google-api-python-client/issues/218
    # for a detailed description of the problem. If the error persists for
    # _CONNECTION_ERROR_RETRY_LIMIT consecutive attempts, the function will raise
    # ConnectionError.
    while response['state'] not in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        try:
            response = request.execute()
            retry_count = 0
        # Handle transient connection error.
        except ConnectionError as err:
            if retry_count < _CONNECTION_ERROR_RETRY_LIMIT:
                retry_count += 1
                logging.warning(
                    'ConnectionError (%s) encountered when polling job: %s. Trying to '
                    'recreate the API client.', err, job_id)
                # Recreate the Python API client.
                api_client = discovery.build('ml', 'v1')
                request = api_client.projects().jobs().get(name=job_name)
            else:
                logging.error('Request failed after %s retries.',
                              _CONNECTION_ERROR_RETRY_LIMIT)
                raise

    if response['state'] in ('FAILED', 'CANCELLED'):
        err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format(
            job_name, response)
        logging.error(err_msg)
        raise RuntimeError(err_msg)

    # AIP training complete
    logging.info('Job \'%s\' successful.', job_name)
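A hedged invocation sketch; the TrainingInput fields follow the REST schema linked in the docstring, and every value is a placeholder:

_launch_aip_training(
    job_id='my_training_job_001',
    project='my-gcp-project',
    training_input={
        'scaleTier': 'BASIC',
        'region': 'us-central1',
        'packageUris': ['gs://my-bucket/trainer-0.1.tar.gz'],
        'pythonModule': 'trainer.task',
        'runtimeVersion': '2.1',
        'pythonVersion': '3.7',
    },
    job_labels={'team': 'ml'})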
Example #12
def main(argv):
    del argv  # Unused.

    # Initialise Tink
    try:
        aead.register()
    except tink.TinkError as e:
        logging.error('Error initialising Tink: %s', e)
        return 1

    if FLAGS.mode == 'generate':
        # [START generate-a-new-keyset]
        # Generate a new keyset
        try:
            key_template = aead.aead_key_templates.AES128_GCM
            keyset_handle = tink.KeysetHandle.generate_new(key_template)
        except tink.TinkError as e:
            logging.exception('Error generating keyset: %s', e)
            return 1
        # [END generate-a-new-keyset]

        # [START store-a-cleartext-keyset]
        with open(FLAGS.keyset_path, 'wt') as keyset_file:
            try:
                cleartext_keyset_handle.write(
                    tink.JsonKeysetWriter(keyset_file), keyset_handle)
            except tink.TinkError as e:
                logging.exception('Error writing key: %s', e)
                return 1
        return 0
        # [END store-a-cleartext-keyset]

    # Use the input keyset to encrypt/decrypt data

    # Read the keyset into a keyset_handle
    with open(FLAGS.keyset_path, 'rt') as keyset_file:
        try:
            text = keyset_file.read()
            keyset_handle = cleartext_keyset_handle.read(
                tink.JsonKeysetReader(text))
        except tink.TinkError as e:
            logging.exception('Error reading key: %s', e)
            return 1

    # Get the primitive
    try:
        cipher = keyset_handle.primitive(aead.Aead)
    except tink.TinkError as e:
        logging.error('Error creating primitive: %s', e)
        return 1

    with open(FLAGS.input_path, 'rb') as input_file:
        input_data = input_file.read()
        if FLAGS.mode == 'decrypt':
            output_data = cipher.decrypt(input_data, b'envelope_example')
        elif FLAGS.mode == 'encrypt':
            output_data = cipher.encrypt(input_data, b'envelope_example')
        else:
            logging.error(
                'Error mode not supported. Please choose "encrypt" or "decrypt".'
            )
            return 1

        with open(FLAGS.output_path, 'wb') as output_file:
            output_file.write(output_data)
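An in-memory round-trip sketch using the same Tink calls as the CLI above, minus the file I/O:

aead.register()
keyset_handle = tink.KeysetHandle.generate_new(
    aead.aead_key_templates.AES128_GCM)
cipher = keyset_handle.primitive(aead.Aead)
ciphertext = cipher.encrypt(b'secret data', b'envelope_example')
assert cipher.decrypt(ciphertext, b'envelope_example') == b'secret data'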
Example #13
def _model_fn(features, labels, mode, params, model, variable_filter_fn=None):
    """Model definition entry.

  Args:
    features: the input image tensor with shape [batch_size, height, width, 3].
      The height and width are fixed and equal.
    labels: the input labels in a dictionary. The labels include class targets
      and box targets which are dense label maps. The labels are generated from
      get_input_fn function in data/dataloader.py
    mode: the mode of TPUEstimator including TRAIN, EVAL, and PREDICT.
    params: the dictionary defines hyperparameters of model. The default
      settings are in default_hparams function in this file.
    model: the model outputs class logits and box regression outputs.
    variable_filter_fn: the filter function that takes trainable_variables and
      returns the variable list after applying the filter rule.

  Returns:
    tpu_spec: the TPUEstimatorSpec to run training, evaluation, or prediction.

  Raises:
    RuntimeError: if both ckpt and backbone_ckpt are set.
  """
    utils.image('input_image', features)
    training_hooks = []
    params['is_training_bn'] = (mode == tf.estimator.ModeKeys.TRAIN)

    if params['use_keras_model']:

        def model_fn(inputs):
            model = efficientdet_keras.EfficientDetNet(
                config=hparams_config.Config(params))
            cls_out_list, box_out_list = model(inputs,
                                               params['is_training_bn'])
            cls_outputs, box_outputs = {}, {}
            for i in range(params['min_level'], params['max_level'] + 1):
                cls_outputs[i] = cls_out_list[i - params['min_level']]
                box_outputs[i] = box_out_list[i - params['min_level']]
            return cls_outputs, box_outputs
    else:
        model_fn = functools.partial(model,
                                     config=hparams_config.Config(params))

    precision = utils.get_precision(params['strategy'],
                                    params['mixed_precision'])
    cls_outputs, box_outputs = utils.build_model_with_precision(
        precision, model_fn, features, params['is_training_bn'])

    levels = cls_outputs.keys()
    for level in levels:
        cls_outputs[level] = tf.cast(cls_outputs[level], tf.float32)
        box_outputs[level] = tf.cast(box_outputs[level], tf.float32)

    # First check if it is in PREDICT mode.
    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'image': features,
        }
        for level in levels:
            predictions['cls_outputs_%d' % level] = cls_outputs[level]
            predictions['box_outputs_%d' % level] = box_outputs[level]
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)

    # Set up training loss and learning rate.
    update_learning_rate_schedule_parameters(params)
    global_step = tf.train.get_or_create_global_step()
    learning_rate = learning_rate_schedule(params, global_step)

    # cls_loss and box_loss are for logging. Only total_loss is optimized.
    det_loss, cls_loss, box_loss, box_iou_loss = detection_loss(
        cls_outputs, box_outputs, labels, params)
    reg_l2loss = reg_l2_loss(params['weight_decay'])
    total_loss = det_loss + reg_l2loss

    if mode == tf.estimator.ModeKeys.TRAIN:
        utils.scalar('lrn_rate', learning_rate)
        utils.scalar('trainloss/cls_loss', cls_loss)
        utils.scalar('trainloss/box_loss', box_loss)
        utils.scalar('trainloss/det_loss', det_loss)
        utils.scalar('trainloss/reg_l2_loss', reg_l2loss)
        utils.scalar('trainloss/loss', total_loss)
        if params['iou_loss_type']:
            utils.scalar('trainloss/box_iou_loss', box_iou_loss)
        train_epochs = tf.cast(global_step,
                               tf.float32) / params['steps_per_epoch']
        utils.scalar('train_epochs', train_epochs)

    moving_average_decay = params['moving_average_decay']
    if moving_average_decay:
        ema = tf.train.ExponentialMovingAverage(decay=moving_average_decay,
                                                num_updates=global_step)
        ema_vars = utils.get_ema_vars()

    if mode == tf.estimator.ModeKeys.TRAIN:
        if params['optimizer'].lower() == 'sgd':
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=params['momentum'])
        elif params['optimizer'].lower() == 'adam':
            optimizer = tf.train.AdamOptimizer(learning_rate)
        else:
            raise ValueError('optimizers should be adam or sgd')

        if params['strategy'] == 'tpu':
            optimizer = tf.tpu.CrossShardOptimizer(optimizer)
        if params['gradient_checkpointing']:
            from third_party.grad_checkpoint import memory_saving_gradients  # pylint: disable=import-outside-toplevel
            from tensorflow.python.ops import gradients  # pylint: disable=import-outside-toplevel

            # monkey patch tf.gradients to point to our custom version,
            # with automatic checkpoint selection
            def gradients_(ys, xs, grad_ys=None, **kwargs):
                return memory_saving_gradients.gradients(
                    ys,
                    xs,
                    grad_ys,
                    checkpoints=params['gradient_checkpointing_list'],
                    **kwargs)

            gradients.__dict__["gradients"] = gradients_

        # Batch norm requires update_ops to be added as a train_op dependency.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        var_list = tf.trainable_variables()
        if variable_filter_fn:
            var_list = variable_filter_fn(var_list)

        if params.get('clip_gradients_norm', None):
            logging.info('clip gradients norm by %f',
                         params['clip_gradients_norm'])
            grads_and_vars = optimizer.compute_gradients(total_loss, var_list)
            with tf.name_scope('clip'):
                grads = [gv[0] for gv in grads_and_vars]
                tvars = [gv[1] for gv in grads_and_vars]
                # First clip each variable's norm, then clip global norm.
                clip_norm = abs(params['clip_gradients_norm'])
                clipped_grads = [
                    tf.clip_by_norm(g, clip_norm) if g is not None else None
                    for g in grads
                ]
                clipped_grads, _ = tf.clip_by_global_norm(
                    clipped_grads, clip_norm)
                utils.scalar('gradient_norm',
                             tf.linalg.global_norm(clipped_grads))
                grads_and_vars = list(zip(clipped_grads, tvars))

            with tf.control_dependencies(update_ops):
                train_op = optimizer.apply_gradients(grads_and_vars,
                                                     global_step)
        else:
            with tf.control_dependencies(update_ops):
                train_op = optimizer.minimize(total_loss,
                                              global_step,
                                              var_list=var_list)

        if moving_average_decay:
            with tf.control_dependencies([train_op]):
                train_op = ema.apply(ema_vars)

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(**kwargs):
            """Returns a dictionary that has the evaluation metrics."""
            if params['nms_configs'].get('pyfunc', True):
                detections_bs = []
                for index in range(kwargs['boxes'].shape[0]):
                    nms_configs = params['nms_configs']
                    detections = tf.numpy_function(
                        functools.partial(nms_np.per_class_nms,
                                          nms_configs=nms_configs),
                        [
                            kwargs['boxes'][index],
                            kwargs['scores'][index],
                            kwargs['classes'][index],
                            tf.slice(kwargs['image_ids'], [index], [1]),
                            tf.slice(kwargs['image_scales'], [index], [1]),
                            params['num_classes'],
                            nms_configs['max_output_size'],
                        ], tf.float32)
                    detections_bs.append(detections)
                detections_bs = postprocess.transform_detections(
                    tf.stack(detections_bs))
            else:
                # These two branches should be equivalent, but currently they are not.
                # TODO(tanmingxing): enable the non_pyfun path after bug fix.
                nms_boxes, nms_scores, nms_classes, _ = postprocess.per_class_nms(
                    params, kwargs['boxes'], kwargs['scores'],
                    kwargs['classes'], kwargs['image_scales'])
                img_ids = tf.cast(tf.expand_dims(kwargs['image_ids'], -1),
                                  nms_scores.dtype)
                detections_bs = [
                    img_ids * tf.ones_like(nms_scores),
                    nms_boxes[:, :, 1],
                    nms_boxes[:, :, 0],
                    nms_boxes[:, :, 3] - nms_boxes[:, :, 1],
                    nms_boxes[:, :, 2] - nms_boxes[:, :, 0],
                    nms_scores,
                    nms_classes,
                ]
                detections_bs = tf.stack(detections_bs,
                                         axis=-1,
                                         name='detections')

            if params.get('testdev_dir', None):
                logging.info('Eval testdev_dir %s', params['testdev_dir'])
                eval_metric = coco_metric.EvaluationMetric(
                    testdev_dir=params['testdev_dir'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, tf.zeros([1]))
            else:
                logging.info('Eval val with groundtruths %s.',
                             params['val_json_file'])
                eval_metric = coco_metric.EvaluationMetric(
                    filename=params['val_json_file'],
                    label_map=params['label_map'])
                coco_metrics = eval_metric.estimator_metric_fn(
                    detections_bs, kwargs['groundtruth_data'])

            # Add metrics to output.
            cls_loss = tf.metrics.mean(kwargs['cls_loss_repeat'])
            box_loss = tf.metrics.mean(kwargs['box_loss_repeat'])
            output_metrics = {
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            }
            output_metrics.update(coco_metrics)
            return output_metrics

        cls_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(cls_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])
        box_loss_repeat = tf.reshape(
            tf.tile(tf.expand_dims(box_loss, 0), [
                params['batch_size'],
            ]), [params['batch_size'], 1])

        cls_outputs = postprocess.to_list(cls_outputs)
        box_outputs = postprocess.to_list(box_outputs)
        params['nms_configs']['max_nms_inputs'] = anchors.MAX_DETECTION_POINTS
        boxes, scores, classes = postprocess.pre_nms(params, cls_outputs,
                                                     box_outputs)
        metric_fn_inputs = {
            'cls_loss_repeat': cls_loss_repeat,
            'box_loss_repeat': box_loss_repeat,
            'image_ids': labels['source_ids'],
            'groundtruth_data': labels['groundtruth_data'],
            'image_scales': labels['image_scales'],
            'boxes': boxes,
            'scores': scores,
            'classes': classes,
        }
        eval_metrics = (metric_fn, metric_fn_inputs)

    checkpoint = params.get('ckpt') or params.get('backbone_ckpt')

    if checkpoint and mode == tf.estimator.ModeKeys.TRAIN:
        # Initialize the model from an EfficientDet or backbone checkpoint.
        if params.get('ckpt') and params.get('backbone_ckpt'):
            raise RuntimeError(
                '--backbone_ckpt and --checkpoint are mutually exclusive')

        if params.get('backbone_ckpt'):
            var_scope = params['backbone_name'] + '/'
            if params['ckpt_var_scope'] is None:
                # Use backbone name as default checkpoint scope.
                ckpt_scope = params['backbone_name'] + '/'
            else:
                ckpt_scope = params['ckpt_var_scope'] + '/'
        else:
            # Load every var in the given checkpoint
            var_scope = ckpt_scope = '/'

        def scaffold_fn():
            """Loads pretrained model through scaffold function."""
            logging.info('restore variables from %s', checkpoint)

            var_map = utils.get_ckpt_var_map(
                ckpt_path=checkpoint,
                ckpt_scope=ckpt_scope,
                var_scope=var_scope,
                skip_mismatch=params['skip_mismatch'])

            tf.train.init_from_checkpoint(checkpoint, var_map)
            return tf.train.Scaffold()
    elif mode == tf.estimator.ModeKeys.EVAL and moving_average_decay:

        def scaffold_fn():
            """Load moving average variables for eval."""
            logging.info('Load EMA vars with ema_decay=%f',
                         moving_average_decay)
            restore_vars_dict = ema.variables_to_restore(ema_vars)
            saver = tf.train.Saver(restore_vars_dict)
            return tf.train.Scaffold(saver=saver)
    else:
        scaffold_fn = None

    if params['strategy'] != 'tpu':
        # Profile every 1K steps.
        if params.get('profile', False):
            profile_hook = tf.estimator.ProfilerHook(
                save_steps=1000,
                output_dir=params['model_dir'],
                show_memory=True)
            training_hooks.append(profile_hook)

            # Report memory allocation if OOM
            class OomReportingHook(tf.estimator.SessionRunHook):
                def before_run(self, run_context):
                    return tf.estimator.SessionRunArgs(
                        fetches=[],
                        options=tf.RunOptions(
                            report_tensor_allocations_upon_oom=True))

            training_hooks.append(OomReportingHook())

        logging_hook = tf.estimator.LoggingTensorHook(
            {
                'step': global_step,
                'det_loss': det_loss,
                'cls_loss': cls_loss,
                'box_loss': box_loss,
            },
            every_n_iter=params.get('iterations_per_loop', 100),
        )
        training_hooks.append(logging_hook)

        if params["nvgpu_logging"]:
            try:
                from third_party.tools.nvgpu import gpu_memory_util_message  # pylint: disable=import-outside-toplevel

                mem_message = tf.py_func(gpu_memory_util_message, [],
                                         [tf.string])[0]

                logging_hook_nvgpu = tf.estimator.LoggingTensorHook(
                    tensors={
                        "mem_message": mem_message,
                    },
                    every_n_iter=params.get('iterations_per_loop', 100),
                    formatter=lambda x: x["mem_message"].decode("utf-8"),
                )
                training_hooks.append(logging_hook_nvgpu)
            except Exception:  # pylint: disable=broad-except
                logging.error("nvgpu error: nvidia-smi format not recognized")

    if params['strategy'] == 'tpu':
        return tf.estimator.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metrics=eval_metrics,
            host_call=utils.get_tpu_host_call(global_step, params),
            scaffold_fn=scaffold_fn,
            training_hooks=training_hooks)
    else:
        eval_metric_ops = (eval_metrics[0](
            **eval_metrics[1]) if eval_metrics else None)
        utils.get_tpu_host_call(global_step, params)
        return tf.estimator.EstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops=eval_metric_ops,
            scaffold=scaffold_fn() if scaffold_fn else None,
            training_hooks=training_hooks)
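A wiring sketch showing how a model_fn with the extra model argument is typically bound before being handed to an Estimator; my_network_fn and the surrounding setup are placeholders:

model_fn = functools.partial(_model_fn, model=my_network_fn)
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir=params['model_dir'],
    params=params)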
Example #14
def collect_trajectories(env,
                         policy_net_apply,
                         policy_net_params,
                         num_trajectories=1,
                         policy="greedy",
                         epsilon=0.1):
  """Collect trajectories with the given policy net and behaviour."""
  trajectories = []

  for _ in range(num_trajectories):
    rewards = []
    actions = []
    done = False

    observation = env.reset()

    # This is currently shaped (1, 1) + OBS, but new observations will keep
    # getting added to it, making it eventually (1, T+1) + OBS
    observation_history = observation[np.newaxis, np.newaxis, :]

    while not done:
      # Run the policy, to pick an action, shape is (1, t, A) because
      # observation_history is shaped (1, t) + OBS
      predictions = policy_net_apply(policy_net_params, observation_history)

      # We need the predictions for the last time-step, so squeeze the batch
      # dimension and take the last time-step.
      predictions = np.squeeze(predictions, axis=0)[-1]

      # Policy can be run in one of the following ways:
      #  - Greedy
      #  - Epsilon-Greedy
      #  - Categorical-Sampling
      action = None
      if policy == "greedy":
        action = np.argmax(predictions)
      elif policy == "epsilon-greedy":
        # A common schedule for epsilon is 1/k, where k is the episode number.
        if onp.random.random() < epsilon:
          # Choose an action at random.
          action = onp.random.randint(0, high=len(predictions))
        else:
          # Return the best action.
          action = np.argmax(predictions)
      elif policy == "categorical-sampling":
        action = onp.argwhere(onp.random.multinomial(1, predictions) == 1)
      else:
        raise ValueError("Unknown policy: %s" % policy)

      # NOTE: Assumes a single batch.
      try:
        action = int(action)
      except TypeError as err:
        # Let's dump some information before we die off.
        logging.error("Cannot convert action into an integer: [%s]", err)
        logging.error("action.shape: [%s]", action.shape)
        logging.error("action: [%s]", action)
        logging.error("predictions.shape: [%s]", predictions.shape)
        logging.error("predictions: [%s]", predictions)
        logging.error("observation_history: [%s]", observation_history)
        logging.error("policy_net_params: [%s]", policy_net_params)
        log_params(policy_net_params, "policy_net_params")
        raise err

      observation, reward, done, _ = env.step(action)

      # observation is of shape OBS, so add extra dims and concatenate on the
      # time dimension.
      observation_history = np.concatenate(
          [observation_history, observation[np.newaxis, np.newaxis, :]], axis=1)

      rewards.append(reward)
      actions.append(action)

    # This means we are done
    assert done
    # observation_history is (1, T+1) + OBS, let's squeeze out the batch dim.
    observation_history = np.squeeze(observation_history, axis=0)
    trajectories.append(
        (observation_history, np.stack(actions), np.stack(rewards)))

  return trajectories
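
To make the three policy modes above concrete, here is a self-contained sketch on a made-up prediction vector showing how each mode turns log-probabilities into an action:

import numpy as onp

predictions = onp.log(onp.array([0.1, 0.6, 0.3]))  # Made-up log-probabilities.

# Greedy: always take the arg-max action (here, action 1).
greedy_action = int(onp.argmax(predictions))

# Epsilon-greedy: explore uniformly at random with probability epsilon.
epsilon = 0.1
if onp.random.random() < epsilon:
    eg_action = int(onp.random.randint(0, high=len(predictions)))
else:
    eg_action = int(onp.argmax(predictions))

# Categorical sampling: draw one action from the distribution itself
# (equivalent to the argwhere form used above).
probs = onp.exp(predictions)
cs_action = int(onp.argmax(onp.random.multinomial(1, probs)))
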
Ejemplo n.º 15
0
def main(argv):
    logging.info(f"Starting MAML training with {FLAGS.source} dataset.")
    ckpt_save_path = os.path.join(FLAGS.save_path, "ckpts")
    os.makedirs(ckpt_save_path, exist_ok=True)

    logging.info(f"Setting seed...")
    torch_utils.set_seed(FLAGS.seed)

    metadata = [f.serialize() for f in FLAGS.get_key_flags_for_module(sys.argv[0])]
    metadata = [m for m in metadata if m]  # remove empty flags
    metadata = "\n\t" + "\n\t".join(metadata)
    logging.info(f"Current parameters: {metadata}")

    flag_file = os.path.join(ckpt_save_path, "flagfile.txt")
    FLAGS.append_flags_into_file(flag_file)
    logging.info(f"Flags are stored in {flag_file}")

    logging.info("Loading data...")
    loaders = dataloaders.get_loaders(
        source_path=FLAGS.source, inner_batch_size=FLAGS.inner_batch_size
    )

    logging.info("Instantiating model and optimizers...")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = GatedGraphNeuralNetwork(
        n_edge=1, in_dim=75, n_conv=FLAGS.n_conv, fc_dims=[FLAGS.fc_dims, 1], p_dropout=0.0
    )
    if FLAGS.init_path is not None:
        logging.info(f"Loading initializations from {FLAGS.init_path}")
        model = torch.load(FLAGS.init_path)
    model = model.to(device)
    meta_learner = MAML(model, lr=FLAGS.inner_lr, first_order=FLAGS.first_order, anil=FLAGS.anil)

    optimizer = optim.Adam(meta_learner.parameters(), FLAGS.meta_lr)
    if FLAGS.mode == "binary_classification":
        pos_weight = torch.tensor(
            [l.dataset.y.sum() / len(l.dataset.y) for l in loaders["meta_train"]["train"]]
        ).mean()
        criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    elif FLAGS.mode == "regression":
        criterion = nn.MSELoss()
    else:
        logging.error(f"--mode {FLAGS.mode} is not supported. Choose from ['binary_classification', 'regression'].")
        sys.exit(1)

    metrics = FLAGS.metrics.split(",") if FLAGS.metrics else []

    logging.info(f"Begin training!")
    meta_training(
        meta_learner=meta_learner,
        meta_steps=FLAGS.meta_steps,
        meta_batch_size=FLAGS.meta_batch_size,
        loaders=loaders,
        optimizer=optimizer,
        criterion=criterion,
        inner_steps=FLAGS.inner_steps,
        device=device,
        save_path=ckpt_save_path,
        ckpt_steps=FLAGS.ckpt_steps,
        metrics=metrics,
    )
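
The MAML wrapper and meta_training loop come from elsewhere in this codebase. As a rough, self-contained sketch of what a single first-order meta-update involves (the names and structure here are illustrative assumptions, not the wrapper's actual API):

import copy

import torch


def fomaml_step(model, loss_fn, support, query, inner_lr, inner_steps,
                meta_optimizer):
    """Illustrative first-order MAML meta-update for a single task."""
    learner = copy.deepcopy(model)  # Task-specific "fast" weights.
    inner_opt = torch.optim.SGD(learner.parameters(), lr=inner_lr)

    # Inner loop: adapt the copy on the support set.
    for _ in range(inner_steps):
        x, y = support
        inner_opt.zero_grad()
        loss_fn(learner(x), y).backward()
        inner_opt.step()

    # Outer loss on the query set, evaluated with the adapted weights.
    x, y = query
    query_loss = loss_fn(learner(x), y)
    query_loss.backward()

    # First-order approximation: reuse the adapted copy's gradients as
    # meta-gradients for the original parameters.
    meta_optimizer.zero_grad()
    for p, p_adapted in zip(model.parameters(), learner.parameters()):
        p.grad = None if p_adapted.grad is None else p_adapted.grad.clone()
    meta_optimizer.step()
    return float(query_loss.detach())
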
Ejemplo n.º 16
0
    def _wait_for_processes(self, wait_processes, kill_processes,
                            timeout_secs):
        """Waits until all `wait_processes` finish, then kills `kill_processes`.

    Fails an assert if a process in `wait_processes` finishes unsuccessfully.
    The processes in `kill_processes` are assumed to never finish so they are
    killed.

    Args:
      wait_processes: A list of _ProcessInfo tuples. This function will wait for
        each to finish.
      kill_processes: A list of _ProcessInfo tuples. Each will be killed once
        every process in `wait_processes` is finished.
      timeout_secs: Seconds to wait before timing out and terminating processes.

    Returns:
      A list of strings, each which is a string of the stderr of a wait process.

    Raises:
      Exception: When waiting for tasks to finish times out.
    """

        timer = _CountDownTimer(timeout_secs)
        wait_process_stderrs = [None] * len(wait_processes)
        finished_wait_processes = set()
        poll_count = {wait_process: 0.0 for wait_process in wait_processes}

        while len(finished_wait_processes) < len(wait_processes):
            if timer.secs_remaining() == 0:
                logging.error(
                    "Timed out! Outputting logs of unfinished processes:")
                for i, wait_process in enumerate(wait_processes):
                    if i in finished_wait_processes:
                        continue
                    wait_process.stderr.seek(0)
                    wait_process_stderrs[i] = wait_process.stderr.read()
                    logging.info(
                        "stderr for incomplete %s (last %d chars): %s\n",
                        wait_process.name, MAX_OUTPUT_CHARS,
                        wait_process_stderrs[i][-MAX_OUTPUT_CHARS:])
                raise Exception("Timed out waiting for tasks to complete.")
            for i, wait_process in enumerate(wait_processes):
                if i in finished_wait_processes:
                    continue
                ret_code = wait_process.popen.poll()
                if ret_code is None:
                    poll_count[wait_process] += 0.25
                    # Log progress roughly every 10 seconds of polling.
                    if poll_count[wait_process] % 10.0 == 0:
                        logging.info("%.0f secs have elapsed for %s",
                                     poll_count[wait_process],
                                     wait_process.name)
                    continue
                logging.info("%s finished", wait_process.name)
                wait_process.stderr.seek(0)
                wait_process_stderrs[i] = wait_process.stderr.read()
                logging.info("stderr for %s (last %d chars): %s\n",
                             wait_process.name, MAX_OUTPUT_CHARS,
                             wait_process_stderrs[i][-MAX_OUTPUT_CHARS:])
                self.assertEqual(0, ret_code)
                finished_wait_processes.add(i)
            for kill_process in kill_processes:
                ret_code = kill_process.popen.poll()
                # Kill processes should not end until we kill them; if one
                # exits early, log its output before the assertion fails.
                if ret_code is not None:
                    logging.error("kill process %s ended with ret_code %d",
                                  kill_process.name, ret_code)
                    kill_process.stderr.seek(0)
                    logging.info(
                        "stderr for %s (last %d chars): %s\n",
                        kill_process.name, MAX_OUTPUT_CHARS,
                        kill_process.stderr.read()[-MAX_OUTPUT_CHARS:])
                    self.assertIsNone(ret_code)
            # Delay between polling loops.
            time.sleep(0.25)
        logging.info("All wait processes finished")
        for i, kill_process in enumerate(kill_processes):
            # Kill each kill process.
            kill_process.popen.kill()
            kill_process.popen.wait()
            kill_process.stderr.seek(0)
            logging.info("stderr for %s (last %d chars): %s\n",
                         kill_process.name, MAX_OUTPUT_CHARS,
                         kill_process.stderr.read()[-MAX_OUTPUT_CHARS:])
        return wait_process_stderrs
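
_CountDownTimer is referenced above but not shown. A minimal sketch matching how it is used (constructed with a duration, queried via secs_remaining()) could be:

import time


class _CountDownTimer(object):
    """Hypothetical sketch of the countdown timer used above."""

    def __init__(self, duration_secs):
        self._start_time = time.time()
        self._duration_secs = duration_secs

    def secs_remaining(self):
        elapsed = time.time() - self._start_time
        return max(0.0, self._duration_secs - elapsed)
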
Ejemplo n.º 17
0
def run_ncf(_):
  """Run NCF training and eval with Keras."""

  keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

  if FLAGS.seed is not None:
    print("Setting tf seed")
    tf.random.set_seed(FLAGS.seed)

  model_helpers.apply_clean(FLAGS)

  if FLAGS.dtype == "fp16" and FLAGS.fp16_implementation == "keras":
    policy = tf.keras.mixed_precision.experimental.Policy(
        "mixed_float16",
        loss_scale=flags_core.get_loss_scale(FLAGS, default_for_fp16="dynamic"))
    tf.keras.mixed_precision.experimental.set_policy(policy)

  strategy = distribution_utils.get_distribution_strategy(
      distribution_strategy=FLAGS.distribution_strategy,
      num_gpus=FLAGS.num_gpus,
      tpu_address=FLAGS.tpu)

  params = ncf_common.parse_flags(FLAGS)
  params["distribute_strategy"] = strategy

  if not keras_utils.is_v2_0() and strategy is not None:
    logging.error("NCF Keras only works with distribution strategy in TF 2.0")
    return
  if (params["keras_use_ctl"] and (
      not keras_utils.is_v2_0() or strategy is None)):
    logging.error(
        "Custom training loop only works with tensorflow 2.0 and dist strat.")
    return
  if params["use_tpu"] and not params["keras_use_ctl"]:
    logging.error("Custom training loop must be used when using TPUStrategy.")
    return

  batch_size = params["batch_size"]
  time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
  callbacks = [time_callback]

  producer, input_meta_data = None, None
  generate_input_online = params["train_dataset_path"] is None

  if generate_input_online:
    # Start data producing thread.
    num_users, num_items, _, _, producer = ncf_common.get_inputs(params)
    producer.start()
    per_epoch_callback = IncrementEpochCallback(producer)
    callbacks.append(per_epoch_callback)
  else:
    assert params["eval_dataset_path"] and params["input_meta_data_path"]
    with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
      input_meta_data = json.loads(reader.read().decode("utf-8"))
      num_users = input_meta_data["num_users"]
      num_items = input_meta_data["num_items"]

  params["num_users"], params["num_items"] = num_users, num_items

  if FLAGS.early_stopping:
    early_stopping_callback = CustomEarlyStopping(
        "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
    callbacks.append(early_stopping_callback)

  (train_input_dataset, eval_input_dataset,
   num_train_steps, num_eval_steps) = \
    (ncf_input_pipeline.create_ncf_input_data(
        params, producer, input_meta_data, strategy))
  steps_per_epoch = None if generate_input_online else num_train_steps

  with distribution_utils.get_strategy_scope(strategy):
    keras_model = _get_keras_model(params)
    optimizer = tf.keras.optimizers.Adam(
        learning_rate=params["learning_rate"],
        beta_1=params["beta1"],
        beta_2=params["beta2"],
        epsilon=params["epsilon"])
    if FLAGS.fp16_implementation == "graph_rewrite":
      optimizer = \
        tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(
            optimizer,
            loss_scale=flags_core.get_loss_scale(FLAGS,
                                                 default_for_fp16="dynamic"))
    elif FLAGS.dtype == "fp16" and params["keras_use_ctl"]:
      # When keras_use_ctl is False, Model.fit() automatically applies loss
      # scaling, so a LossScaleOptimizer is only needed for the custom loop.
      optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
          optimizer,
          tf.keras.mixed_precision.experimental.global_policy().loss_scale)

    if params["keras_use_ctl"]:
      train_loss, eval_results = run_ncf_custom_training(
          params,
          strategy,
          keras_model,
          optimizer,
          callbacks,
          train_input_dataset,
          eval_input_dataset,
          num_train_steps,
          num_eval_steps,
          generate_input_online=generate_input_online)
    else:
      keras_model.compile(optimizer=optimizer, run_eagerly=FLAGS.run_eagerly)

      if not FLAGS.ml_perf:
        # Create Tensorboard summary and checkpoint callbacks.
        summary_dir = os.path.join(FLAGS.model_dir, "summaries")
        summary_callback = tf.keras.callbacks.TensorBoard(summary_dir)
        checkpoint_path = os.path.join(FLAGS.model_dir, "checkpoint")
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            checkpoint_path, save_weights_only=True)

        callbacks += [summary_callback, checkpoint_callback]

      history = keras_model.fit(
          train_input_dataset,
          epochs=FLAGS.train_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=callbacks,
          validation_data=eval_input_dataset,
          validation_steps=num_eval_steps,
          verbose=2)

      logging.info("Training done. Start evaluating")

      eval_loss_and_metrics = keras_model.evaluate(
          eval_input_dataset, steps=num_eval_steps, verbose=2)

      logging.info("Keras evaluation is done.")

      # Keras evaluate() API returns scalar loss and metric values from
      # evaluation as a list. Here, the returned list would contain
      # [evaluation loss, hr sum, hr count].
      eval_hit_rate = eval_loss_and_metrics[1] / eval_loss_and_metrics[2]

      # Format evaluation result into [eval loss, eval hit accuracy].
      eval_results = [eval_loss_and_metrics[0], eval_hit_rate]

      if history and history.history:
        train_history = history.history
        train_loss = train_history["loss"][-1]

  stats = build_stats(train_loss, eval_results, time_callback)
  return stats
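
CustomEarlyStopping, used above to stop once the hit rate reaches a threshold, is defined elsewhere. A plausible sketch under that assumption:

import tensorflow as tf
from absl import logging


class CustomEarlyStopping(tf.keras.callbacks.Callback):
  """Illustrative sketch: stop once a monitored metric reaches a threshold."""

  def __init__(self, monitor, desired_value):
    super(CustomEarlyStopping, self).__init__()
    self._monitor = monitor
    self._desired_value = desired_value

  def on_epoch_end(self, epoch, logs=None):
    current = (logs or {}).get(self._monitor)
    if current is not None and current >= self._desired_value:
      logging.info('%s reached %s at epoch %d; stopping early.',
                   self._monitor, current, epoch)
      self.model.stop_training = True
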
Ejemplo n.º 18
0
    def generate(self) -> List[task_lib.Task]:
        """Generates tasks for executing the next executable nodes in the pipeline.

    The returned tasks must have `exec_task` populated. List may be empty if
    no nodes are ready for execution.

    Returns:
      A `list` of tasks to execute.
    """
        layers = topsort.topsorted_layers(
            [node.pipeline_node for node in self._pipeline.nodes],
            get_node_id_fn=lambda node: node.node_info.id,
            get_parent_nodes=(
                lambda node: [self._node_map[n] for n in node.upstream_nodes]),
            get_child_nodes=(
                lambda node:
                [self._node_map[n] for n in node.downstream_nodes]))
        result = []
        for layer_num, nodes in enumerate(layers):
            # Node ids in the current layer that executed successfully; used
            # to decide whether downstream layers need to be checked.
            completed_node_ids = set()
            for node in nodes:
                node_uid = task_lib.NodeUid.from_pipeline_node(
                    self._pipeline, node)
                node_id = node.node_info.id
                if self._service_job_manager.is_pure_service_node(
                        self._pipeline_state, node.node_info.id):
                    if not self._upstream_nodes_executed(node):
                        continue
                    service_status = self._service_job_manager.ensure_node_services(
                        self._pipeline_state, node_id)
                    if service_status == service_jobs.ServiceStatus.SUCCESS:
                        logging.info('Service node completed successfully: %s',
                                     node_uid)
                        completed_node_ids.add(node_id)
                    elif service_status == service_jobs.ServiceStatus.FAILED:
                        logging.error('Failed service node: %s', node_uid)
                        return [
                            task_lib.FinalizePipelineTask(
                                pipeline_uid=self._pipeline_state.pipeline_uid,
                                status=status_lib.Status(
                                    code=status_lib.Code.ABORTED,
                                    message=
                                    (f'Aborting pipeline execution due to service '
                                     f'node failure; failed node uid: {node_uid}'
                                     )))
                        ]
                    else:
                        logging.info('Pure service node in progress: %s',
                                     node_uid)
                    continue

                # If a task for the node is already tracked by the task queue, it need
                # not be considered for generation again.
                if self._is_task_id_tracked_fn(
                        task_lib.exec_node_task_id_from_pipeline_node(
                            self._pipeline, node)):
                    continue
                node_executions = task_gen_utils.get_executions(
                    self._mlmd_handle, node)
                if task_gen_utils.is_latest_execution_successful(
                        node_executions):
                    completed_node_ids.add(node_id)
                    continue
                # If all upstream nodes are executed but current node is not executed,
                # the node is deemed ready for execution.
                if self._upstream_nodes_executed(node):
                    task = self._generate_task(node, node_executions)
                    if task_lib.is_finalize_pipeline_task(task):
                        return [task]
                    else:
                        result.append(task)
            # If there are no completed nodes in the current layer, downstream nodes
            # need not be checked.
            if not completed_node_ids:
                break
            # If all nodes in the final layer completed successfully, the
            # pipeline can be finalized.
            # TODO(goutham): If there are conditional eval nodes, not all nodes may be
            # executed in the final layer. Handle this case when conditionals are
            # supported.
            if layer_num == len(layers) - 1 and completed_node_ids == set(
                    node.node_info.id for node in nodes):
                return [
                    task_lib.FinalizePipelineTask(
                        pipeline_uid=self._pipeline_state.pipeline_uid,
                        status=status_lib.Status(code=status_lib.Code.OK))
                ]
        return result
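
For intuition, topsort.topsorted_layers groups the DAG into layers such that every node's parents appear in an earlier layer. A simplified, self-contained sketch of that behavior (not the TFX implementation):

def layered_topsort(nodes, parents):
    """Toy layered topological sort; `parents` maps node -> parent list."""
    remaining = list(nodes)
    layers = []
    while remaining:
        pending = set(remaining)
        layer = [n for n in remaining if not pending.intersection(parents[n])]
        if not layer:
            raise ValueError('Dependency cycle detected.')
        layers.append(layer)
        remaining = [n for n in remaining if n not in set(layer)]
    return layers

# Example: parents = {'a': [], 'b': ['a'], 'c': ['a'], 'd': ['b', 'c']}
# layered_topsort('abcd', parents) -> [['a'], ['b', 'c'], ['d']]
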
Ejemplo n.º 19
0
    def _check_same(ref: Any, tar: Any, rtol: float, atol: float) -> bool:
        """Checks that ref and tar have identical datastructures and values."""
        # Check for matching types.
        if not isinstance(tar, type(ref)):
            logging.error(
                "Expected ref and tar to have the same type but got '%s' and '%s'",
                type(ref), type(tar))
            return False

        if ref is None:
            # Nothing to compare (e.g. the called method had no outputs).
            return True

        # Recursive check for dicts.
        if isinstance(ref, dict):
            if ref.keys() != tar.keys():
                logging.error(
                    "Expected ref and tar to have the same keys, but got '%s' and '%s'",
                    ref.keys(), tar.keys())
                return False
            # Check that all of the dictionaries' values are the same.
            for key in ref:
                if not Trace._check_same(ref[key], tar[key], rtol, atol):
                    return False

        # Recursive check for iterables.
        elif isinstance(ref, list) or isinstance(ref, tuple):
            if len(ref) != len(tar):
                logging.error(
                    "Expected ref and tar to have the same length, but got %s and %s",
                    len(ref), len(tar))
                return False
            # Check that all of the iterables' values are the same.
            for i in range(len(ref)):
                if not Trace._check_same(ref[i], tar[i], rtol, atol):
                    return False

        # Base check for numpy arrays.
        elif isinstance(ref, np.ndarray):
            if ref.dtype != tar.dtype:
                logging.error(
                    "Expected ref and tar to have the same dtype, but got %s  and %s",
                    ref.dtype, tar.dtype)
                return False
            if np.issubdtype(ref.dtype, np.floating):
                same = np.allclose(ref, tar, rtol=rtol, atol=atol)
                if not same:
                    abs_diff = np.max(np.abs(ref - tar))
                    rel_diff = np.max(np.abs(ref - tar) / np.max(tar))
                    logging.error(
                        "Floating point difference between ref and tar was too large. "
                        "Max abs diff: %s, atol: %s, max relative diff: %s, rtol: %s",
                        abs_diff, atol, rel_diff, rtol)
                return same
            else:
                return np.array_equal(ref, tar)

        # Base check for native number types.
        elif isinstance(ref, (int, float)):
            return ref == tar

        # If outputs end up here then an extra branch for that type should be added.
        else:
            raise TypeError(
                f"Encountered results with unexpected type {type(ref)}")
        return True
Ejemplo n.º 20
0
def setup_project(config, project_yaml, output_yaml_path):
  """Run the full process for initalizing a single new project.

  Note: for projects that have already been deployed, only the updatable steps
  will be run.

  Args:
    config (ProjectConfig): The config of a single project to setup.
    project_yaml (str): Path of the project config YAML.
    output_yaml_path (str): Path to output resulting root config in JSON.

  Returns:
    A boolean, true if the project was deployed successfully, false otherwise.
  """
  project_id = config.project['project_id']
  steps = _SETUP_STEPS + config.extra_steps

  starting_step = field_generation.get_generated_fields_copy(
      project_id, config.root).get('failed_step', 1)

  deployed = field_generation.is_deployed(project_id, config.root)

  total_steps = len(steps)
  for step_num in range(starting_step, total_steps + 1):
    step = steps[step_num - 1]
    project_id = config.project['project_id']
    logging.info('%s: step %d/%d (%s)', project_id, step_num, total_steps,
                 step.description)

    if deployed and not step.updatable:
      logging.info('Step %d is not updatable, skipping', step_num)
      continue

    try:
      step.func(config)
    except Exception as e:  # pylint: disable=broad-except
      traceback.print_exc()
      logging.error('%s: setup failed on step %s: %s', project_id, step_num, e)
      logging.error(
          'Failure information has been written to --output_yaml_path. '
          'Please ensure the config at --project_yaml is updated with any '
          'changes from the config at --output_yaml_path and re-run the '
          'script. (Note: only applicable if --output_yaml_path != '
          '--project_yaml)')

      # only record failed step if project was undeployed, an update can always
      # start from the beginning
      if not deployed:
        field_generation.get_generated_fields_ref(
            project_id, config.root)['failed_step'] = step_num
        field_generation.rewrite_generated_fields_back(project_yaml,
                                                       output_yaml_path,
                                                       config.root)

      return False

    field_generation.rewrite_generated_fields_back(project_yaml,
                                                   output_yaml_path,
                                                   config.root)

  # if this deployment was resuming from a previous failure, remove the
  # failed step as it is done
  if field_generation.is_generated_fields_exist(project_id, config.root):
    field_generation.get_generated_fields_ref(project_id, config.root,
                                              False).pop('failed_step', None)
  field_generation.rewrite_generated_fields_back(project_yaml, output_yaml_path,
                                                 config.root)
  logging.info('Setup completed successfully.')

  return True
Ejemplo n.º 21
0
  def answer(self, msg, error=False) -> None:
    logging.error(msg)
    self.write(json.dumps({'msg': msg, 'error': error}))
Ejemplo n.º 22
0
def main(argv):
  del argv  # Unused.

  if FLAGS.enable_new_style_resources:
    logging.info('--enable_new_style_resources is true.')

  FLAGS.output_yaml_path = utils.normalize_path(FLAGS.output_yaml_path)
  if FLAGS.output_rules_path:
    FLAGS.output_rules_path = utils.normalize_path(FLAGS.output_rules_path)

  FLAGS.project_yaml = utils.normalize_path(FLAGS.project_yaml)

  if FLAGS.enable_new_style_resources:
    config_string = runner.run_command(
        [
            FLAGS.load_config_binary,
            '--config_path',
            FLAGS.project_yaml,
        ],
        get_output=True)
    yaml = ruamel.yaml.YAML()
    root_config = yaml.load(config_string)
  else:
    root_config = utils.load_config(FLAGS.project_yaml)

  if not root_config:
    logging.error('Error loading project YAML.')
    return

  logging.info('Validating project YAML against schema.')
  try:
    utils.validate_config_yaml(root_config)
  except jsonschema.exceptions.ValidationError as e:
    logging.error('Error in YAML config: %s', e)
    return

  want_projects = set(FLAGS.projects)

  def want_project(project_config_dict):
    if not project_config_dict:
      return False

    return want_projects == {
        '*'
    } or project_config_dict['project_id'] in want_projects

  projects = []
  audit_logs_project = root_config.get('audit_logs_project')

  # Always deploy the remote audit logs project first (if present).
  if want_project(audit_logs_project):
    projects.append(
        ProjectConfig(
            root=root_config,
            project=audit_logs_project,
            audit_logs_project=None,
            extra_steps=[]))

  forseti_config = root_config.get('forseti')

  if forseti_config and want_project(forseti_config['project']):
    extra_steps = [
        Step(
            func=install_forseti,
            description='Install Forseti',
            updatable=False,
        ),
        get_forseti_access_granter_step(
            forseti_config['project']['project_id']),
    ]

    if audit_logs_project:
      extra_steps.append(
          get_forseti_access_granter_step(audit_logs_project['project_id']))

    forseti_project_config = ProjectConfig(
        root=root_config,
        project=forseti_config['project'],
        audit_logs_project=audit_logs_project,
        extra_steps=extra_steps)
    projects.append(forseti_project_config)

  for project_config in root_config.get('projects', []):
    if not want_project(project_config):
      continue

    extra_steps = []
    if forseti_config:
      extra_steps.append(
          get_forseti_access_granter_step(project_config['project_id']))

    projects.append(
        ProjectConfig(
            root=root_config,
            project=project_config,
            audit_logs_project=audit_logs_project,
            extra_steps=extra_steps))

  validate_project_configs(root_config['overall'], projects)

  logging.info('Found %d projects to deploy', len(projects))

  for config in projects:
    logging.info('Setting up project %s', config.project['project_id'])

    if not setup_project(config, FLAGS.project_yaml, FLAGS.output_yaml_path):
      # Don't attempt to deploy additional projects if one project failed.
      return

  if forseti_config:
    if FLAGS.enable_new_style_resources:
      call = [
          FLAGS.rule_generator_binary,
          '--project_yaml_path',
          FLAGS.project_yaml,
          '--output_path',
          FLAGS.output_rules_path or '',
      ]
      logging.info('Running rule generator: %s', call)
      utils.call_go_binary(call)
    else:
      rule_generator.run(root_config, output_path=FLAGS.output_rules_path)

  logging.info(
      'All projects successfully deployed. Please remember to sync '
      'any changes written to the config at --output_yaml_path with '
      '--project_yaml before running the script again (Note: only applicable '
      'if --output_yaml_path != --project_yaml)')
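
Step and get_forseti_access_granter_step are defined elsewhere in this module. A minimal sketch of the Step container, consistent with how it is constructed above (the exact definition is an assumption):

import collections

# A deployment step: the function to run, a human-readable description, and
# whether the step may safely be re-run when updating a deployed project.
Step = collections.namedtuple('Step', ['func', 'description', 'updatable'])
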
Ejemplo n.º 23
0
        HHblits default: False.
      alt: Show up to this many alternative alignments.
      p: Minimum Prob for a hit to be included in the output hhr file.
        HHblits default: 20.
      z: Hard cap on number of hits reported in the hhr file.
        HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.

    Raises:
      RuntimeError: If HHblits binary not found within the path.
    """
        self.binary_path = binary_path
        self.databases = databases

        for database_path in self.databases:
            if not glob.glob(database_path + '_*'):
                logging.error('Could not find HHblits database %s',
                              database_path)
                raise ValueError(
                    f'Could not find HHblits database {database_path}')

        self.n_cpu = n_cpu
        self.n_iter = n_iter
        self.e_value = e_value
        self.maxseq = maxseq
        self.realign_max = realign_max
        self.maxfilt = maxfilt
        self.min_prefilter_hits = min_prefilter_hits
        self.all_seqs = all_seqs
        self.alt = alt
        self.p = p
        self.z = z
Ejemplo n.º 24
0
# ==============================================================================

import csv
import sys
from absl import logging


def to_standard_format(input_file, output_file):
  logging.info("Save file to {}".format(output_file))

  with open(input_file, encoding="utf-8") as csv_file, \
    open(output_file, "w", encoding="utf-8") as out_file:
    csv_reader = csv.reader(csv_file)
    for row in csv_reader:
      if len(row) < 4:
        continue
      label = row[0]
      text = " ".join(row[1:])
      out_file.write(label + "\t" + text + "\n")


if __name__ == '__main__':
  logging.set_verbosity(logging.INFO)
  if len(sys.argv) != 3:
    logging.error("Usage {} input_file output_file".format(sys.argv[0]))
    sys.exit(-1)

  input_file = sys.argv[1]
  output_file = sys.argv[2]
  to_standard_format(input_file, output_file)
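
As a worked example with made-up data: a row of at least four fields is kept, its first field becomes the label, and the remaining fields are joined into the text.

# Made-up input row (>= 4 fields, so it is not skipped):
row = ["sports", "Team", "wins", "the final match"]
label, text = row[0], " ".join(row[1:])
print(label + "\t" + text)  # -> sports\tTeam wins the final match
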
Ejemplo n.º 25
0
def main(_):
  logging_verbosity = logging_level_verbosity(FLAGS.logging_verbosity)
  logging.set_verbosity(logging_verbosity)

  logging.error('WARNING: This tool is deprecated in favor of '
                'https://github.com/tensorflow/hub/tree/master/'
                'tensorflow_hub/tools/make_image_classifier')

  if not FLAGS.image_dir:
    logging.error('Must set flag --image_dir.')
    return -1

  prepare_file_system()

  image_lists = create_image_lists(FLAGS.image_dir, FLAGS.testing_percentage,
                                   FLAGS.validation_percentage)
  class_count = len(image_lists.keys())
  if class_count == 0:
    logging.error('No valid folders of images found at %s', FLAGS.image_dir)
    return -1
  if class_count == 1:
    logging.error('Only one valid folder of images found at %s '
                  ' - multiple classes are needed for classification.',
                  FLAGS.image_dir)
    return -1

  do_distort_images = should_distort_images(
      FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
      FLAGS.random_brightness)

  module_spec = hub.load_module_spec(FLAGS.tfhub_module)
  graph, bottleneck_tensor, resized_image_tensor, wants_quantization = (
      create_module_graph(module_spec))

  with graph.as_default():
    (train_step, cross_entropy, bottleneck_input,
     ground_truth_input, final_tensor) = add_final_retrain_ops(
         class_count, FLAGS.final_tensor_name, bottleneck_tensor,
         wants_quantization, is_training=True)

  with tf.Session(graph=graph) as sess:
    init = tf.global_variables_initializer()
    sess.run(init)

    jpeg_data_tensor, decoded_image_tensor = add_jpeg_decoding(module_spec)

    if do_distort_images:
      (distorted_jpeg_data_tensor,
       distorted_image_tensor) = add_input_distortions(
           FLAGS.flip_left_right, FLAGS.random_crop, FLAGS.random_scale,
           FLAGS.random_brightness, module_spec)
    else:
      cache_bottlenecks(sess, image_lists, FLAGS.image_dir,
                        FLAGS.bottleneck_dir, jpeg_data_tensor,
                        decoded_image_tensor, resized_image_tensor,
                        bottleneck_tensor, FLAGS.tfhub_module)

    evaluation_step, _ = add_evaluation_step(final_tensor, ground_truth_input)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.summaries_dir + '/train',
                                         sess.graph)

    validation_writer = tf.summary.FileWriter(
        FLAGS.summaries_dir + '/validation')

    train_saver = tf.train.Saver()

    for i in range(FLAGS.how_many_training_steps):
      if do_distort_images:
        (train_bottlenecks,
         train_ground_truth) = get_random_distorted_bottlenecks(
             sess, image_lists, FLAGS.train_batch_size, 'training',
             FLAGS.image_dir, distorted_jpeg_data_tensor,
             distorted_image_tensor, resized_image_tensor, bottleneck_tensor)
      else:
        (train_bottlenecks,
         train_ground_truth, _) = get_random_cached_bottlenecks(
             sess, image_lists, FLAGS.train_batch_size, 'training',
             FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
             decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
             FLAGS.tfhub_module)
      train_summary, _ = sess.run(
          [merged, train_step],
          feed_dict={bottleneck_input: train_bottlenecks,
                     ground_truth_input: train_ground_truth})
      train_writer.add_summary(train_summary, i)

      is_last_step = (i + 1 == FLAGS.how_many_training_steps)
      if (i % FLAGS.eval_step_interval) == 0 or is_last_step:
        train_accuracy, cross_entropy_value = sess.run(
            [evaluation_step, cross_entropy],
            feed_dict={bottleneck_input: train_bottlenecks,
                       ground_truth_input: train_ground_truth})
        logging.info('%s: Step %d: Train accuracy = %.1f%%',
                     datetime.now(), i, train_accuracy * 100)
        logging.info('%s: Step %d: Cross entropy = %f',
                     datetime.now(), i, cross_entropy_value)
        
        validation_bottlenecks, validation_ground_truth, _ = (
            get_random_cached_bottlenecks(
                sess, image_lists, FLAGS.validation_batch_size, 'validation',
                FLAGS.bottleneck_dir, FLAGS.image_dir, jpeg_data_tensor,
                decoded_image_tensor, resized_image_tensor, bottleneck_tensor,
                FLAGS.tfhub_module))
        validation_summary, validation_accuracy = sess.run(
            [merged, evaluation_step],
            feed_dict={bottleneck_input: validation_bottlenecks,
                       ground_truth_input: validation_ground_truth})
        validation_writer.add_summary(validation_summary, i)
        logging.info('%s: Step %d: Validation accuracy = %.1f%% (N=%d)',
                     datetime.now(), i, validation_accuracy * 100,
                     len(validation_bottlenecks))

      intermediate_frequency = FLAGS.intermediate_store_frequency

      if (intermediate_frequency > 0 and (i % intermediate_frequency == 0)
          and i > 0):
        train_saver.save(sess, FLAGS.checkpoint_path)
        intermediate_file_name = (FLAGS.intermediate_output_graphs_dir +
                                  'intermediate_' + str(i) + '.pb')
        logging.info('Saving intermediate result to %s', intermediate_file_name)
        save_graph_to_file(intermediate_file_name, module_spec,
                           class_count)

    train_saver.save(sess, FLAGS.checkpoint_path)

    run_final_eval(sess, module_spec, class_count, image_lists,
                   jpeg_data_tensor, decoded_image_tensor, resized_image_tensor,
                   bottleneck_tensor)

    logging.info('Saving final result to %s', FLAGS.output_graph)
    if wants_quantization:
      logging.info('The model is instrumented for quantization with TF-Lite')
    save_graph_to_file(FLAGS.output_graph, module_spec, class_count)
    with tf.gfile.GFile(FLAGS.output_labels, 'w') as f:
      f.write('\n'.join(image_lists.keys()) + '\n')

    if FLAGS.saved_model_dir:
      export_model(module_spec, class_count, FLAGS.saved_model_dir)
Ejemplo n.º 26
0
def run_ncf(_):
    """Run NCF training and eval with Keras."""

    keras_utils.set_session_config(enable_xla=FLAGS.enable_xla)

    if FLAGS.seed is not None:
        print("Setting tf seed")
        tf.random.set_seed(FLAGS.seed)

    # TODO(seemuch): Support different train and eval batch sizes
    if FLAGS.eval_batch_size != FLAGS.batch_size:
        logging.warning(
            "The Keras implementation of NCF currently does not support batch_size "
            "!= eval_batch_size ({} vs. {}). Overriding eval_batch_size to match "
            "batch_size".format(FLAGS.eval_batch_size, FLAGS.batch_size))
        FLAGS.eval_batch_size = FLAGS.batch_size

    params = ncf_common.parse_flags(FLAGS)
    model_helpers.apply_clean(flags.FLAGS)

    strategy = distribution_utils.get_distribution_strategy(
        distribution_strategy=FLAGS.distribution_strategy,
        num_gpus=FLAGS.num_gpus)
    params["distribute_strategy"] = strategy

    if not keras_utils.is_v2_0() and strategy is not None:
        logging.error(
            "NCF Keras only works with distribution strategy in TF 2.0")
        return

    if (params["keras_use_ctl"]
            and (not keras_utils.is_v2_0() or strategy is None)):
        logging.error(
            "Custom training loop only works with tensorflow 2.0 and dist strat."
        )
        return

    # ncf_common rounds eval_batch_size (this is needed due to a reshape during
    # eval). This carries over that rounding to batch_size as well; this is the
    # per-device batch size.
    params["batch_size"] = params["eval_batch_size"]
    batch_size = params["batch_size"]

    time_callback = keras_utils.TimeHistory(batch_size, FLAGS.log_steps)
    callbacks = [time_callback]

    producer, input_meta_data = None, None
    generate_input_online = params["train_dataset_path"] is None

    if generate_input_online:
        # Start data producing thread.
        num_users, num_items, num_train_steps, num_eval_steps, producer = (
            ncf_common.get_inputs(params))
        producer.start()
        per_epoch_callback = IncrementEpochCallback(producer)
        callbacks.append(per_epoch_callback)
    else:
        assert params["eval_dataset_path"] and params["input_meta_data_path"]
        with tf.io.gfile.GFile(params["input_meta_data_path"], "rb") as reader:
            input_meta_data = json.loads(reader.read().decode("utf-8"))
            num_users = input_meta_data["num_users"]
            num_items = input_meta_data["num_items"]

    params["num_users"], params["num_items"] = num_users, num_items
    (train_input_dataset, eval_input_dataset, num_train_steps, num_eval_steps) = \
        (ncf_input_pipeline.create_ncf_input_data(
            params, producer, input_meta_data))
    steps_per_epoch = None if generate_input_online else num_train_steps

    if FLAGS.early_stopping:
        early_stopping_callback = CustomEarlyStopping(
            "val_HR_METRIC", desired_value=FLAGS.hr_threshold)
        callbacks.append(early_stopping_callback)
    with distribution_utils.get_strategy_scope(strategy):
        keras_model = _get_keras_model(params)
        optimizer = tf.keras.optimizers.Adam(
            learning_rate=params["learning_rate"],
            beta_1=params["beta1"],
            beta_2=params["beta2"],
            epsilon=params["epsilon"])

    if params["keras_use_ctl"]:
        loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
            reduction="sum", from_logits=True)
        train_input_iterator = strategy.make_dataset_iterator(
            train_input_dataset)
        eval_input_iterator = strategy.make_dataset_iterator(
            eval_input_dataset)

        @tf.function
        def train_step():
            """Called once per step to train the model."""
            def step_fn(features):
                """Computes loss and applied gradient per replica."""
                with tf.GradientTape() as tape:
                    softmax_logits = keras_model(features)
                    labels = features[rconst.TRAIN_LABEL_KEY]
                    loss = loss_object(
                        labels,
                        softmax_logits,
                        sample_weight=features[rconst.VALID_POINT_MASK])
                    loss *= (1.0 /
                             (batch_size * strategy.num_replicas_in_sync))

                grads = tape.gradient(loss, keras_model.trainable_variables)
                # Converting gradients to dense form improves GPU performance
                # for NCF.
                grads = neumf_model.sparse_to_dense_grads(
                    list(zip(grads, keras_model.trainable_variables)))
                optimizer.apply_gradients(grads)
                return loss

            per_replica_losses = strategy.experimental_run(
                step_fn, train_input_iterator)
            mean_loss = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                        per_replica_losses,
                                        axis=None)
            return mean_loss

        @tf.function
        def eval_step():
            """Called once per eval step to compute eval metrics."""
            def step_fn(features):
                """Computes eval metrics per replica."""
                softmax_logits = keras_model(features)
                in_top_k, metric_weights = metric_fn(
                    softmax_logits, features[rconst.DUPLICATE_MASK], params)
                hr_sum = tf.reduce_sum(in_top_k * metric_weights)
                hr_count = tf.reduce_sum(metric_weights)
                return hr_sum, hr_count

            per_replica_hr_sum, per_replica_hr_count = (
                strategy.experimental_run(step_fn, eval_input_iterator))
            hr_sum = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                     per_replica_hr_sum,
                                     axis=None)
            hr_count = strategy.reduce(tf.distribute.ReduceOp.SUM,
                                       per_replica_hr_count,
                                       axis=None)
            return hr_sum, hr_count

        time_callback.on_train_begin()
        for epoch in range(FLAGS.train_epochs):
            for cb in callbacks:
                cb.on_epoch_begin(epoch)

            # Because the NCF dataset is sampled with randomness, not
            # repeating data elements across epochs has a significant impact
            # on convergence. The offline-generated TFRecord files already
            # contain data for every epoch, so the dataset only needs to be
            # initialized when input is generated online.
            if generate_input_online:
                train_input_iterator.initialize()

            train_loss = 0
            for step in range(num_train_steps):
                time_callback.on_batch_begin(step + epoch * num_train_steps)
                train_loss += train_step()
                time_callback.on_batch_end(step + epoch * num_train_steps)
            train_loss /= num_train_steps
            logging.info("Done training epoch %s, epoch loss=%s.", epoch + 1,
                         train_loss)
            eval_input_iterator.initialize()
            hr_sum = 0
            hr_count = 0
            for _ in range(num_eval_steps):
                step_hr_sum, step_hr_count = eval_step()
                hr_sum += step_hr_sum
                hr_count += step_hr_count
            logging.info("Done eval epoch %s, hr=%s.", epoch + 1,
                         hr_sum / hr_count)

            if (FLAGS.early_stopping
                    and float(hr_sum / hr_count) > params["hr_threshold"]):
                break

        time_callback.on_train_end()
        eval_results = [None, hr_sum / hr_count]

    else:
        with distribution_utils.get_strategy_scope(strategy):
            # TODO(b/138957587): Remove when force_v2_in_keras_compile is no
            # longer a valid arg for this model. Also remove as a valid flag.
            if FLAGS.force_v2_in_keras_compile is not None:
                keras_model.compile(
                    optimizer=optimizer,
                    run_eagerly=FLAGS.run_eagerly,
                    experimental_run_tf_function=FLAGS.force_v2_in_keras_compile)
            else:
                keras_model.compile(optimizer=optimizer,
                                    run_eagerly=FLAGS.run_eagerly)

            history = keras_model.fit(train_input_dataset,
                                      epochs=FLAGS.train_epochs,
                                      steps_per_epoch=steps_per_epoch,
                                      callbacks=callbacks,
                                      validation_data=eval_input_dataset,
                                      validation_steps=num_eval_steps,
                                      verbose=2)

            logging.info("Training done. Start evaluating")

            eval_results = keras_model.evaluate(eval_input_dataset,
                                                steps=num_eval_steps,
                                                verbose=2)

            logging.info("Keras evaluation is done.")

        if history and history.history:
            train_history = history.history
            train_loss = train_history["loss"][-1]

    stats = build_stats(train_loss, eval_results, time_callback)
    return stats
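
neumf_model.sparse_to_dense_grads, used in the custom loop above, densifies embedding gradients. A sketch of the idea, assuming the real helper behaves along these lines:

import tensorflow as tf


def sparse_to_dense_grads(grads_and_vars):
    """Illustrative sketch: convert IndexedSlices gradients to dense tensors."""
    # Embedding lookups yield tf.IndexedSlices gradients; densifying them
    # tends to be faster on GPU for NCF-sized models.
    return [(tf.convert_to_tensor(g) if isinstance(g, tf.IndexedSlices) else g,
             v) for g, v in grads_and_vars]
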
Ejemplo n.º 27
0
def main(argv):
  if len(argv) != 3 and len(argv) != 5:
    raise app.UsageError(
        'Invalid arguments.\n'
        'Usage: %s generate key-file.\n'
        'Usage: %s encrypt/decrypt key-file '
        'input-file output-file.' % (argv[0], argv[0])
        )

  mode = argv[1]
  if mode not in ('encrypt', 'decrypt', 'generate'):
    raise app.UsageError(
        'The first argument should be either encrypt, decrypt or generate')

  key_file_path = argv[2]
  input_file_path = argv[3] if len(argv) == 5 else None
  output_file_path = argv[4] if len(argv) == 5 else None

  # Initialise Tink
  try:
    aead.register()
  except tink.TinkError as e:
    logging.error('Error initialising Tink: %s', e)
    return 1

  if mode == 'generate':
    # [START generate-a-new-keyset]
    # Generate a new keyset
    try:
      key_template = aead.aead_key_templates.AES128_GCM
      keyset_handle = tink.KeysetHandle.generate_new(key_template)
    except tink.TinkError as e:
      logging.exception('Error generating keyset: %s', e)
      return 1
    # [END generate-a-new-keyset]

    # [START store-a-cleartext-keyset]
    with open(key_file_path, 'wt') as keyset_file:
      try:
        cleartext_keyset_handle.write(
            tink.JsonKeysetWriter(keyset_file), keyset_handle)
      except tink.TinkError as e:
        logging.exception('Error writing key: %s', e)
        return 1
    return 0
    # [END store-a-cleartext-keyset]

  # Use the input keyset to encrypt/decrypt data

  # Read the keyset into a keyset_handle
  with open(key_file_path, 'rt') as keyset_file:
    try:
      text = keyset_file.read()
      keyset_handle = cleartext_keyset_handle.read(tink.JsonKeysetReader(text))
    except tink.TinkError as e:
      logging.exception('Error reading key: %s', e)
      return 1

  # Get the primitive
  try:
    cipher = keyset_handle.primitive(aead.Aead)
  except tink.TinkError as e:
    logging.error('Error creating primitive: %s', e)
    return 1

  with open(input_file_path, 'rb') as input_file:
    input_data = input_file.read()
    if mode == 'decrypt':
      output_data = cipher.decrypt(input_data, b'envelope_example')
    elif mode == 'encrypt':
      output_data = cipher.encrypt(input_data, b'envelope_example')
    else:
      logging.error(
          'Unsupported mode. Please choose "encrypt" or "decrypt".')
      return 1

    with open(output_file_path, 'wb') as output_file:
      output_file.write(output_data)
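
Assuming this script were saved as cleartext_aead.py (a hypothetical name), typical invocations would be:

# python cleartext_aead.py generate my_keyset.json
# python cleartext_aead.py encrypt my_keyset.json plaintext.txt ciphertext.bin
# python cleartext_aead.py decrypt my_keyset.json ciphertext.bin decrypted.txt
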
Ejemplo n.º 28
0
    def launch(self) -> Optional[data_types.ExecutionInfo]:
        """Executes the component, includes driver, executor and publisher.

    Returns:
      The metadata of this execution that is registered in MLMD. It can be None
      if the driver decides not to run the execution.

    Raises:
      Exception: If the executor fails.
    """
        logging.info('Running launcher for %s', self._pipeline_node)
        if self._system_node_handler:
            # If this is a system node, runs it and directly return.
            return self._system_node_handler.run(self._mlmd_connection,
                                                 self._pipeline_node,
                                                 self._pipeline_info,
                                                 self._pipeline_runtime_spec)

        # Runs as a normal node.
        execution_preparation_result = self._prepare_execution()
        (execution_info, contexts, is_execution_needed) = (
            execution_preparation_result.execution_info,
            execution_preparation_result.contexts,
            execution_preparation_result.is_execution_needed)
        if is_execution_needed:
            try:
                executor_watcher = None
                if self._executor_operator:
                    # Create an execution watcher and attach an in-memory copy
                    # of the Execution object to it. The launcher calls the
                    # executor operator in process, so there is no race between
                    # the execution watcher and the launcher writing to MLMD.
                    executor_watcher = execution_watcher.ExecutionWatcher(
                        port=portpicker.pick_unused_port(),
                        mlmd_connection=self._mlmd_connection,
                        execution=execution_preparation_result.
                        execution_metadata,
                        creds=grpc.local_server_credentials())
                    self._executor_operator.with_execution_watcher(
                        executor_watcher.address)
                    executor_watcher.start()
                executor_output = self._run_executor(execution_info)
            except Exception as e:  # pylint: disable=broad-except
                execution_output = (e.executor_output if isinstance(
                    e, _ExecutionFailedError) else None)
                self._publish_failed_execution(execution_info.execution_id,
                                               contexts, execution_output)
                logging.error('Execution %d failed.',
                              execution_info.execution_id)
                raise
            finally:
                self._clean_up_stateless_execution_info(execution_info)
                if executor_watcher:
                    executor_watcher.stop()

            logging.info('Execution %d succeeded.',
                         execution_info.execution_id)
            self._clean_up_stateful_execution_info(execution_info)

            # TODO(b/182316162): Unify publisher handing so that post-execution
            # artifact logic is more cleanly handled.
            # Note that currently both the ExecutionInfo and ExecutorOutput are
            # consulted in `execution_publish_utils.publish_succeeded_execution()`.
            outputs_utils.tag_executor_output_with_version(executor_output)
            outputs_utils.tag_output_artifacts_with_version(
                execution_info.output_dict)
            logging.info('Publishing output artifacts %s for execution %s',
                         execution_info.output_dict,
                         execution_info.execution_id)
            self._publish_successful_execution(execution_info.execution_id,
                                               contexts,
                                               execution_info.output_dict,
                                               executor_output)
        return execution_info
Ejemplo n.º 29
0
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Loads icp op."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl import logging
import tensorflow as tf

try:
    icp_op_module = tf.load_op_library('./ops/icp_op.so')
    icp = icp_op_module.icp
except Exception:  # pylint: disable=broad-except
    try:
        icp_op_module = tf.load_op_library('./icp_op.so')
        icp = icp_op_module.icp
    except Exception:  # pylint: disable=broad-except
        logging.error('Could not load object file for ICP op.')
        icp = None
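
Since icp is None when neither shared object can be loaded, callers typically guard before use. A minimal sketch (the op's argument list here is an assumption for illustration):

def run_icp_or_warn(source_points, target_points):
    """Hypothetical guard around the optionally-loaded ICP op."""
    if icp is None:
        # Neither .so could be loaded; degrade gracefully.
        logging.error('ICP op unavailable; skipping alignment.')
        return None
    return icp(source_points, target_points)  # Signature assumed.
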
Ejemplo n.º 30
0
def main(argv):
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    # Report failed checks as they occur and maintain a counter, instead of
    # raising exceptions right away, so all issues can be reported at once.
    num_failed_checks = 0

    # Load dataset_spec, this should fail if it is absent or incorrect.
    if FLAGS.dataset_spec_file is None:
        dataset_spec = dataset_spec_lib.load_dataset_spec(
            FLAGS.dataset_records_path)
    else:
        with tf.io.gfile.GFile(FLAGS.dataset_spec_file, 'r') as f:
            dataset_spec = json.load(
                f, object_hook=dataset_spec_lib.as_dataset_spec)

    dataset_spec.initialize()

    # 1. Check dataset name
    dir_name = os.path.basename(os.path.abspath(FLAGS.dataset_records_path))
    if dataset_spec.name != dir_name:
        num_failed_checks += 1
        logging.error(
            'The dataset name in "dataset_spec.json" (%s) does not match '
            'the name of the directory containing it (%s)', dataset_spec.name,
            dir_name)

    # 2. Check name and number of .tfrecords files
    num_classes = len(dataset_spec.class_names)
    try:
        expected_filenames = [
            dataset_spec.file_pattern.format(class_id)
            for class_id in range(num_classes)
        ]
    except IndexError:
        num_failed_checks += 1
        err_msg = (
            'The `file_pattern` (%s) did not accept the class number as its only '
            'formatting argument. Using the default (%s).')
        default_pattern = '{}.tfrecords'
        logging.error(err_msg, dataset_spec.file_pattern, default_pattern)

        expected_filenames = [
            default_pattern.format(class_id) for class_id in range(num_classes)
        ]

    all_filenames = tf.io.gfile.listdir(FLAGS.dataset_records_path)
    # Heuristic to exclude obviously-not-tfrecords files.
    tfrecords_filenames = [
        f for f in all_filenames if 'tfrecords' in f.lower()
    ]

    expected_set = set(expected_filenames)
    present_set = set(tfrecords_filenames)
    if expected_set != present_set:
        num_failed_checks += 1
        logging.error(
            'The tfrecords files in %s do not match the dataset_spec.\n'
            'Unexpected files present:\n'
            '%s\n'
            'Expected files not present:\n'
            '%s', FLAGS.dataset_records_path,
            sorted(present_set - expected_set),
            sorted(expected_set - present_set))

    # Iterate through each dataset, count examples and check set of targets.
    # List of (class_id, expected_count, actual_count) triples.
    bad_counts = []
    # List of (filename, class_id, labels).
    bad_labels = []

    for class_id, filename in enumerate(expected_filenames):
        expected_count = dataset_spec.get_total_images_per_class(class_id)
        if filename not in tfrecords_filenames:
            # The tfrecords does not exist, we use a negative count to denote it.
            bad_counts.append((class_id, expected_count, -1))
            bad_labels.append((filename, class_id, set()))
            continue
        full_filepath = os.path.join(FLAGS.dataset_records_path, filename)

        try:
            count, labels = get_count_and_labels(full_filepath,
                                                 FLAGS.label_field_name)
        except tf.errors.InvalidArgumentError:
            logging.exception(
                'Unable to find label (%s) in the tf.Examples of file %s. '
                'Maybe try a different --label_field_name.',
                FLAGS.label_field_name, filename)
            # Fall back to counting examples only.
            count = count_records(full_filepath)
            labels = set()
        if count != expected_count:
            bad_counts.append((class_id, expected_count, count))
        if labels != {class_id}:
            # labels could include class_id among other, incorrect labels.
            bad_labels.append((filename, class_id, labels))

    # 3. Check number of examples
    if bad_counts:
        num_failed_checks += 1
        logging.error(
            'The number of tfrecords in the following files does not match '
            'the expected number of examples in that class.\n'
            '(class_id, expected, actual)  # -1 denotes a missing file.\n'
            '%s', bad_counts)

    # 4. Check the targets stored in the tfrecords files.
    if bad_labels:
        num_failed_checks += 1
        logging.error(
            'The labels stored inside the tfrecords (in field %s) do not '
            'all match the expected value (class_id).\n'
            '(filename, class_id, values)\n'
            '%s', FLAGS.label_field_name, bad_labels)

    # Report results
    if num_failed_checks:
        raise ValueError('%d checks failed. See the error-level logs.' %
                         num_failed_checks)
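
This example relies on helpers that are not shown; a minimal sketch of
`count_records` consistent with how it is called above (assuming eager
execution, where `tf.data.TFRecordDataset` is directly iterable):

def count_records(filepath):
    # Counts serialized tf.Examples in one tfrecords file without parsing them.
    return sum(1 for _ in tf.data.TFRecordDataset(filepath))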
Example #31
  def _ProcessHost(self, d):
    """Retrieves recovery data from an LDAP host and escrows to CauliflowerVest.

    Args:
      d: a single ldap.conn.result3() result dictionary.
    Raises:
      InvalidDistinguishedName: the given host had an invalid DN.
      InvalidGuid: the given host had an invalid GUID.
    """
    dn = d['distinguishedName'][0]
    # Parse the hostname out of the distinguishedName, which is in this format:
    #   CN=<timestamp>{<recovery_guid>},CN=<hostname>,OU=Workstations,...
    hostname = dn.split(',')[1][len('CN='):]

    # Ignore records with legacy DNs, as they have invalid RecoveryGUIDs,
    # and all have separate valid records.
    if INVALID_DN_REGEX.search(dn):
      raise InvalidDistinguishedName(dn)

    # Some msFVE-RecoveryGuid values may be invalid, so carefully attempt to
    # construct the recovery_guid, and skip over objects which are invalid.
    try:
      recovery_guid = str(
          uuid.UUID(bytes_le=d['msFVE-RecoveryGuid'][0])).upper()
      volume_guid = str(
          uuid.UUID(bytes_le=d['msFVE-VolumeGuid'][0])).upper()
    except ValueError:
      raise InvalidGuid(
          '%s: %s' % (hostname, d['msFVE-RecoveryGuid']))

    if FLAGS.redact_recovery_passwords:
      recovery_password = '******'
    else:
      recovery_password = d['msFVE-RecoveryPassword'][0]

    when_created = d['whenCreated'][0]
    try:
      datetime.datetime.strptime(when_created, '%Y%m%d%H%M%S.0Z')
    except ValueError:
      logging.error('Unknown whenCreated format: %r', when_created)
      when_created = ''

    parent_guid = None
    # msFVE-RecoveryObject distinguishedName is in the form of:
    #   CN=<TIMESTAMP>{<UUID>},CN=<HOSTNAME>,DC=example,DC=com
    # where CN=<HOSTNAME>,.* is the parent's distinguishedName.
    # Given that the msFVE-RecoveryObject is a child of the parent host,
    # split off the child to obtain the parent's DN.
    parent_dn = dn.split(',', 1)[1]
    # Alternatively:  parent_dn = dn.replace('CN=%s,' % d['name'][0], '')
    ldap_filter = '(&(objectCategory=computer))'
    for host in self._QueryLdap(parent_dn, ldap_filter, scope=ldap.SCOPE_BASE):
      parent_guid = str(uuid.UUID(bytes_le=host['objectGUID'][0])).upper()

    metadata = {
        'hostname': hostname,
        'dn': dn,
        'when_created': when_created,
        'parent_guid': parent_guid,
        'recovery_guid': recovery_guid,
    }

    self.client.UploadPassphrase(volume_guid, recovery_password, metadata)
    logging.info('Escrowed recovery password: %r', volume_guid)
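
For reference, `uuid.UUID(bytes_le=...)` (used twice above) byte-swaps the
first three UUID fields; a small illustration with made-up bytes:

raw = b'\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10'
print(str(uuid.UUID(bytes_le=raw)).upper())
# -> 04030201-0605-0807-090A-0B0C0D0E0F10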
Example #32
def run_customized_training_loop(
        # pylint: disable=invalid-name
        _sentinel=None,
        # pylint: enable=invalid-name
        strategy=None,
        model_fn=None,
        loss_fn=None,
        model_dir=None,
        train_input_fn=None,
        steps_per_epoch=None,
        steps_per_loop=1,
        epochs=1,
        eval_input_fn=None,
        eval_steps=None,
        metric_fn=None,
        init_checkpoint=None,
        custom_callbacks=None,
        run_eagerly=False,
        sub_model_export_name=None):
    """Run BERT pretrain model training using low-level API.

  Arguments:
      _sentinel: Used to prevent positional parameters. Internal, do not use.
      strategy: Distribution strategy on which to run low level training loop.
      model_fn: Function that returns a tuple (model, sub_model). Caller of this
        function should add optimizer to the `model` via calling
        `model.compile()` API or manually setting `model.optimizer` attribute.
        The second element of the returned tuple (`sub_model`) is an optional
        sub model to be used for loading the initial checkpoint, if provided.
      loss_fn: Function with signature func(labels, logits) that returns a loss
        tensor.
      model_dir: Model directory used during training for restoring/saving model
        weights.
      train_input_fn: Function that returns a tf.data.Dataset used for training.
      steps_per_epoch: Number of steps to run per epoch. At the end of each
        epoch, model checkpoint will be saved and evaluation will be conducted
        if evaluation dataset is provided.
      steps_per_loop: Number of steps per graph-mode loop. In order to reduce
        communication in eager context, training logs are printed every
        steps_per_loop.
      epochs: Number of epochs to train.
      eval_input_fn: Function that returns evaluation dataset. If none,
        evaluation is skipped.
      eval_steps: Number of steps to run evaluation. Required if `eval_input_fn`
        is not none.
      metric_fn: A metrics function that returns a Keras Metric object, used to
        record evaluation results on the evaluation dataset (or the training
        dataset) after every epoch.
      init_checkpoint: Optional checkpoint to load to `sub_model` returned by
        `model_fn`.
      custom_callbacks: A list of Keras Callbacks objects to run during
        training. More specifically, the `on_batch_begin()` and
        `on_batch_end()` methods are invoked during training.
      run_eagerly: Whether to run model training in pure eager execution. This
        should be disabled for TPUStrategy.
      sub_model_export_name: If not None, will export `sub_model` returned by
        `model_fn` into checkpoint files. The name of intermediate checkpoint
        file is {sub_model_export_name}_step_{step}.ckpt and the last
        checkpoint's name is {sub_model_export_name}.ckpt;
        if None, `sub_model` will not be exported as checkpoint.

  Returns:
      Trained model.

  Raises:
      ValueError: (1) When the model returned by `model_fn` does not have an
        optimizer attribute, or when required parameters are set to None. (2)
        When eval args are not specified correctly. (3) When `metric_fn` is
        specified but not callable. (4) When `sub_model_export_name` is
        specified but `sub_model` returned by `model_fn` is None.
  """

    if _sentinel is not None:
        raise ValueError('only call `run_customized_training_loop()` '
                         'with named arguments.')

    required_arguments = [
        strategy, model_fn, loss_fn, model_dir, steps_per_epoch, train_input_fn
    ]
    if [arg for arg in required_arguments if arg is None]:
        raise ValueError('`strategy`, `model_fn`, `loss_fn`, `model_dir`, '
                         '`steps_per_epoch` and `train_input_fn` are required '
                         'parameters.')
    if steps_per_loop > steps_per_epoch:
        logging.error(
            'steps_per_loop (%d) is greater than steps_per_epoch (%d); '
            'using steps_per_epoch as steps_per_loop.', steps_per_loop,
            steps_per_epoch)
        steps_per_loop = steps_per_epoch
    assert tf.executing_eagerly()

    if run_eagerly:
        if steps_per_loop > 1:
            raise ValueError(
                'steps_per_loop is used for performance optimization. When you want '
                'to run eagerly, you cannot leverage graph mode loop.')
        if isinstance(strategy, tf.distribute.experimental.TPUStrategy):
            raise ValueError(
                'TPUStrategy should not run eagerly as it heavily relies on graph'
                ' optimization for the distributed system.')

    if eval_input_fn and (eval_steps is None or metric_fn is None):
        raise ValueError(
            '`eval_steps` and `metric_fn` are required when `eval_input_fn` '
            'is not None.')
    if metric_fn and not callable(metric_fn):
        raise ValueError(
            'if `metric_fn` is specified, metric_fn must be a callable.')

    total_training_steps = steps_per_epoch * epochs

    # To reduce unnecessary send/receive of input pipeline operations, we place
    # the input pipeline ops in the worker task.
    train_iterator = _get_input_iterator(train_input_fn, strategy)

    with distribution_utils.get_strategy_scope(strategy):
        # To correctly place the model weights on accelerators,
        # model and optimizer should be created in scope.
        model, sub_model = model_fn()
        if not hasattr(model, 'optimizer'):
            raise ValueError('User should set optimizer attribute to model '
                             'inside `model_fn`.')
        if sub_model_export_name and sub_model is None:
            raise ValueError('sub_model_export_name is specified as %s, but '
                             'sub_model is None.' % sub_model_export_name)

        optimizer = model.optimizer
        use_float16 = isinstance(
            optimizer,
            tf.keras.mixed_precision.experimental.LossScaleOptimizer)

        if init_checkpoint:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'initial checkpoint for core model.', init_checkpoint)
            checkpoint = tf.train.Checkpoint(model=sub_model)
            checkpoint.restore(
                init_checkpoint).assert_existing_objects_matched()
            logging.info('Loading from checkpoint file completed')

        train_loss_metric = tf.keras.metrics.Mean('training_loss',
                                                  dtype=tf.float32)
        eval_metrics = [metric_fn()] if metric_fn else []
        # If evaluation is required, make a copy of each metric, as the same
        # object cannot be shared between training and evaluation.
        train_metrics = [
            metric.__class__.from_config(metric.get_config())
            for metric in eval_metrics
        ]

        # Create summary writers
        summary_dir = os.path.join(model_dir, 'summaries')
        eval_summary_writer = tf.summary.create_file_writer(
            os.path.join(summary_dir, 'eval'))
        if steps_per_loop >= _MIN_SUMMARY_STEPS:
            # Only write summaries when stats have been aggregated over
            # sufficiently many steps.
            train_summary_writer = tf.summary.create_file_writer(
                os.path.join(summary_dir, 'train'))
        else:
            train_summary_writer = None

        # Collects training variables.
        training_vars = model.trainable_variables

        def _replicated_step(inputs):
            """Replicated training step."""

            inputs, labels = inputs
            with tf.GradientTape() as tape:
                model_outputs = model(inputs, training=True)
                loss = loss_fn(labels, model_outputs)
                if use_float16:
                    scaled_loss = optimizer.get_scaled_loss(loss)

            if use_float16:
                scaled_grads = tape.gradient(scaled_loss, training_vars)
                grads = optimizer.get_unscaled_gradients(scaled_grads)
            else:
                grads = tape.gradient(loss, training_vars)
            optimizer.apply_gradients(zip(grads, training_vars))
            # For reporting, the metric takes the mean of losses.
            train_loss_metric.update_state(loss)
            for metric in train_metrics:
                metric.update_state(labels, model_outputs)

        @tf.function
        def train_steps(iterator, steps):
            """Performs distributed training steps in a loop.

      Args:
        iterator: the distributed iterator of training datasets.
        steps: a tf.int32 integer tensor specifying the number of steps to run
          inside the host training loop.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
            if not isinstance(steps, tf.Tensor):
                raise ValueError(
                    'steps should be a Tensor; a Python object may cause '
                    'retracing.')

            for _ in tf.range(steps):
                strategy.experimental_run_v2(_replicated_step,
                                             args=(next(iterator), ))

        def train_single_step(iterator):
            """Performs a distributed training step.

      Args:
        iterator: the distributed iterator of training datasets.

      Raises:
        ValueError: Any of the arguments or tensor shapes are invalid.
      """
            strategy.experimental_run_v2(_replicated_step,
                                         args=(next(iterator), ))

        def test_step(iterator):
            """Calculates evaluation metrics on distributed devices."""
            def _test_step_fn(inputs):
                """Replicated accuracy calculation."""

                inputs, labels = inputs
                model_outputs = model(inputs, training=False)
                for metric in eval_metrics:
                    metric.update_state(labels, model_outputs)

            strategy.experimental_run_v2(_test_step_fn,
                                         args=(next(iterator), ))

        if not run_eagerly:
            train_single_step = tf.function(train_single_step)
            test_step = tf.function(test_step)

        def _run_evaluation(current_training_step, test_iterator):
            """Runs validation steps and aggregate metrics."""
            for _ in range(eval_steps):
                test_step(test_iterator)

            with eval_summary_writer.as_default():
                for metric in eval_metrics + model.metrics:
                    metric_value = _float_metric_value(metric)
                    logging.info('Step: [%d] Validation %s = %f',
                                 current_training_step, metric.name,
                                 metric_value)
                    tf.summary.scalar(metric.name,
                                      metric_value,
                                      step=current_training_step)
                eval_summary_writer.flush()

        def _run_callbacks_on_batch_begin(batch):
            """Runs custom callbacks at the start of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                callback.on_batch_begin(batch)

        def _run_callbacks_on_batch_end(batch, logs):
            """Runs custom callbacks at the end of every step."""
            if not custom_callbacks:
                return
            for callback in custom_callbacks:
                callback.on_batch_end(batch, logs)

        # Training loop starts here.
        checkpoint = tf.train.Checkpoint(model=model, optimizer=optimizer)
        sub_model_checkpoint = tf.train.Checkpoint(
            model=sub_model) if sub_model_export_name else None

        latest_checkpoint_file = tf.train.latest_checkpoint(model_dir)
        if latest_checkpoint_file:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'checkpoint', latest_checkpoint_file)
            checkpoint.restore(latest_checkpoint_file)
            logging.info('Loading from checkpoint file completed')

        current_step = optimizer.iterations.numpy()
        checkpoint_name = 'ctl_step_{step}.ckpt'

        while current_step < total_training_steps:
            # Training loss/metrics are averaged over the steps inside the
            # micro training loop, so we reset their values before each round.
            train_loss_metric.reset_states()
            for metric in train_metrics + model.metrics:
                metric.reset_states()

            _run_callbacks_on_batch_begin(current_step)
            # Runs several steps in the host while loop.
            steps = steps_to_run(current_step, steps_per_epoch, steps_per_loop)

            if steps == 1:
                # TODO(zongweiz): merge with train_steps once tf.while_loop
                # GPU performance bugs are fixed.
                train_single_step(train_iterator)
            else:
                # Converts steps to a Tensor to avoid tf.function retracing.
                train_steps(train_iterator,
                            tf.convert_to_tensor(steps, dtype=tf.int32))
            train_loss = _float_metric_value(train_loss_metric)
            _run_callbacks_on_batch_end(current_step, {'loss': train_loss})
            current_step += steps

            # Updates training logging.
            training_status = 'Train Step: %d/%d  / loss = %s' % (
                current_step, total_training_steps, train_loss)

            if train_summary_writer:
                with train_summary_writer.as_default():
                    tf.summary.scalar(train_loss_metric.name,
                                      train_loss,
                                      step=current_step)
                    for metric in train_metrics + model.metrics:
                        metric_value = _float_metric_value(metric)
                        training_status += '  %s = %f' % (metric.name,
                                                          metric_value)
                        tf.summary.scalar(metric.name,
                                          metric_value,
                                          step=current_step)
                    train_summary_writer.flush()
            logging.info(training_status)

            # Saves model checkpoints and runs validation steps at every epoch end.
            if current_step % steps_per_epoch == 0:
                # To avoid repeated model saving, we do not save after the last
                # step of training.
                if current_step < total_training_steps:
                    _save_checkpoint(checkpoint, model_dir,
                                     checkpoint_name.format(step=current_step))
                    if sub_model_export_name:
                        _save_checkpoint(
                            sub_model_checkpoint, model_dir,
                            '%s_step_%d.ckpt' %
                            (sub_model_export_name, current_step))
                if eval_input_fn:
                    logging.info('Running evaluation after step: %s.',
                                 current_step)
                    _run_evaluation(
                        current_step,
                        _get_input_iterator(eval_input_fn, strategy))
                    # Re-initialize evaluation metric.
                    for metric in eval_metrics + model.metrics:
                        metric.reset_states()

        _save_checkpoint(checkpoint, model_dir,
                         checkpoint_name.format(step=current_step))
        if sub_model_export_name:
            _save_checkpoint(sub_model_checkpoint, model_dir,
                             '%s.ckpt' % sub_model_export_name)

        if eval_input_fn:
            logging.info(
                'Running final evaluation after training is complete.')
            _run_evaluation(current_step,
                            _get_input_iterator(eval_input_fn, strategy))

        training_summary = {
            'total_training_steps': total_training_steps,
            'train_loss': _float_metric_value(train_loss_metric),
        }
        if eval_metrics:
            # TODO(hongkuny): Cleans up summary reporting in text.
            training_summary['last_train_metrics'] = _float_metric_value(
                train_metrics[0])
            training_summary['eval_metrics'] = _float_metric_value(
                eval_metrics[0])

        write_txt_summary(training_summary, summary_dir)

        return model
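
This example calls several helpers that are not shown. Below are hedged
reconstructions consistent with how they are used above (the names match the
call sites; the bodies are assumptions, not the original implementations):

def _float_metric_value(metric):
    # Reads a scalar Keras metric value as a Python float.
    return metric.result().numpy().astype(float)


def steps_to_run(current_step, steps_per_epoch, steps_per_loop):
    # Runs up to `steps_per_loop` steps, but never crosses an epoch boundary,
    # so the per-epoch checkpoint/eval logic above still fires exactly on time.
    if steps_per_loop <= 0:
        raise ValueError('steps_per_loop should be a positive integer.')
    remainder = current_step % steps_per_epoch
    if remainder != 0:
        return min(steps_per_epoch - remainder, steps_per_loop)
    return min(steps_per_epoch, steps_per_loop)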
Example #33
def run(target, is_chief, device_fn):
  """Run training.

  Args:
     target: The target of the TensorFlow standard server to use. Can be the
       empty string to run locally using an inprocess server.
     is_chief: Boolean indicating whether this process is the chief.
     device_fn: Device function used to assign ops to devices.
  """
  if not FLAGS.dataset_config_pbtxt:
    logging.error('Need to specify --dataset_config_pbtxt')
    return

  g = tf.Graph()
  with g.as_default():
    model = modeling.get_model(FLAGS.model_name)
    dataset = data_providers.get_dataset(FLAGS.dataset_config_pbtxt)
    print('Running training on {} with model {}\n'.format(dataset, model))

    with tf.device(device_fn):
      # If ps_tasks is zero, the local device is used. When using multiple
      # (non-local) replicas, the ReplicaDeviceSetter distributes the variables
      # across the different devices.
      images, labels, _ = data_providers.make_batches(
          dataset.get_slim_dataset(), model, FLAGS.batch_size, mode='TRAIN')
      endpoints = model.create(images, dataset.num_classes, is_training=True)
      labels = slim.one_hot_encoding(labels, dataset.num_classes)
      total_loss = loss(
          endpoints['Logits'], labels, label_smoothing=FLAGS.label_smoothing)

      # Setup the moving averages:
      moving_average_variables = slim.get_model_variables()
      moving_average_variables.extend(slim.losses.get_losses())
      moving_average_variables.append(total_loss)

      variable_averages = tf.train.ExponentialMovingAverage(
          FLAGS.moving_average_decay, slim.get_or_create_global_step())

      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS,
                           variable_averages.apply(moving_average_variables))

      # Configure the learning rate using an exponential decay.
      decay_steps = int(((1.0 * dataset.num_examples) / FLAGS.batch_size) *
                        FLAGS.num_epochs_per_decay)

      learning_rate = tf.train.exponential_decay(
          FLAGS.learning_rate,
          slim.get_or_create_global_step(),
          decay_steps,
          FLAGS.learning_rate_decay_factor,
          staircase=True)

      opt = tf.train.RMSPropOptimizer(learning_rate, FLAGS.rmsprop_decay,
                                      FLAGS.rmsprop_momentum,
                                      FLAGS.rmsprop_epsilon)

      # Create training op
      train_tensor = slim.learning.create_train_op(
          total_loss,
          optimizer=opt,
          update_ops=tf.get_collection(tf.GraphKeys.UPDATE_OPS))

      # Summaries:
      slim.summaries.add_histogram_summaries(slim.get_model_variables())
      slim.summaries.add_scalar_summaries(slim.losses.get_losses(), 'losses')
      slim.summaries.add_scalar_summary(total_loss, 'Total_Loss', 'losses')
      slim.summaries.add_scalar_summary(learning_rate, 'Learning_Rate',
                                        'training')
      slim.summaries.add_histogram_summaries(endpoints.values())
      slim.summaries.add_zero_fraction_summaries(endpoints.values())
      # redacted

      # Set start-up delay
      startup_delay_steps = FLAGS.task * FLAGS.startup_delay_steps

      init_fn = model_init_function(model, dataset.num_classes,
                                    FLAGS.start_from_checkpoint)

      saver = tf.train.Saver(
          max_to_keep=FLAGS.max_checkpoints_to_keep,
          keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours)

      # Train model
      slim.learning.train(
          train_tensor,
          number_of_steps=FLAGS.number_of_steps,
          logdir=FLAGS.train_dir,
          master=target,
          init_fn=init_fn,
          is_chief=is_chief,
          saver=saver,
          startup_delay_steps=startup_delay_steps,
          save_summaries_secs=FLAGS.save_summaries_secs,
          save_interval_secs=FLAGS.save_interval_secs)
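
For intuition on the decay schedule above: with a hypothetical dataset of
1,000,000 examples, batch_size=64 and num_epochs_per_decay=2.0, decay_steps =
int((1000000 / 64) * 2.0) = 31250, so the learning rate is multiplied by
learning_rate_decay_factor once every 31,250 global steps (in discrete jumps,
since staircase=True).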
Example #34
def ddpg_graph(a_func,
               q_func,
               transition,
               target_network_type=DQNTarget.normal,
               gamma=1.0,
               dqda_clipping=0.0,
               loss_fn=tf.losses.huber_loss,
               extra_callback=None):
    """DDPG. https://arxiv.org/abs/1509.02971.

  Args:
    a_func: Python function that takes in state, scope as input
      and returns action and intermediate endpoints dictionary.
    q_func: Python function that takes in state, action, scope as input
      and returns Q(state, action) and intermediate endpoints dictionary.
    transition: SARSTransition namedtuple.
    target_network_type: Option to use Q Learning without target network, Q
      Learning with a target network (default), or Double-Q Learning with a
      target network.
    gamma: Discount factor.
    dqda_clipping: (float) clips the gradient dqda element-wise between
        [-dqda_clipping, dqda_clipping]. Does not perform clipping if
        dqda_clipping == 0.
    loss_fn: Function that computes the td_loss tensor. Takes as arguments
      (target value tensor, predicted value tensor).
    extra_callback: Optional function that takes in (transition, end_points_t,
      end_points_tp1) and adds additional TF graph elements.

  Returns:
    A tuple (loss, summaries) where loss is a scalar loss tensor to minimize,
    summaries are TensorFlow summaries.
  """
    state = transition.state
    action = transition.action
    state_p1 = transition.state_p1
    reward = transition.reward
    done = transition.done

    q_t_selected, end_points_t = q_func(state, action, scope='q_func')

    if gamma != 0:
        action_p1, _ = a_func(state_p1, scope='a_func')
        if target_network_type == DQNTarget.notarget:
            # Evaluate target values using the current net only.
            q_tp1_best, end_points_tp1 = q_func(state_p1,
                                                action_p1,
                                                scope='q_func',
                                                reuse=True)
        elif target_network_type == DQNTarget.normal:
            # Target network Q values at t+1.
            q_tp1_best, end_points_tp1 = q_func(state_p1,
                                                action_p1,
                                                scope='target_q_func')
        else:
            # Fail fast: falling through here would leave q_tp1_best undefined.
            logging.error('Invalid target_network_type %s',
                          target_network_type)
            raise ValueError(
                'Invalid target_network_type %s' % target_network_type)
        q_tp1_best_masked = (1.0 - done) * q_tp1_best
        q_t_selected_target = tf.stop_gradient(reward +
                                               gamma * q_tp1_best_masked)
    else:
        # Supervised Target.
        q_t_selected_target = tf.stop_gradient(reward)

    # Critic Loss
    td_error = q_t_selected - q_t_selected_target
    critic_loss = loss_fn(q_t_selected_target, q_t_selected)

    # Actor Loss (maximize E[Q(a_t|s_t)] via policy gradient)
    policy_action, _ = a_func(state, scope='a_func', reuse=True)
    q_t, _ = q_func(state, policy_action, scope='q_func', reuse=True)

    dqda = tf.gradients(q_t, policy_action)[0]
    if dqda_clipping > 0:
        dqda = tf.clip_by_value(dqda, -dqda_clipping, dqda_clipping)
    actor_loss = tf.losses.mean_squared_error(
        tf.stop_gradient(dqda + policy_action), policy_action)
    loss = tf.losses.get_total_loss()

    if extra_callback is not None:
        extra_callback(transition, end_points_t, end_points_tp1)

    tf.summary.histogram('td_error', td_error)
    tf.summary.histogram('q_t_selected', q_t_selected)
    tf.summary.histogram('q_t_selected_target', q_t_selected_target)
    tf.summary.scalar('mean_q_t_selected', tf.reduce_mean(q_t_selected))
    tf.summary.scalar('critic_loss', critic_loss)
    tf.summary.scalar('actor_loss', actor_loss)
    tf.summary.scalar('actor_mean_q', tf.reduce_mean(q_t, 0))
    tf.summary.scalar('total_loss', loss)

    all_summaries = tf.summary.merge_all()

    # Make this a named tuple.
    return actor_loss, critic_loss, all_summaries
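
The trailing comment above asks for a named tuple; a hedged sketch of what that
return type could look like (the type and field names are assumptions, not part
of the original API):

import collections

# Hypothetical container for the three values currently returned as a tuple.
DDPGGraphOutputs = collections.namedtuple(
    'DDPGGraphOutputs', ['actor_loss', 'critic_loss', 'summaries'])

# The final line of ddpg_graph would then become:
#     return DDPGGraphOutputs(actor_loss, critic_loss, all_summaries)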
Example #35
    def join(self, timeout=_DEFAULT_TIMEOUT_SEC):
        """Joins all the processes with timeout.

    If any of the subprocesses has not exited within approximately `timeout`
    seconds of the `join` call, this raises a `SubprocessTimeoutError`.

    Note: At timeout, it uses SIGTERM to terminate the subprocesses, in order to
    log the stack traces of the subprocesses when they exit. However, this
    results in timeout when the test runs with tsan (thread sanitizer); if tsan
    is being run on the test targets that rely on timeout to assert information,
    `MultiProcessRunner.terminate_all()` must be called after `join()`, before
    the test exits, so the subprocesses are terminated with SIGKILL and the
    data race is avoided.

    Args:
      timeout: optional integer or `None`. If provided as an integer, and not
        all processes report status within roughly `timeout` seconds, a
        `SubprocessTimeoutError` exception will be raised. If `None`, `join`
        never times out.

    Returns:
      A MultiProcessRunnerResult object, which has two attributes,
      `return_value` and `stdout`. `return_value` always contains the return
      values from the subprocesses. If `return_output` argument is True at
      `__init__`, `stdout` is available that contains a list of all messages
      from subprocesses' stdout and stderr.

    Raises:
      SubprocessTimeoutError: if not all processes report status approximately
        within `timeout` seconds. When this is raised, a
        `MultiProcessRunnerResult` object can be retrieved by
        `SubprocessTimeoutError`'s mpr_result attribute, which has the same
        structure as above 'Returns' section describes.
      UnexpectedSubprocessExitError: If any of the subprocesses did not exit
        properly (for example, they exit on SIGTERM or SIGKILL signal). When
        this is raised, a `MultiProcessRunnerResult` object can be retrieved by
        `UnexpectedSubprocessExitError`'s mpr_result attribute, which has the
        same structure as above 'Returns' section describes. If `max_run_time`
        is not `None`, it is expected that some subprocesses may be
        force-killed when `max_run_time` is up, and this is raised in those
        cases.
      Exception: if there is an Exception propagated from any subprocess. When
        this is raised, a `MultiProcessRunnerResult` object can be retrieved by
        `UnexpectedSubprocessExitError`'s mpr_result attribute, which has the
        same structure as above 'Returns' section describes.
    """
        if timeout and not isinstance(timeout, int):
            raise ValueError('`timeout` must be an integer or `None`.')
        with self._process_lock:
            if self._joined:
                raise ValueError("MultiProcessRunner can't be joined twice.")
            self._joined = True

        self._watchdog_thread.join(timeout)
        if self._watchdog_thread.is_alive():
            # Timeout. Force termination to dump worker processes stack trace.
            with self._process_lock:
                self._auto_restart = False
            logging.error(
                'Timeout when joining for child processes. Terminating...')
            self.terminate_all(sig=signal.SIGTERM)
            # Wait for the processes to terminate by themselves first, so they have a
            # chance to dump stacktraces. After _FORCE_KILL_WAIT_SEC, we SIGKILL them.
            self._watchdog_thread.join(_FORCE_KILL_WAIT_SEC)
            if self._watchdog_thread.is_alive():
                logging.error('Timeout when waiting for child processes to '
                              'print stacktrace. Sending SIGKILL...')
                self.terminate_all()
                self._watchdog_thread.join()
            process_statuses = self._get_process_statuses()
            self._reraise_if_subprocess_error(process_statuses)
            raise SubprocessTimeoutError(
                'One or more subprocesses timed out, where timeout was set to {}s. '
                'Please change the `timeout` argument for '
                '`MultiProcessRunner.join()` or `multi_process_runner.run()` '
                'if it should be adjusted.'.format(timeout),
                self._get_mpr_result(process_statuses))

        for (task_type, task_id), p in self._processes.items():
            logging.info('%s-%d exit code: %s', task_type, task_id, p.exitcode)

        process_statuses = self._get_process_statuses()
        self._reraise_if_subprocess_error(process_statuses)

        # Checking all the processes that are expected to exit properly.
        for (task_type, task_id), p in self._processes.items():
            # A successfully exiting process has exit code 0; we ignore
            # processes that were terminated.
            assert p.exitcode is not None
            if (p.exitcode > 0
                    and (task_type, task_id) not in self._terminated):
                raise UnexpectedSubprocessExitError(
                    'Subprocess %s-%d exited with exit code %s. See logs for details.'
                    % (task_type, task_id, p.exitcode),
                    self._get_mpr_result(process_statuses))

        logging.info('Joining log reading threads.')
        for thread in self._reading_threads:
            thread.join()
        logging.info('Joined log reading threads.')

        # Clear the alarm.
        signal.alarm(0)

        return self._get_mpr_result(process_statuses)
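
A hedged usage sketch for `join` (the construction of `runner` is elided, and
`SubprocessTimeoutError` is assumed importable from the same module; per the
docstring above, a partial result is recoverable from the exception):

try:
    result = runner.join(timeout=30)
except SubprocessTimeoutError as e:
    # Partial results are still retrievable from the exception.
    result = e.mpr_result
finally:
    # Per the tsan note in the docstring, force-kill any stragglers.
    runner.terminate_all()
print(result.return_value)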
Example #36
def get_gin_bindings(exp, agent_name, initial_seed, value, test):
    """Builds the gin bindings for one experiment/value combination."""
    gin_bindings = [f"{agent_name}.seed={initial_seed}"]
    if exp == "epsilon":
        gin_bindings += [f"create_opt.eps = {value}"]

    elif exp == "learning_rate":
        gin_bindings += [f"create_opt.learning_rate = {value}"]

    elif exp == "weight_decay":
        gin_bindings += [f"create_opt.weight_decay = {value}"]

    elif exp == "width":
        gin_bindings += [f"{agent_name}.neurons = {value}"]

    elif exp == "depth":
        gin_bindings += [f"{agent_name}.hidden_layer = {value}"]

    elif exp == "conv":
        gin_bindings += [f"{agent_name}.hidden_conv = {value}"]

    elif exp == "normalization":
        gin_bindings += [f"{agent_name}.normalization = '{value}'"]

    elif "init" in exp:
        gin_bindings = get_init_bidings(agent_name, value, initial_seed)

    elif exp == "activation":
        gin_bindings += [f"{agent_name}.layer_funct = '{value}'"]

    elif exp == "update_period":
        gin_bindings += [f"{agent_name}.update_period = {value}"]

    elif exp == "target_update_period":
        gin_bindings += [f"{agent_name}.target_update_period = {value}"]

    elif exp == "gamma":
        gin_bindings += [f"{agent_name}.gamma = {value}"]

    elif exp == "min_replay_history":
        gin_bindings += [f"{agent_name}.min_replay_history = {value}"]

    elif exp == "num_atoms":
        gin_bindings += [f"{agent_name}.num_atoms = {value}"]

    elif exp == "update_horizon":
        gin_bindings += [f"{agent_name}.update_horizon = {value}"]

    elif exp == "clip_rewards":
        gin_bindings += [f"Runner.clip_rewards = {value}"]

    elif exp == "batch_size":
        gin_bindings += [
            f"OutOfGraphPrioritizedReplayBuffer.batch_size = {value}"
        ]

    elif exp == "noisy_net":
        gin_bindings += [f"{agent_name}.noisy = {value}"]

    else:
        logging.error("Unrecognized experiment: %s", exp)
        raise ValueError(f"Experiment not recognized: {exp}")

    if test:
        gin_bindings.extend(
            ["Runner.num_iterations=4", "Runner.training_steps=200"])

    return gin_bindings
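
A hedged usage example for the helper above (the agent name and value are
illustrative):

bindings = get_gin_bindings(
    "learning_rate", "DQNAgent", initial_seed=0, value=1e-4, test=True)
# -> ['DQNAgent.seed=0',
#     'create_opt.learning_rate = 0.0001',
#     'Runner.num_iterations=4',
#     'Runner.training_steps=200']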