Example 1
def _setup_outputs(root_output_dir, experiment_name, hparam_dict):
    """Set up directories for experiment loops, write hyperparameters to disk."""

    if not experiment_name:
        raise ValueError('experiment_name must be specified.')

    create_if_not_exists(root_output_dir)

    checkpoint_dir = os.path.join(root_output_dir, 'checkpoints',
                                  experiment_name)
    create_if_not_exists(checkpoint_dir)
    checkpoint_mngr = tff.simulation.FileCheckpointManager(checkpoint_dir)

    results_dir = os.path.join(root_output_dir, 'results', experiment_name)
    create_if_not_exists(results_dir)
    csv_file = os.path.join(results_dir, 'experiment.metrics.csv')
    metrics_mngr = tff.simulation.CSVMetricsManager(csv_file)

    summary_logdir = os.path.join(root_output_dir, 'logdir', experiment_name)
    create_if_not_exists(summary_logdir)
    summary_writer = tf.summary.create_file_writer(summary_logdir)

    if hparam_dict:
        hparam_dict['metrics_file'] = metrics_mngr.metrics_filename
        hparams_file = os.path.join(results_dir, 'hparams.csv')
        utils_impl.atomic_write_to_csv(pd.Series(hparam_dict), hparams_file)
        with summary_writer.as_default():
            hp.hparams({k: v for k, v in hparam_dict.items() if v is not None})

    logging.info('Writing...')
    logging.info('    checkpoints to: %s', checkpoint_dir)
    logging.info('    metrics csv to: %s', metrics_mngr.metrics_filename)
    logging.info('    summaries to: %s', summary_logdir)

    return checkpoint_mngr, metrics_mngr, summary_writer
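
A minimal usage sketch for the helper above (paths and metric values are illustrative; only the summary writer is exercised here, since its tf.summary API is standard):

checkpoint_mngr, metrics_mngr, summary_writer = _setup_outputs(
    root_output_dir='/tmp/exp', experiment_name='run_1',
    hparam_dict={'learning_rate': 0.1, 'clients_per_round': 10})

for round_num in range(5):
    loss = 1.0 / (round_num + 1)  # Placeholder metric value.
    with summary_writer.as_default():
        tf.summary.scalar('train/loss', loss, step=round_num)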
Example 2
  def __init__(self, experiment_name, output_dir, hparam_dict):
    """Returns an initalized `MetricsHook`.

    Args:
      experiment_name: A unique filesystem-friendly name for the experiment.
      output_dir: A root output directory used for all experiment runs in a
        grid. The `MetricsHook` will combine this with `experiment_name` to form
        suitable output directories for this run.
      hparam_dict: A dictionary of hyperparameters to be recorded to .csv and
        exported to TensorBoard.
    """

    summary_logdir = os.path.join(output_dir,
                                  'logdir/{}'.format(experiment_name))
    _check_not_exists(summary_logdir, FLAGS.disable_check_exists)
    tf.io.gfile.makedirs(summary_logdir)

    self._summary_writer = tf.summary.create_file_writer(
        summary_logdir, name=experiment_name)
    with self._summary_writer.as_default():
      hp.hparams(hparam_dict)

    self._results_file = os.path.join(output_dir, experiment_name,
                                      'results.csv.bz2')

    # Also write the hparam_dict to a CSV:
    hparam_dict['results_file'] = self._results_file
    hparams_file = os.path.join(output_dir, experiment_name, 'hparams.csv')
    utils_impl.atomic_write_to_csv(pd.Series(hparam_dict), hparams_file)

    logging.info('Writing ...')
    logging.info('   result csv to: %s', self._results_file)
    logging.info('    summaries to: %s', summary_logdir)

    _check_not_exists(self._results_file, FLAGS.disable_check_exists)
Example 3
    def clear_rounds_after(self, last_valid_round_num: int) -> None:
        """Metrics for rounds greater than `last_valid_round_num` are cleared out.

    By using this method, this class can be used upon restart of an experiment
    at `last_valid_round_num` to ensure that no duplicate rows of data exist in
    the CSV file. This method will atomically update the stored CSV file.

    Args:
      last_valid_round_num: All metrics for rounds later than this are expunged.

    Raises:
      RuntimeError: If metrics do not exist (none were loaded during
        construction nor recorded via `update_metrics()`) and
        `last_valid_round_num` is not zero.
      ValueError: If `last_valid_round_num` is negative.
    """
        if last_valid_round_num < 0:
            raise ValueError('Attempting to clear metrics after round '
                             f'{last_valid_round_num}, which is negative.')
        if self._latest_round_num is None:
            if last_valid_round_num == 0:
                return
            raise RuntimeError('Metrics do not exist yet.')
        self._metrics = self._metrics.drop(self._metrics[
            self._metrics.round_num > last_valid_round_num].index)
        utils_impl.atomic_write_to_csv(self._metrics, self._metrics_filename)
        self._latest_round_num = last_valid_round_num
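
A sketch of the restart flow this method supports, using the ScalarMetricsManager constructor and module name that appear later in this listing (round numbers are illustrative):

# Hypothetical restart at round 50: re-open the same metrics CSV and drop any
# rows the previous (interrupted) run wrote for rounds greater than 50.
metrics_mngr = metrics_manager.ScalarMetricsManager(
    root_metrics_dir='/tmp', prefix='experiment')
metrics_mngr.clear_rounds_after(last_valid_round_num=50)
# Training can now resume at round 51 without duplicate rows in the CSV.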
    def test_atomic_read(self, name):
        dataframe = pd.DataFrame(dict(a=[1, 2], b=[4.0, 5.0]))
        csv_file = os.path.join(absltest.get_default_test_tmpdir(), name)
        utils_impl.atomic_write_to_csv(dataframe, csv_file)

        dataframe2 = utils_impl.atomic_read_from_csv(csv_file)
        pd.testing.assert_frame_equal(dataframe, dataframe2)
    def test_atomic_write_raises_on_dict_input(self):
        output_file = os.path.join(absltest.get_default_test_tmpdir(),
                                   'foo.csv')
        with self.assertRaisesRegex(
                ValueError,
                'dataframe must be an instance of `pandas.DataFrame`'):
            utils_impl.atomic_write_to_csv(dict(a=1), output_file)
Example 6
    def clear_all_rounds(self) -> None:
        """Existing metrics for all rounds are cleared out.

    This method will atomically update the stored CSV file.
    """
        self._metrics = pd.DataFrame()
        utils_impl.atomic_write_to_csv(self._metrics, self._metrics_filename)
        self._latest_round_num = None
    def __init__(self,
                 root_metrics_dir: str = '/tmp',
                 prefix: str = 'experiment',
                 use_bz2: bool = True):
        """Returns an initialized `ScalarMetricsManager`.

    This class will maintain metrics in a CSV file in the filesystem. The path
    of the file is {`root_metrics_dir`}/{`prefix`}.metrics.csv (if use_bz2 is
    set to False) or {`root_metrics_dir`}/{`prefix`}.metrics.csv.bz2 (if
    use_bz2 is set to True). To use this class upon restart of an experiment at
    an earlier round number, you can initialize and then call the
    clear_rounds_after() method to remove all rows for round numbers later than
    the restart round number. This ensures that no duplicate rows of data exist
    in the CSV.

    Args:
      root_metrics_dir: A path on the filesystem to store CSVs.
      prefix: A string to use as the prefix of filename. Usually the name of a
        specific run in a larger grid of experiments sharing a common
        `root_metrics_dir`.
      use_bz2: A boolean indicating whether to compress the result metrics CSV
        using bz2.

    Raises:
      ValueError: If `root_metrics_dir` is an empty string.
      ValueError: If `prefix` is an empty string.
      ValueError: If the specified metrics csv file already exists but does not
        contain a `round_num` column.
    """
        super().__init__()
        if not root_metrics_dir:
            raise ValueError(
                'Empty string passed for root_metrics_dir argument.')
        if not prefix:
            raise ValueError('Empty string passed for prefix argument.')

        if use_bz2:
            # Using .bz2 rather than .zip due to
            # https://github.com/pandas-dev/pandas/issues/26023
            self._metrics_filename = os.path.join(root_metrics_dir,
                                                  f'{prefix}.metrics.csv.bz2')
        else:
            self._metrics_filename = os.path.join(root_metrics_dir,
                                                  f'{prefix}.metrics.csv')
        if not tf.io.gfile.exists(self._metrics_filename):
            utils_impl.atomic_write_to_csv(pd.DataFrame(),
                                           self._metrics_filename)

        self._metrics = utils_impl.atomic_read_from_csv(self._metrics_filename)
        if not self._metrics.empty and 'round_num' not in self._metrics.columns:
            raise ValueError(
                f'The specified csv file ({self._metrics_filename}) already exists '
                'but was not created by ScalarMetricsManager (it does not contain a '
                '`round_num` column).')

        self._latest_round_num = (None if self._metrics.empty else
                                  self._metrics.round_num.max(axis=0))
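
For reference, a sketch of the file locations implied by the constructor above (the directory and prefix are illustrative):

# use_bz2=True  -> /tmp/experiment.metrics.csv.bz2
# use_bz2=False -> /tmp/experiment.metrics.csv
mngr = metrics_manager.ScalarMetricsManager(
    root_metrics_dir='/tmp', prefix='experiment', use_bz2=False)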
Example 8
    def update_metrics(self, round_num,
                       metrics_to_append: Dict[str, Any]) -> Dict[str, float]:
        """Updates the stored metrics data with metrics for a specific round.

    The specified `round_num` must be later than the latest round number for
    which metrics exist in the stored metrics data. This method will atomically
    update the stored CSV file. Also, if stored metrics already exist and
    `metrics_to_append` contains a new, previously unseen metric name, a new
    column in the dataframe will be added for that metric, and all previous rows
    will be filled with NaN values for that metric.

    Args:
      round_num: Communication round at which `metrics_to_append` was collected.
      metrics_to_append: A dictionary of metrics collected during `round_num`.
        These metrics can be in a nested structure, but the nesting will be
        flattened for storage in the CSV (with the new keys equal to the paths
        in the nested structure).

    Returns:
      A `collections.OrderedDict` of the data just added in a new row to the
        pandas.DataFrame. Compared with the input `metrics_to_append`, this data
        is flattened, with the key names equal to the path in the nested
        structure. Also, `round_num` has been added as an additional key.

    Raises:
      ValueError: If the provided round number is negative.
      ValueError: If the provided round number is less than or equal to the
        latest round number in the stored metrics data.
    """
        if round_num < 0:
            raise ValueError(
                f'Attempting to append metrics for round {round_num}, '
                'which is negative.')
        if (self._latest_round_num is not None and
                round_num <= self._latest_round_num):
            raise ValueError(
                f'Attempting to append metrics for round {round_num}, '
                'but metrics already exist through round '
                f'{self._latest_round_num}.')

        # Add the round number to the metrics before storing to csv file. This will
        # be used if a restart occurs, to identify which metrics to trim in the
        # _clear_invalid_rounds() method.
        metrics_to_append['round_num'] = round_num

        flat_metrics = tree.flatten_with_path(metrics_to_append)
        flat_metrics = [('/'.join(map(str, path)), item)
                        for path, item in flat_metrics]
        flat_metrics = collections.OrderedDict(flat_metrics)
        self._metrics = self._metrics.append(flat_metrics, ignore_index=True)
        utils_impl.atomic_write_to_csv(self._metrics, self._metrics_filename)
        self._latest_round_num = round_num

        return flat_metrics
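
A sketch of the flattening behavior described above, using the manager defined in this listing (metric names and values are illustrative):

mngr = metrics_manager.ScalarMetricsManager(
    root_metrics_dir='/tmp', prefix='flatten_demo')
row = mngr.update_metrics(
    round_num=1,
    metrics_to_append={'train': {'loss': 0.25, 'accuracy': 0.9},
                       'eval': {'loss': 0.3}})
# `row` is an OrderedDict keyed by the '/'-joined paths, plus 'round_num':
# row['train/loss'] == 0.25, row['eval/loss'] == 0.3, row['round_num'] == 1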
    def test_atomic_write(self, name):
        dataframe = pd.DataFrame(dict(a=[1, 2], b=[4.0, 5.0]))
        output_file = os.path.join(absltest.get_default_test_tmpdir(), name)
        utils_impl.atomic_write_to_csv(dataframe, output_file)
        dataframe2 = pd.read_csv(output_file, index_col=0)
        pd.testing.assert_frame_equal(dataframe, dataframe2)

        # Overwriting
        dataframe3 = pd.DataFrame(dict(a=[1, 2, 3], b=[4.0, 5.0, 6.0]))
        utils_impl.atomic_write_to_csv(dataframe3, output_file)
        dataframe4 = pd.read_csv(output_file, index_col=0)
        pd.testing.assert_frame_equal(dataframe3, dataframe4)
    def on_epoch_end(self, epoch: int, logs: Dict[Any, Any] = None):
        results_path = os.path.join(self._path, 'metric_results.csv')
        if tf.io.gfile.exists(results_path):
            # Read the results written so far.
            results_df = utils_impl.atomic_read_from_csv(results_path)
            # Slice off any results at or after the current epoch; their
            # presence indicates the job restarted.
            results_df = results_df[:epoch]
            # Add the new epoch.
            results_df = results_df.append(logs, ignore_index=True)
        else:
            results_df = pd.DataFrame(logs, index=[epoch])
        utils_impl.atomic_write_to_csv(results_df, results_path)
Example 11
    def test_constructor_raises_value_error_if_csvfile_is_invalid(self):
        dataframe_missing_round_num = pd.DataFrame.from_dict(
            _create_dummy_metrics())

        temp_dir = self.get_temp_dir()
        # This csvfile is 'invalid' in that it was not originally created by an
        # instance of ScalarMetricsManager, and is missing a column for
        # round_num.
        invalid_csvfile = os.path.join(temp_dir, 'foo.metrics.csv.bz2')
        utils_impl.atomic_write_to_csv(dataframe_missing_round_num,
                                       invalid_csvfile)

        with self.assertRaises(ValueError):
            metrics_manager.ScalarMetricsManager(temp_dir, prefix='foo')
Example 12
def _setup_outputs(root_output_dir,
                   experiment_name,
                   hparam_dict,
                   rounds_per_profile=0):
    """Set up directories for experiment loops, write hyperparameters to disk."""

    if not experiment_name:
        raise ValueError('experiment_name must be specified.')

    create_if_not_exists(root_output_dir)

    checkpoint_dir = os.path.join(root_output_dir, 'checkpoints',
                                  experiment_name)
    create_if_not_exists(checkpoint_dir)
    checkpoint_mngr = tff.simulation.FileCheckpointManager(checkpoint_dir)

    results_dir = os.path.join(root_output_dir, 'results', experiment_name)
    create_if_not_exists(results_dir)
    csv_file = os.path.join(results_dir, 'experiment.metrics.csv')
    metrics_mngr = tff.simulation.CSVMetricsManager(csv_file)

    summary_logdir = os.path.join(root_output_dir, 'logdir', experiment_name)
    tb_mngr = tensorboard_manager.TensorBoardManager(
        summary_dir=summary_logdir)

    if hparam_dict:
        hparam_dict['metrics_file'] = metrics_mngr.metrics_filename
        hparams_file = os.path.join(results_dir, 'hparams.csv')
        utils_impl.atomic_write_to_csv(pd.Series(hparam_dict), hparams_file)
        tb_mngr.update_hparams(
            {k: v
             for k, v in hparam_dict.items() if v is not None})

    logging.info('Writing...')
    logging.info('    checkpoints to: %s', checkpoint_dir)
    logging.info('    metrics csv to: %s', metrics_mngr.metrics_filename)
    logging.info('    summaries to: %s', summary_logdir)

    @contextlib.contextmanager
    def profiler(round_num):
        if (rounds_per_profile > 0 and round_num % rounds_per_profile == 0):
            with tf.profiler.experimental.Profile(summary_logdir):
                yield
        else:
            yield

    return checkpoint_mngr, metrics_mngr, tb_mngr, profiler
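
A sketch of how the returned profiler context manager might drive a training loop (the per-round work is a placeholder):

checkpoint_mngr, metrics_mngr, tb_mngr, profiler = _setup_outputs(
    root_output_dir='/tmp/exp', experiment_name='run_1',
    hparam_dict={'learning_rate': 0.1}, rounds_per_profile=10)

for round_num in range(100):
    with profiler(round_num):  # Profiles every 10th round, a no-op otherwise.
        pass  # Placeholder: run one federated training round here.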
Example 13
def _setup_outputs(root_output_dir,
                   experiment_name,
                   hparam_dict,
                   write_metrics_with_bz2=True,
                   rounds_per_profile=0):
  """Set up directories for experiment loops, write hyperparameters to disk."""

  if not experiment_name:
    raise ValueError('experiment_name must be specified.')

  create_if_not_exists(root_output_dir)

  checkpoint_dir = os.path.join(root_output_dir, 'checkpoints', experiment_name)
  create_if_not_exists(checkpoint_dir)
  checkpoint_mngr = checkpoint_manager.FileCheckpointManager(checkpoint_dir)

  results_dir = os.path.join(root_output_dir, 'results', experiment_name)
  create_if_not_exists(results_dir)
  metrics_mngr = metrics_manager.ScalarMetricsManager(
      results_dir, use_bz2=write_metrics_with_bz2)

  summary_logdir = os.path.join(root_output_dir, 'logdir', experiment_name)
  create_if_not_exists(summary_logdir)
  summary_writer = tf.summary.create_file_writer(summary_logdir)

  if hparam_dict:
    hparam_dict['metrics_file'] = metrics_mngr.metrics_filename
    hparams_file = os.path.join(results_dir, 'hparams.csv')
    utils_impl.atomic_write_to_csv(pd.Series(hparam_dict), hparams_file)
    with summary_writer.as_default():
      hp.hparams({k: v for k, v in hparam_dict.items() if v is not None})

  logging.info('Writing...')
  logging.info('    checkpoints to: %s', checkpoint_dir)
  logging.info('    metrics csv to: %s', metrics_mngr.metrics_filename)
  logging.info('    summaries to: %s', summary_logdir)

  @contextlib.contextmanager
  def profiler(round_num):
    if (rounds_per_profile > 0 and round_num % rounds_per_profile == 0):
      with tf.profiler.experimental.Profile(summary_logdir):
        yield
    else:
      yield

  return checkpoint_mngr, metrics_mngr, summary_writer, profiler
Example 14
    def __call__(self, train_metrics, eval_metrics, round_num):
        """A function suitable for passing as an eval hook to the training_loop.

    Args:
      train_metrics: A `dict` of training metrics computed in TFF.
      eval_metrics: A `dict` of evaluation metrics computed in TFF.
      round_num: The current round number.
    """
        metrics = {
            'train': train_metrics,
            'eval': eval_metrics,
            'round': round_num,
        }
        flat_metrics = tree.flatten_with_path(metrics)
        flat_metrics = [('/'.join(map(str, path)), item)
                        for path, item in flat_metrics]
        flat_metrics = collections.OrderedDict(flat_metrics)

        logging.info('Evaluation at round {:d}:\n{!s}'.format(
            round_num, pprint.pformat(flat_metrics)))

        # Also write metrics to a tf.summary logdir
        with self._summary_writer.as_default():
            for name, value in flat_metrics.items():
                tf.summary.scalar(name, value, step=round_num)

        if tf.io.gfile.exists(self._results_file):
            metrics = pd.read_csv(self._results_file,
                                  header=0,
                                  index_col=0,
                                  engine='c')
            # Remove everything after `round_num`; if the experiment was restarted
            # from an earlier checkpoint, this avoids duplicate metrics.
            metrics = metrics[:round_num]
            metrics = metrics.append(flat_metrics, ignore_index=True)
        else:
            metrics = pd.DataFrame(flat_metrics, index=[0])
        utils_impl.atomic_write_to_csv(metrics, self._results_file)
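
Assuming this __call__ and the __init__ shown earlier belong to the same MetricsHook class (they share `_summary_writer` and `_results_file`), a hypothetical per-round invocation could look like this (flag setup and metric values are illustrative):

hook = MetricsHook(experiment_name='run_1', output_dir='/tmp/grid',
                   hparam_dict={'learning_rate': 0.1})
# Called once per round by the training loop:
hook(train_metrics={'loss': 0.4}, eval_metrics={'accuracy': 0.8}, round_num=3)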
Example 15
def run(
    keras_model: tf.keras.Model,
    train_dataset: tf.data.Dataset,
    experiment_name: str,
    root_output_dir: str,
    num_epochs: int,
    hparams_dict: Optional[Dict[str, Any]] = None,
    decay_epochs: Optional[int] = None,
    lr_decay: Optional[float] = None,
    validation_dataset: Optional[tf.data.Dataset] = None,
    test_dataset: Optional[tf.data.Dataset] = None
) -> tf.keras.callbacks.History:
    """Run centralized training for a given compiled `tf.keras.Model`.

  Args:
    keras_model: A compiled `tf.keras.Model`.
    train_dataset: The `tf.data.Dataset` to be used for training.
    experiment_name: Name of the experiment, used as part of the name of the
      output directory.
    root_output_dir: The top-level output directory. The directory
      `root_output_dir/experiment_name` will contain TensorBoard logs, metrics
      CSVs and other outputs.
    num_epochs: How many training epochs to perform.
    hparams_dict: An optional dict specifying hyperparameters. If provided, the
      hyperparameters will be written to CSV.
    decay_epochs: Number of training epochs before decaying the learning rate.
    lr_decay: How much to decay the learning rate by every `decay_epochs`.
    validation_dataset: An optional `tf.data.Dataset` used for validation during
      training.
    test_dataset: An optional `tf.data.Dataset` used for testing after all
      training has completed.

  Returns:
    A `tf.keras.callbacks.History` object.
  """
    tensorboard_dir = os.path.join(root_output_dir, 'logdir', experiment_name)
    results_dir = os.path.join(root_output_dir, 'results', experiment_name)

    for path in [root_output_dir, tensorboard_dir, results_dir]:
        tf.io.gfile.makedirs(path)

    if hparams_dict:
        hparams_file = os.path.join(results_dir, 'hparams.csv')
        logging.info('Saving hyperparameters to: [%s]', hparams_file)
        hparams_df = pd.DataFrame(hparams_dict, index=[0])
        utils_impl.atomic_write_to_csv(hparams_df, hparams_file)

    csv_logger_callback = keras_callbacks.AtomicCSVLogger(results_dir)
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=tensorboard_dir)
    training_callbacks = [tensorboard_callback, csv_logger_callback]

    if decay_epochs is not None and decay_epochs > 0:
        # Reduce the learning rate after a fixed number of epochs.
        def decay_lr(epoch, learning_rate):
            if (epoch + 1) % decay_epochs == 0:
                return learning_rate * lr_decay
            else:
                return learning_rate

        lr_callback = tf.keras.callbacks.LearningRateScheduler(decay_lr,
                                                               verbose=1)
        training_callbacks.append(lr_callback)

    logging.info('Training model:')
    keras_model.summary(print_fn=logging.info)

    history = keras_model.fit(train_dataset,
                              validation_data=validation_dataset,
                              epochs=num_epochs,
                              callbacks=training_callbacks)

    logging.info('Final training metrics:')
    for metric in keras_model.metrics:
        name = metric.name
        value = history.history[name][-1]
        logging.info('\t%s: %.4f', name, value)

    if validation_dataset:
        logging.info('Final validation metrics:')
        for metric in keras_model.metrics:
            name = metric.name
            value = history.history['val_{}'.format(name)][-1]
            logging.info('\t%s: %.4f', name, value)

    if test_dataset:
        test_metrics = keras_model.evaluate(test_dataset, return_dict=True)
        logging.info('Test metrics:')
        for metric in keras_model.metrics:
            name = metric.name
            value = test_metrics[name]
            logging.info('\t%s: %.4f', name, value)

    return history
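
A minimal, hypothetical invocation of run() with a tiny compiled Keras model and an in-memory dataset (all values are illustrative; the surrounding modules such as utils_impl and keras_callbacks are assumed to be importable):

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(3,))])
model.compile(optimizer='sgd', loss='mse', metrics=['mae'])

features = tf.random.normal((32, 3))
labels = tf.random.normal((32, 1))
train_ds = tf.data.Dataset.from_tensor_slices((features, labels)).batch(8)

history = run(
    keras_model=model,
    train_dataset=train_ds,
    experiment_name='centralized_demo',
    root_output_dir='/tmp/centralized',
    num_epochs=2,
    hparams_dict={'learning_rate': 0.01},
    decay_epochs=1,
    lr_decay=0.9)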