Example #1
    def test_invalid_copy(self):
        """Test invalid file copy."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            infile = os.path.join(tmpdirname, 'foo.txt')
            with self.assertRaises(FileNotFoundError):
                common.copy_logfile_to_gcs(infile, tmpdirname)


    # pylint: disable=protected-access
    def test_valid_copy(self):
        """Test valid file copy."""
        with tempfile.TemporaryDirectory() as tmpdirname:
            text = 'log test log test'
            infile = os.path.join(tmpdirname, 'foo.log')
            with open(infile, 'w') as f:
                f.write(text)
            common.copy_logfile_to_gcs(infile, tmpdirname)

            outfile = os.path.join(tmpdirname, constants.LOGFILE)
            with open(outfile, 'r') as f:
                data = f.read()
                self.assertEqual(text, data)
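# The tests above exercise common.copy_logfile_to_gcs, but the helper itself is
# not shown. Below is a minimal sketch of one way to satisfy that contract,
# assuming TensorFlow's tf.io.gfile for local/GCS file access; the LOGFILE name
# is a placeholder for constants.LOGFILE. Illustrative only, not the library's
# actual implementation.
import os

import tensorflow as tf

LOGFILE = 'logfile.txt'  # Placeholder; the real name comes from constants.LOGFILE.


def copy_logfile_to_gcs(logfile: str, output_dir: str) -> None:
  """Copies `logfile` into `output_dir` (local path or gs:// URI) as LOGFILE."""
  if not tf.io.gfile.exists(logfile):
    # Matches the behavior checked by test_invalid_copy above.
    raise FileNotFoundError(f'Logfile {logfile} does not exist.')
  outfile = os.path.join(output_dir, LOGFILE)
  # tf.io.gfile.copy works for both local filesystem paths and GCS URIs.
  tf.io.gfile.copy(logfile, outfile, overwrite=True)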
Example #3
def create_tfrecords(
    input_data: Union[str, pd.DataFrame],
    output_dir: str,
    header: Optional[Union[str, int, Sequence]] = 'infer',
    names: Optional[Sequence] = None,
    runner: str = 'DirectRunner',
    project: Optional[str] = None,
    region: Optional[str] = None,
    tfrecorder_wheel: Optional[str] = None,
    dataflow_options: Optional[Dict[str, Any]] = None,
    job_label: str = 'create-tfrecords',
    compression: Optional[str] = 'gzip',
    num_shards: int = 0) -> Dict[str, Any]:
  """Generates TFRecord files from given input data.

  TFRecorder provides an easy interface for creating image-based TFRecord
  files from a DataFrame containing the GCS locations of the images and their
  labels.

  Usage:
    import tfrecorder

    job_results = tfrecorder.client.create_tfrecords(
        train_df,
        output_dir='gs://foo/bar/train',
        runner='DirectRunner')

  Args:
    input_data: Pandas DataFrame, CSV file or image directory path.
    output_dir: Local directory or GCS Location to save TFRecords to.
    header: Row(s) to use as the header. Not used when `input_data` is a
      Pandas DataFrame. If 'infer' (the default), the header is taken from
      the first line of the CSV file.
    runner: Beam runner. Can be 'DirectRunner' or 'DataflowRunner'.
    project: GCP project name (required when using DataflowRunner).
    region: GCP region name (required when using DataflowRunner).
    tfrecorder_wheel: Path to the TFRecorder wheel file (required when using
      DataflowRunner).
    dataflow_options: Options dict passed to DataflowRunner.
    job_label: User-supplied description used in the Beam job name.
    compression: Can be 'gzip' or None for no compression.
    num_shards: Number of shards to divide the TFRecords into. Defaults to
      0 (no sharding).

  Returns:
    job_results: Dict
      job_id: Dataflow Job ID or 'DirectRunner'
      metrics: (optional) Beam metrics. Only used for DirectRunner
      dataflow_url: (optional) Job URL for DataflowRunner
  """

  # Normalize input (DataFrame, CSV file, or image directory) to a DataFrame.
  df = to_dataframe(input_data, header, names)

  _validate_data(df)
  _validate_runner(df, runner, project, region, tfrecorder_wheel)

  # Logging goes to a local file; it is copied to GCS for Dataflow runs below.
  logfile = os.path.join('/tmp', constants.LOGFILE)
  _configure_logging(logfile)


  # Whether the label column contains integers; forwarded to the Beam pipeline.
  integer_label = pd.api.types.is_integer_dtype(df[constants.LABEL_KEY])
  p = beam_pipeline.build_pipeline(
      df,
      job_label=job_label,
      runner=runner,
      project=project,
      region=region,
      output_dir=output_dir,
      compression=compression,
      num_shards=num_shards,
      tfrecorder_wheel=tfrecorder_wheel,
      dataflow_options=dataflow_options,
      integer_label=integer_label)

  result = p.run()

  if runner == 'DirectRunner':
    logging.info('Using DirectRunner - blocking until job completes.')
    result.wait_until_finish()

    row_count_filter = beam.metrics.MetricsFilter().with_name('row_count')
    good_image_filter = beam.metrics.MetricsFilter().with_name('image_good')
    bad_image_filter = beam.metrics.MetricsFilter().with_name('image_bad')

    row_count = _get_beam_metric(row_count_filter, result)
    good_image_count = _get_beam_metric(good_image_filter, result)
    bad_image_count = _get_beam_metric(bad_image_filter, result)

    # TODO(mikebernico): Profile metric impact with larger dataset.
    metrics = {
        'rows': row_count,
        'good_images': good_image_count,
        'bad_images': bad_image_count,
    }

    job_result = {
        'job_id': 'DirectRunner',
        'metrics': metrics
    }
    logging.info("Job Complete.")

  else:
    logging.info('Using DataflowRunner.')

    # Construct the Dataflow console URL for this job.
    job_id = result.job_id()
    url = (f'{constants.CONSOLE_DATAFLOW_URI}{region}/{job_id}'
           f'?project={project}')
    job_result = {
        'job_id': job_id,
        'dataflow_url': url
    }

  logging.shutdown()

  if runner == 'DataflowRunner':
    # if this is a Dataflow job, copy the logfile to GCS
    common.copy_logfile_to_gcs(logfile, output_dir)

  return job_result
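# _get_beam_metric is referenced above but not shown; a plausible sketch,
# assuming it returns the committed value of the first Beam counter matching
# the filter, or None when the pipeline reported no such counter.
def _get_beam_metric(metrics_filter, pipeline_result):
  """Returns the committed value of the first counter matching the filter."""
  counters = pipeline_result.metrics().query(metrics_filter)['counters']
  return counters[0].committed if counters else None


# A minimal usage sketch for create_tfrecords. The CSV path, bucket names,
# project, and region below are illustrative assumptions, not values from the
# library; the input data must already contain the columns the validator
# expects.
import tfrecorder

# Local run with DirectRunner: blocks until the Beam job finishes and returns
# row/image metrics.
direct_result = tfrecorder.client.create_tfrecords(
    'gs://my-bucket/data/train.csv',
    output_dir='/tmp/tfrecords',
    runner='DirectRunner')
print(direct_result['job_id'], direct_result['metrics'])

# Dataflow run: returns immediately with the job ID and a console URL; the
# local logfile is copied to output_dir after the job is submitted.
dataflow_result = tfrecorder.client.create_tfrecords(
    'gs://my-bucket/data/train.csv',
    output_dir='gs://my-bucket/tfrecords',
    runner='DataflowRunner',
    project='my-gcp-project',
    region='us-central1',
    tfrecorder_wheel='gs://my-bucket/wheels/tfrecorder.whl')
print(dataflow_result['dataflow_url'])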