  def setUp(self):
    self.image_data = test_utils.get_test_df()
    self.split_key = schema.get_key(schema.SplitKeyType,
                                    schema.image_csv_schema)
    self.label_key = schema.get_key(schema.StringLabelType,
                                    schema.image_csv_schema)
    self.image_uri_key = schema.get_key(schema.ImageUriType,
                                        schema.image_csv_schema)
    self.tempfiles = []
    self.tempdir = None
    def setUp(self):
        """Test setup."""

        image_height = 40
        image_width = 30
        image_channels = 3
        image_fn = functools.partial(test_utils.make_random_image,
                                     image_height, image_width, image_channels)

        data = test_utils.get_test_data()
        image_uri_key = schema.get_key(schema.ImageUriType,
                                       schema.image_csv_schema)
        num_records = len(data[image_uri_key])
        image_uris = data.pop(image_uri_key)
        data['image_name'] = [os.path.split(uri)[-1] for uri in image_uris]
        data.update({
            'image':
            [beam_image.encode(image_fn()) for _ in range(num_records)],
            'image_height': [image_height] * num_records,
            'image_width': [image_width] * num_records,
            'image_channels': [image_channels] * num_records,
        })
        self.num_records = num_records
        self.data = data
        self.dataset = tf.data.Dataset.from_tensor_slices(self.data)
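
    # Hypothetical follow-on test (illustrative only, not from the TFRecorder
    # source): it assumes the setUp above has run and checks that the
    # tf.data.Dataset yields one dict per record with the expected keys.
    def test_dataset_yields_one_dict_per_record(self):
        rows = list(self.dataset.as_numpy_iterator())
        self.assertEqual(len(rows), self.num_records)
        self.assertIn('image_name', rows[0])
        self.assertIn('image', rows[0])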
  def test_no_get_split_key(self):
    """Tests no split key present."""
    test_schema = dict()
    for k, v in schema.image_csv_schema.items():
      # Brute-force copy because the original is a FrozenOrderedDict.
      if k != 'split':
        test_schema[k] = v

    key = schema.get_key(schema.SplitKeyType, test_schema)
    self.assertIsNone(key)
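
# A minimal sketch of the lookup schema.get_key appears to perform
# (hypothetical helper, not the actual implementation): scan the schema map
# and return the first column name whose declared type matches, else None.
def _find_key_sketch(wanted_type, schema_map):
  for column_name, column_type in schema_map.items():
    if column_type == wanted_type:
      return column_name
  return None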
def get_raw_feature_df() -> pd.DataFrame:
  """Returns test dataframe having raw feature spec schema."""

  df = get_test_df()
  my_raw_schema = schema.get_raw_schema_map(schema.image_csv_schema)
  image_key = schema.get_key(schema.ImageUriType, schema.image_csv_schema)
  df.drop([image_key], axis=1, inplace=True)
  df['image_name'] = 'image_name'
  df['image'] = 'image'
  # Note: The TF Transform parser expects string values in the input. They
  # will be parsed based on the raw feature spec that is passed together
  # with the data.
  df['image_height'] = '48'
  df['image_width'] = '48'
  df['image_channels'] = '3'
  df = df[my_raw_schema.keys()]

  return df
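
# Usage sketch (hypothetical check, not from the source): the frame returned
# above should contain exactly the raw-schema columns, with the image size
# columns held as strings for the TF Transform parser.
def _check_raw_feature_df_sketch():
  df = get_raw_feature_df()
  raw_schema = schema.get_raw_schema_map(schema.image_csv_schema)
  assert list(df.columns) == list(raw_schema.keys())
  assert df['image_height'].iloc[0] == '48'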
  def test_valid_get_key(self):
    """Tests a valid split key."""
    key = schema.get_key(schema.SplitKeyType, schema.image_csv_schema)
    self.assertEqual(key, 'split')
  def setUp(self):
    self.df = test_utils.get_test_df()
    self.schema_map = schema.image_csv_schema
    self.split_key = schema.get_key(schema.SplitKeyType, self.schema_map)
def build_pipeline(
    df: pd.DataFrame,
    job_label: str,
    runner: str,
    project: str,
    region: str,
    output_dir: str,
    compression: str,
    num_shards: int,
    schema_map: Dict[str, collections.namedtuple],
    tfrecorder_wheel: str,
    dataflow_options: Dict[str, Any]) -> beam.Pipeline:
  """Runs TFRecorder Beam Pipeline.

  Args:
    df: Pandas DataFrame.
    job_label: User description for the Beam job.
    runner: Beam runner (e.g. DataflowRunner, DirectRunner).
    project: GCP project ID (if DataflowRunner).
    region: GCP compute region (if DataflowRunner).
    output_dir: GCS or local path for output.
    compression: gzip or None.
    num_shards: Number of shards.
    schema_map: A schema map (dictionary mapping DataFrame columns to types)
      used to derive the input and target schema.
    tfrecorder_wheel: Path to the TFRecorder wheel for Dataflow.
    dataflow_options: Dataflow runner options (optional).

  Returns:
    beam.Pipeline

  Note: These inputs must be validated upstream (by client.create_tfrecord()).
  See the usage sketch after this function.
  """

  job_name = _get_job_name(job_label)
  job_dir = _get_job_dir(output_dir, job_name)
  options = _get_pipeline_options(
      runner,
      job_name,
      job_dir,
      project,
      region,
      tfrecorder_wheel,
      dataflow_options)

  p = beam.Pipeline(options=options)
  with tft_beam.Context(temp_dir=os.path.join(job_dir, 'tft_tmp')):

    converter = schema.get_tft_coder(df.columns, schema_map)
    flatten_rows = ToCSVRows()

    # Each element in the data PCollection will be a dict
    # including the image_csv_columns and the image features created from
    # extract_images_fn.
    data = (
        p
        | 'ReadFromDataFrame' >> beam.Create(df.values.tolist())
        | 'ToCSVRows' >> beam.ParDo(flatten_rows)
        | 'DecodeCSV' >> beam.Map(converter.decode)
    )

    # Extract images if an image_uri key exists.
    image_uri_key = schema.get_key(schema.ImageUriType, schema_map)
    if image_uri_key:
      extract_images_fn = beam_image.ExtractImagesDoFn(image_uri_key)

      data = (
          data
          | 'ReadImage' >> beam.ParDo(extract_images_fn)
      )

    # If the schema contains a valid split key, partition the dataset.
    split_key = schema.get_key(schema.SplitKeyType, schema_map)

    # Note: This will not always reflect the actual number of samples per
    # dataset written as TFRecords. The succeeding `Partition` operation may
    # mark additional samples from other splits as discarded. If a split has
    # all its samples discarded, the pipeline will still generate a TFRecord
    # file for that split, albeit empty.
    split_counts = get_split_counts(df, split_key)

    # Raw metadata is the TFT metadata after image insertion but before the
    # TFT transform, e.g. image columns have been added if necessary.
    raw_metadata = schema.get_raw_metadata(df.columns, schema_map)

    # Require the training set to be available in the input data. The
    # transform_fn and transformed_metadata will be generated from the
    # training set and applied to the other datasets, if any.
    assert 'TRAIN' in split_counts

    # Split dataset into train, validation, test sets.
    partition_fn = functools.partial(_partition_fn, split_key=split_key)
    train_data, val_data, test_data, discard_data = (
        data | 'SplitDataset' >> beam.Partition(
            partition_fn, len(schema.SplitKeyType.allowed_values)))

    raw_schema_map = schema.get_raw_schema_map(schema_map=schema_map)
    preprocessing_fn = functools.partial(
        _preprocessing_fn,
        schema_map=raw_schema_map)

    tfr_writer = functools.partial(
        _get_write_to_tfrecord, output_dir=job_dir, compress=compression,
        num_shards=num_shards)

    transform_fn = _transform_and_write_tfr(
        train_data, tfr_writer, preprocessing_fn=preprocessing_fn,
        raw_metadata=raw_metadata,
        label='Train')

    if 'VALIDATION' in split_counts:
      _transform_and_write_tfr(
          val_data, tfr_writer, transform_fn=transform_fn,
          raw_metadata=raw_metadata,
          label='Validation')

    if 'TEST' in split_counts:
      _transform_and_write_tfr(
          test_data, tfr_writer, transform_fn=transform_fn,
          raw_metadata=raw_metadata,
          label='Test')

    _ = (
        discard_data
        | 'WriteDiscardedData' >> beam.io.WriteToText(
            os.path.join(job_dir, 'discarded-data')))

    # Note: `transform_fn` already contains the transformed metadata
    _ = (transform_fn | 'WriteTransformFn' >> tft_beam.WriteTransformFn(
        job_dir))

  return p
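
# Hypothetical usage sketch (argument values are illustrative, not from the
# source; None is assumed acceptable for the Dataflow-only arguments when
# running locally): build the pipeline with DirectRunner and run it.
def _run_pipeline_sketch():
  df = test_utils.get_test_df()
  pipeline = build_pipeline(
      df=df,
      job_label='tfrecorder-sketch',
      runner='DirectRunner',
      project='my-project',      # placeholder; used only by DataflowRunner
      region='us-central1',      # placeholder; used only by DataflowRunner
      output_dir='/tmp/tfrecords',
      compression='gzip',
      num_shards=1,
      schema_map=schema.image_csv_schema,
      tfrecorder_wheel=None,     # only needed for DataflowRunner
      dataflow_options=None)
  result = pipeline.run()
  result.wait_until_finish()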