def test_invalid_sample_rate_negative(self):
    """A negative sample_rate must make the pipeline raise ValueError."""
    with self.assertRaises(ValueError):
        with beam.Pipeline() as p:
            bad_options = stats_options.StatsOptions(sample_rate=-1)
            _ = (
                p
                | beam.Create([{}])
                | stats_api.GenerateStatistics(bad_options))
    def test_stats_pipeline_with_sample_count(self):
        """With sample_count=1, stats match the sampling fixture result."""
        # Three identical examples, each carrying a 3000-value int feature 'c'.
        examples = [
            {'c': np.linspace(1, 3000, 3000, dtype=np.int32)}
            for _ in range(3)
        ]

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                sample_count=1,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001)
            result = (
                p
                | beam.Create(examples)
                | stats_api.GenerateStatistics(options))
            # Compare against the shared sampling fixture on the test class.
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, self._sampling_test_expected_result))
    def test_stats_pipeline_with_sample_count(self):
        """Sampling with sample_count equal to the input size keeps all rows."""
        # Three identical single-row record batches with an int feature 'c'.
        record_batches = [
            pa.RecordBatch.from_arrays(
                [pa.array([np.linspace(1, 3000, 3000, dtype=np.int32)])],
                ['c'])
            for _ in range(3)
        ]

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                sample_count=3000,
                num_top_values=2,
                num_rank_histogram_buckets=2,
                num_values_histogram_buckets=2,
                num_histogram_buckets=2,
                num_quantiles_histogram_buckets=2,
                epsilon=0.001,
                desired_batch_size=3000)
            result = (
                p
                | beam.Create(record_batches)
                | stats_api.GenerateStatistics(options))
            # Compare against the shared sampling fixture on the test class.
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, self._sampling_test_expected_result))
 def test_invalid_feature_whitelist(self):
     """A dict (non-list) feature_whitelist must raise TypeError."""
     with self.assertRaises(TypeError):
         with beam.Pipeline() as p:
             bad_options = stats_options.StatsOptions(feature_whitelist={})
             _ = (
                 p
                 | beam.Create([{'a': np.array([1.0, 2.0])}])
                 | stats_api.GenerateStatistics(bad_options))
 def test_invalid_stats_options(self):
   record_batches = [pa.RecordBatch.from_arrays([])]
   with self.assertRaisesRegexp(TypeError, '.*should be a StatsOptions.'):
     with beam.Pipeline() as p:
       _ = (
           p | beam.Create(record_batches)
           | stats_api.GenerateStatistics(options={}))
 def test_invalid_both_sample_count_and_sample_rate(self):
     """Supplying sample_count and sample_rate together raises ValueError."""
     with self.assertRaises(ValueError):
         with beam.Pipeline() as p:
             conflicting = stats_options.StatsOptions(
                 sample_count=100, sample_rate=0.5)
             _ = (
                 p
                 | beam.Create([{}])
                 | stats_api.GenerateStatistics(conflicting))
def generate_statistics_from_tfrecord(
        data_location,
        output_path=None,
        stats_options=None,
        pipeline_options=None,
):
    """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics. If None,
      a default StatsOptions is used.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    # Avoid the mutable-default-argument pitfall: the previous default of
    # options.StatsOptions() was a single instance shared by every call.
    if stats_options is None:
        stats_options = options.StatsOptions()
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    # tf.gfile is the deprecated TF1 alias; use the tf.io.gfile API as the
    # typed variant of this function in this file already does.
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # Auto detect tfrecord file compression format based on input data
        # path suffix.
        _ = (
            p
            |
            'ReadData' >> beam.io.ReadFromTFRecord(file_pattern=data_location)
            | 'DecodeData' >> tf_example_decoder.DecodeTFExample()
            |
            'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
            # TODO(b/112014711) Implement a custom sink to write the stats proto.
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
    return load_statistics(output_path)
def generate_statistics_from_tfrecord(
    data_location: Text,
    # NOTE(review): the path is built with os.path.join on str components and
    # handed to str-based APIs, so Text is the accurate annotation (was bytes).
    output_path: Optional[Text] = None,
    # NOTE(review): mutable default — this single StatsOptions instance is
    # shared across every call that does not pass its own options.
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
  """Compute data statistics from TFRecord files containing TFExamples.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in TFRecord format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    output_path: The file path to output data statistics result to. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
      If you run this function on Google Cloud, you must specify an
      output_path. Specifying None may cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
  if output_path is None:
    output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
  output_dir_path = os.path.dirname(output_path)
  # Ensure the output directory exists before the pipeline tries to write.
  if not tf.io.gfile.exists(output_dir_path):
    tf.io.gfile.makedirs(output_dir_path)

  # Pass the caller's desired batch size straight through to the TFXIO source.
  batch_size = stats_options.desired_batch_size
  # PyLint doesn't understand Beam PTransforms.
  # pylint: disable=no-value-for-parameter
  with beam.Pipeline(options=pipeline_options) as p:
    # Auto detect tfrecord file compression format based on input data
    # path suffix.
    _ = (
        p
        | 'ReadData' >> (tf_example_record.TFExampleRecord(
            file_pattern=data_location,
            schema=None,
            telemetry_descriptors=['tfdv', 'generate_statistics_from_tfrecord'])
                         .BeamSource(batch_size))
        | 'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
        | 'WriteStatsOutput' >>
        (stats_api.WriteStatisticsToTFRecord(output_path)))
  return stats_util.load_statistics(output_path)
Example #9
0
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Collect one (split name, uri) pair per split of every input artifact.
        split_uris = []
        for artifact in input_dict['input_data']:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        # A single Beam pipeline computes statistics for all splits; each split
        # gets its own chain of PTransforms with split-suffixed unique names.
        with self._make_beam_pipeline() as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                # Read TFRecords -> decode tf.Examples -> compute stats ->
                # write one unsharded TFRecord containing a single
                # DatasetFeatureStatisticsList proto.
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
 def test_empty_input(self):
     """An empty input collection yields stats reporting zero examples."""
     expected_result = text_format.Parse(
         """
 datasets {
   num_examples: 0
 }
 """, statistics_pb2.DatasetFeatureStatisticsList())
     with beam.Pipeline() as p:
         result = (
             p
             | beam.Create([])
             | stats_api.GenerateStatistics(stats_options.StatsOptions()))
         util.assert_that(
             result,
             test_util.make_dataset_feature_stats_list_proto_equal_fn(
                 self, expected_result))
Example #11
0
 def test_stats_pipeline_with_zero_examples(self):
     """Zero input examples produce an empty DatasetFeatureStatisticsList."""
     expected_result = statistics_pb2.DatasetFeatureStatisticsList()
     opts = stats_options.StatsOptions(
         num_top_values=1,
         num_rank_histogram_buckets=1,
         num_values_histogram_buckets=2,
         num_histogram_buckets=1,
         num_quantiles_histogram_buckets=1,
         epsilon=0.001)
     with beam.Pipeline() as p:
         result = (
             p
             | beam.Create([])
             | stats_api.GenerateStatistics(opts))
         util.assert_that(
             result,
             test_util.make_dataset_feature_stats_list_proto_equal_fn(
                 self, expected_result))
Example #12
0
    def Do(self, input_dict: Dict[Text, List[types.TfxType]],
           output_dict: Dict[Text, List[types.TfxType]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of 'ExamplesPath' type. This should contain both
          'train' and 'eval' split.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of 'ExampleStatisticsPath' type. This should contain
          both 'train' and 'eval' split.
      exec_properties: A dict of execution properties. Not used yet.

    Returns:
      None
    """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Map each split name to its input artifact.
        split_to_instance = {x.split: x for x in input_dict['input_data']}
        # A single Beam pipeline computes statistics for all splits; each split
        # gets its own chain of PTransforms with split-suffixed unique names.
        with beam.Pipeline(argv=self._get_beam_pipeline_args()) as p:
            # TODO(b/126263006): Support more stats_options through config.
            stats_options = options.StatsOptions()
            for split, instance in split_to_instance.items():
                tf.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(instance.uri)
                output_uri = types.get_split_uri(output_dict['output'], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                # Read TFRecords -> decode tf.Examples -> compute stats ->
                # write one unsharded TFRecord containing a single
                # DatasetFeatureStatisticsList proto.
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                # Bug fix: this log previously ran once after the loop and so
                # reported only the last split's output_uri. Logging inside
                # the loop reports every split, matching the newer executor.
                tf.logging.info('Statistics written to {}.'.format(output_uri))
 def test_invalid_stats_options(self):
     """A plain dict passed as options must raise TypeError."""
     examples = [{'a': np.array([1.0, 2.0])}]
     with self.assertRaisesRegexp(TypeError, '.*should be a StatsOptions.'):
         with beam.Pipeline() as p:
             _ = (
                 p
                 | beam.Create(examples)
                 | stats_api.GenerateStatistics(options={}))
    def test_stats_pipeline_with_examples_with_no_values(self):
        """Features whose value lists are all empty still get common stats.

        Three record batches each carry empty lists for 'a' (float),
        'b' (binary) and 'c' (int), plus a weight feature 'w' with value 2;
        the expected proto asserts the common/weighted stats for each.
        """
        record_batches = [
            pa.RecordBatch.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.RecordBatch.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.RecordBatch.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w'])
        ]

        expected_result = text_format.Parse(
            """
      datasets{
        num_examples: 3
        features {
          path {
            step: 'a'
          }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'b'
          }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'c'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
          step: 'w'
        }
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 1
            max_num_values: 1
            avg_num_values: 1.0
            tot_num_values: 3
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 1.5
              }
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 1.5
              }
              type: QUANTILES
            }
            weighted_common_stats {
                num_non_missing: 6.0
                avg_num_values: 1.0
                tot_num_values: 6.0
            }
          }
          mean: 2.0
          std_dev: 0.0
          min: 2.0
          max: 2.0
          median: 2.0
          histograms {
            buckets {
              low_value: 2.0
              high_value: 2.0
              sample_count: 3.0
            }
            type: STANDARD
          }
          histograms {
            buckets {
              low_value: 2.0
              high_value: 2.0
              sample_count: 3.0
            }
            type: QUANTILES
          }
          weighted_numeric_stats {
            mean: 2.0
            median: 2.0
            histograms {
              buckets {
                low_value: 2.0
                high_value: 2.0
                sample_count: 6.0
              }
              type: STANDARD
            }
            histograms {
              buckets {
                low_value: 2.0
                high_value: 2.0
                sample_count: 6.0
              }
              type: QUANTILES
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())
        # 'w' is the weight feature, so weighted_* stats count each example
        # twice (weight 2).
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=1,
                num_rank_histogram_buckets=1,
                num_values_histogram_buckets=2,
                num_histogram_buckets=1,
                num_quantiles_histogram_buckets=1,
                epsilon=0.001)
            result = (p | beam.Create(record_batches)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
    def test_stats_pipeline(self):
        """End-to-end stats over three record batches with mixed feature types.

        Feature 'a' is float (including a NaN), 'b' is string, and 'c' is a
        large int feature; the expected proto pins the numeric, string and
        histogram statistics for each.
        """
        record_batches = [
            pa.RecordBatch.from_arrays([
                pa.array([[1.0, 2.0]]),
                pa.array([['a', 'b', 'c', 'e']]),
                pa.array([np.linspace(1, 500, 500, dtype=np.int32)]),
            ], ['a', 'b', 'c']),
            pa.RecordBatch.from_arrays([
                pa.array([[3.0, 4.0, np.NaN, 5.0]]),
                pa.array([['a', 'c', 'd', 'a']]),
                pa.array([np.linspace(501, 1250, 750, dtype=np.int32)]),
            ], ['a', 'b', 'c']),
            pa.RecordBatch.from_arrays([
                pa.array([[1.0]]),
                pa.array([['a', 'b', 'c', 'd']]),
                pa.array([np.linspace(1251, 3000, 1750, dtype=np.int32)]),
            ], ['a', 'b', 'c'])
        ]

        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 3
      features {
        path {
          step: 'a'
        }
        type: FLOAT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 1
            max_num_values: 4
            avg_num_values: 2.33333333
            tot_num_values: 7
            num_values_histogram {
              buckets {
                low_value: 1.0
                high_value: 1.0
                sample_count: 1.0
              }
              buckets {
                low_value: 1.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          mean: 2.66666666
          std_dev: 1.49071198
          num_zeros: 0
          min: 1.0
          max: 5.0
          median: 3.0
          histograms {
            num_nan: 1
            buckets {
              low_value: 1.0
              high_value: 2.3333333
              sample_count: 2.9866667
            }
            buckets {
              low_value: 2.3333333
              high_value: 3.6666667
              sample_count: 1.0066667
            }
            buckets {
              low_value: 3.6666667
              high_value: 5.0
              sample_count: 2.0066667
            }
            type: STANDARD
          }
          histograms {
            num_nan: 1
            buckets {
              low_value: 1.0
              high_value: 1.0
              sample_count: 1.5
            }
            buckets {
              low_value: 1.0
              high_value: 3.0
              sample_count: 1.5
            }
            buckets {
              low_value: 3.0
              high_value: 4.0
              sample_count: 1.5
            }
            buckets {
              low_value: 4.0
              high_value: 5.0
              sample_count: 1.5
            }
            type: QUANTILES
          }
        }
      }
      features {
        path {
          step: 'c'
        }
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 0
            min_num_values: 500
            max_num_values: 1750
            avg_num_values: 1000.0
            tot_num_values: 3000
            num_values_histogram {
              buckets {
                low_value: 500.0
                high_value: 500.0
                sample_count: 1.0
              }
              buckets {
                low_value: 500.0
                high_value: 1750.0
                sample_count: 1.0
              }
              buckets {
                low_value: 1750.0
                high_value: 1750.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          mean: 1500.5
          std_dev: 866.025355672
          min: 1.0
          max: 3000.0
          median: 1501.0
          histograms {
            buckets {
              low_value: 1.0
              high_value: 1000.66666667
              sample_count: 999.666666667
            }
            buckets {
              low_value: 1000.66666667
              high_value: 2000.33333333
              sample_count: 999.666666667
            }
            buckets {
              low_value: 2000.33333333
              high_value: 3000.0
              sample_count: 1000.66666667
            }
            type: STANDARD
          }
          histograms {
            buckets {
              low_value: 1.0
              high_value: 751.0
              sample_count: 750.0
            }
            buckets {
              low_value: 751.0
              high_value: 1501.0
              sample_count: 750.0
            }
            buckets {
              low_value: 1501.0
              high_value: 2250.0
              sample_count: 750.0
            }
            buckets {
              low_value: 2250.0
              high_value: 3000.0
              sample_count: 750.0
            }
            type: QUANTILES
          }
        }
      }
      features {
        path {
          step: 'b'
        }
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            min_num_values: 4
            max_num_values: 4
            avg_num_values: 4.0
            tot_num_values: 12
            num_values_histogram {
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          unique: 5
          top_values {
            value: "a"
            frequency: 4.0
          }
          top_values {
            value: "c"
            frequency: 3.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        # Histogram/top-value bucket counts below must match the bucket counts
        # asserted in the expected proto above.
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                num_top_values=2,
                num_rank_histogram_buckets=3,
                num_values_histogram_buckets=3,
                num_histogram_buckets=3,
                num_quantiles_histogram_buckets=4,
                epsilon=0.001)
            result = (p | beam.Create(record_batches)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
def generate_statistics_from_csv(
        data_location,
        column_names=None,
        delimiter=',',
        output_path=None,
        stats_options=options.StatsOptions(),
        pipeline_options=None,
):
    """Compute data statistics from CSV files.

  Runs a Beam pipeline to compute the data statistics and return the result
  data statistics proto.

  This is a convenience method for users with data in CSV format.
  Users with data in unsupported file/data formats, or users who wish
  to create their own Beam pipelines need to use the 'GenerateStatistics'
  PTransform API directly instead.

  Args:
    data_location: The location of the input data files.
    column_names: A list of column names to be treated as the CSV header. Order
      must match the order in the input CSV files. If this argument is not
      specified, we assume the first line in the input CSV files as the
      header. Note that this option is valid only for 'csv' input file format.
    delimiter: A one-character string used to separate fields in a CSV file.
    output_path: The file path to output data statistics result to. If None, we
      use a temporary directory. It will be a TFRecord file containing a single
      data statistics proto, and can be read with the 'load_statistics' API.
    stats_options: Options for generating data statistics.
    pipeline_options: Optional beam pipeline options. This allows users to
      specify various beam pipeline execution parameters like pipeline runner
      (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
      See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    # NOTE(review): `stats_options=options.StatsOptions()` is a shared mutable
    # default; it is presumably treated as read-only here, but verify callers
    # never mutate it.
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    # `tf.gfile` was removed in TF 2.x; use the `tf.io.gfile` API instead,
    # consistent with the other statistics helpers in this file.
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # If a header is not provided, assume the first line in a file
        # to be the header.
        skip_header_lines = 1 if column_names is None else 0
        if column_names is None:
            column_names = _get_csv_header(data_location, delimiter)
        _ = (
            p
            | 'ReadData' >> beam.io.textio.ReadFromText(
                file_pattern=data_location,
                skip_header_lines=skip_header_lines)
            | 'DecodeData' >> csv_decoder.DecodeCSV(
                column_names=column_names,
                delimiter=delimiter,
                schema=stats_options.schema,
                infer_type_from_schema=stats_options.infer_type_from_schema)
            |
            'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                output_path,
                shard_name_template='',
                coder=beam.coders.ProtoCoder(
                    statistics_pb2.DatasetFeatureStatisticsList)))
    # The pipeline has exited its context (i.e. run to completion) at this
    # point, so the output file is ready to be read back.
    return load_statistics(output_path)
Beispiel #17
0
  def Do(self, input_dict: Dict[str, List[types.Artifact]],
         output_dict: Dict[str, List[types.Artifact]],
         exec_properties: Dict[str, Any]) -> None:
    """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - examples: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - statistics: A list of type `standard_artifacts.ExampleStatistics`.
          This should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.
        - exclude_splits: JSON-serialized list of names of splits where
          statistics and sample should not be generated.

    Raises:
      ValueError when a schema is provided both as an input and as part of the
      StatsOptions exec_property.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    # Load and deserialize exclude splits from execution properties.
    exclude_splits = json_utils.loads(
        exec_properties.get(standard_component_specs.EXCLUDE_SPLITS_KEY,
                            'null')) or []
    if not isinstance(exclude_splits, list):
      raise ValueError('exclude_splits in execution properties needs to be a '
                       'list. Got %s instead.' % type(exclude_splits))
    # Setup output splits.
    examples = artifact_utils.get_single_instance(
        input_dict[standard_component_specs.EXAMPLES_KEY])
    examples_split_names = artifact_utils.decode_split_names(
        examples.split_names)
    split_names = [
        split for split in examples_split_names if split not in exclude_splits
    ]
    statistics_artifact = artifact_utils.get_single_instance(
        output_dict[standard_component_specs.STATISTICS_KEY])
    statistics_artifact.split_names = artifact_utils.encode_split_names(
        split_names)

    stats_options = options.StatsOptions()
    stats_options_json = exec_properties.get(
        standard_component_specs.STATS_OPTIONS_JSON_KEY)
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
    if input_dict.get(standard_component_specs.SCHEMA_KEY):
      # A schema may come from exactly one place: the input artifact or the
      # serialized StatsOptions. Providing both is ambiguous, so fail fast.
      if stats_options.schema:
        raise ValueError('A schema was provided as an input and the '
                         'stats_options exec_property also contains a schema '
                         'value. At most one of these may be set.')
      else:
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(
                    input_dict[standard_component_specs.SCHEMA_KEY])))
        stats_options.schema = schema

    split_and_tfxio = []
    tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
        examples=[examples],
        telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
    # `split_names` was already computed above as the decoded split names
    # minus `exclude_splits`; reuse it instead of re-decoding and
    # re-filtering the artifact's split names.
    for split in split_names:
      uri = artifact_utils.get_split_uri([examples], split)
      split_and_tfxio.append(
          (split, tfxio_factory(io_utils.all_files_pattern(uri))))
    with self._make_beam_pipeline() as p:
      for split, tfxio in split_and_tfxio:
        logging.info('Generating statistics for split %s.', split)
        output_uri = artifact_utils.get_split_uri(
            output_dict[standard_component_specs.STATISTICS_KEY], split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        data = p | 'TFXIORead[%s]' % split >> tfxio.BeamSource()
        _ = (
            data
            | 'GenerateStatistics[%s]' % split >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput[%s]' % split >>
            stats_api.WriteStatisticsToBinaryFile(output_path))
        logging.info('Statistics for split %s written to %s.', split,
                     output_uri)
Beispiel #18
0
    def test_stats_pipeline_with_examples_with_no_values(self):
        """Stats generation over rows whose list-valued features are empty.

        Three identical single-row tables carry empty lists for features
        'a' (float), 'b' (binary) and 'c' (int) plus a weight feature
        'w' == [2]. The expected proto reports num_non_missing == 3 for
        each feature, empty-valued quantile buckets, and weighted counts
        of 6 (weight 2 across 3 rows).
        """
        tables = [
            pa.Table.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.Table.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w']),
            pa.Table.from_arrays([
                pa.array([[]], type=pa.list_(pa.float32())),
                pa.array([[]], type=pa.list_(pa.binary())),
                pa.array([[]], type=pa.list_(pa.int32())),
                pa.array([[2]]),
            ], ['a', 'b', 'c', 'w'])
        ]

        expected_result = text_format.Parse(
            """
      datasets{
        num_examples: 3
        features {
          path {
            step: 'a'
          }
          type: FLOAT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'b'
          }
          type: STRING
          string_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
        features {
          path {
            step: 'c'
          }
          type: INT
          num_stats {
            common_stats {
              num_non_missing: 3
              num_values_histogram {
                buckets {
                  sample_count: 1.5
                }
                buckets {
                  sample_count: 1.5
                }
                type: QUANTILES
              }
              weighted_common_stats {
                num_non_missing: 6
              }
            }
          }
        }
      }
    """, statistics_pb2.DatasetFeatureStatisticsList())
        # Run the full GenerateStatistics pipeline and compare protos.
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=1,
                num_rank_histogram_buckets=1,
                num_values_histogram_buckets=2,
                num_histogram_buckets=1,
                num_quantiles_histogram_buckets=1,
                epsilon=0.001)
            result = (p | beam.Create(tables)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
    def test_stats_pipeline_with_weight_feature(self):
        """Stats generation with a per-example weight feature 'w'.

        Four examples (some with missing features) are weighted by 'w';
        the expected proto contains both the plain statistics and the
        weighted counterparts (weighted_common_stats,
        weighted_numeric_stats, weighted_string_stats).
        """
        # input with four examples.
        examples = [{
            'a': np.array([1.0, 2.0]),
            'b': np.array(['a', 'b', 'c', 'e']),
            'w': np.array([1.0])
        }, {
            'a': np.array([3.0, 4.0, 5.0]),
            'b': None,
            'w': np.array([2.0])
        }, {
            'a': np.array([
                1.0,
            ]),
            'b': np.array(['d', 'e']),
            'w': np.array([
                3.0,
            ])
        }, {
            'a': None,
            'b': np.array(['a', 'c', 'd', 'a']),
            'w': np.array([1.0])
        }]

        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 4
      features {
        name: 'a'
        type: FLOAT
        num_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 1
            min_num_values: 1
            max_num_values: 3
            avg_num_values: 2.0
            tot_num_values: 6
            num_values_histogram {
              buckets { low_value: 1.0 high_value: 2.0 sample_count: 1 }
              buckets { low_value: 2.0 high_value: 3.0 sample_count: 1 }
              buckets { low_value: 3.0 high_value: 3.0 sample_count: 1 }
              type: QUANTILES
            }
            weighted_common_stats {
              num_non_missing: 6.0
              num_missing: 1.0
              avg_num_values: 1.83333333
              tot_num_values: 11.0
            }
          }
          mean: 2.66666666
          std_dev: 1.49071198
          num_zeros: 0
          min: 1.0
          max: 5.0
          median: 3.0
          histograms {
            buckets {
              low_value: 1.0
              high_value: 2.3333333
              sample_count: 2.9866667
            }
            buckets {
              low_value: 2.3333333
              high_value: 3.6666667
              sample_count: 1.0066667
            }
            buckets {
              low_value: 3.6666667
              high_value: 5.0
              sample_count: 2.0066667
            }
            type: STANDARD
          }
          histograms {
            buckets { low_value: 1.0 high_value: 1.0 sample_count: 1.5 }
            buckets { low_value: 1.0 high_value: 3.0 sample_count: 1.5 }
            buckets { low_value: 3.0 high_value: 4.0 sample_count: 1.5 }
            buckets { low_value: 4.0 high_value: 5.0 sample_count: 1.5 }
            type: QUANTILES
          }
          weighted_numeric_stats {
            mean: 2.7272727
            std_dev: 1.5427784
            median: 3.0
            histograms {
              buckets {
              low_value: 1.0
                high_value: 2.3333333
                sample_count: 4.9988889
              }
              buckets {
                low_value: 2.3333333
                high_value: 3.6666667
                sample_count: 1.9922222
              }
              buckets {
                low_value: 3.6666667
                high_value: 5.0
                sample_count: 4.0088889
              }
            }
            histograms {
              buckets { low_value: 1.0 high_value: 1.0 sample_count: 2.75 }
              buckets { low_value: 1.0 high_value: 3.0 sample_count: 2.75 }
              buckets { low_value: 3.0 high_value: 4.0 sample_count: 2.75 }
              buckets { low_value: 4.0 high_value: 5.0 sample_count: 2.75 }
              type: QUANTILES
            }
          }
        }
      }
      features {
        name: 'b'
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            num_missing: 1
            min_num_values: 2
            max_num_values: 4
            avg_num_values: 3.33333301544
            num_values_histogram {
              buckets { low_value: 2.0 high_value: 4.0 sample_count: 1.0 }
              buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
              buckets { low_value: 4.0 high_value: 4.0 sample_count: 1.0 }
              type: QUANTILES
            }
            weighted_common_stats {
              num_non_missing: 5.0
              num_missing: 2.0
              avg_num_values: 2.8
              tot_num_values: 14.0
            }
            tot_num_values: 10
          }
          avg_length: 1.0
          unique: 5
          top_values { value: 'a' frequency: 3.0 }
          top_values { value: 'e' frequency: 2.0 }
          rank_histogram {
            buckets { low_rank: 0 high_rank: 0 label: "a" sample_count: 3.0 }
            buckets { low_rank: 1 high_rank: 1 label: "e" sample_count: 2.0 }
            buckets { low_rank: 2 high_rank: 2 label: "d" sample_count: 2.0 }
          }
          weighted_string_stats {
            top_values { value: 'e' frequency: 4.0 }
            top_values { value: 'd' frequency: 4.0 }
            rank_histogram {
              buckets { low_rank: 0 high_rank: 0 label: "e" sample_count: 4.0 }
              buckets { low_rank: 1 high_rank: 1 label: "d" sample_count: 4.0 }
              buckets { low_rank: 2 high_rank: 2 label: "a" sample_count: 3.0 }
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        # Run the full GenerateStatistics pipeline and compare protos.
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                weight_feature='w',
                num_top_values=2,
                num_rank_histogram_buckets=3,
                num_values_histogram_buckets=3,
                num_histogram_buckets=3,
                num_quantiles_histogram_buckets=4)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
Beispiel #20
0
  def Do(self, input_dict: Dict[Text, List[types.Artifact]],
         output_dict: Dict[Text, List[types.Artifact]],
         exec_properties: Dict[Text, Any]) -> None:
    """Computes stats for each split of input using tensorflow_data_validation.

    Args:
      input_dict: Input dict from input key to a list of Artifacts.
        - input_data: A list of type `standard_artifacts.Examples`. This should
          contain both 'train' and 'eval' split.
        - schema: Optionally, a list of type `standard_artifacts.Schema`. When
          the stats_options exec_property also contains a schema, this input
          should not be provided.
      output_dict: Output dict from output key to a list of Artifacts.
        - output: A list of type `standard_artifacts.ExampleStatistics`. This
          should contain both the 'train' and 'eval' splits.
      exec_properties: A dict of execution properties.
        - stats_options_json: Optionally, a JSON representation of StatsOptions.
          When a schema is provided as an input, the StatsOptions value should
          not also contain a schema.

    Raises:
      ValueError when a schema is provided both as an input and as part of the
      StatsOptions exec_property.

    Returns:
      None
    """
    self._log_startup(input_dict, output_dict, exec_properties)

    stats_options = options.StatsOptions()
    # Single dict.get lookup replaces the membership-test-then-index pair;
    # a missing key and an empty JSON string both fall through to the
    # default StatsOptions.
    stats_options_json = exec_properties.get(STATS_OPTIONS_JSON_KEY)
    if stats_options_json:
      # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
      # json_utils
      stats_options = options.StatsOptions.from_json(stats_options_json)
    if input_dict.get(SCHEMA_KEY):
      # A schema may come from exactly one place: the input artifact or the
      # serialized StatsOptions. Providing both is ambiguous, so fail fast.
      if stats_options.schema:
        raise ValueError('A schema was provided as an input and the '
                         'stats_options exec_property also contains a schema '
                         'value. At most one of these may be set.')
      else:
        schema = io_utils.SchemaReader().read(
            io_utils.get_only_uri_in_dir(
                artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
        stats_options.schema = schema

    # Collect one (split name, TFXIO source) pair per split of every input
    # examples artifact.
    split_and_tfxio = []
    for artifact in input_dict[EXAMPLES_KEY]:
      tfxio_factory = tfxio_utils.get_tfxio_factory_from_artifact(
          examples=artifact, telemetry_descriptors=_TELEMETRY_DESCRIPTORS)
      for split in artifact_utils.decode_split_names(artifact.split_names):
        uri = os.path.join(artifact.uri, split)
        split_and_tfxio.append(
            (split, tfxio_factory(io_utils.all_files_pattern(uri))))
    with self._make_beam_pipeline() as p:
      for split, tfxio in split_and_tfxio:
        absl.logging.info('Generating statistics for split {}'.format(split))
        output_uri = artifact_utils.get_split_uri(output_dict[STATISTICS_KEY],
                                                  split)
        output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
        data = p | 'TFXIORead[{}]'.format(split) >> tfxio.BeamSource()
        _ = (
            data
            | 'GenerateStatistics[{}]'.format(split) >>
            stats_api.GenerateStatistics(stats_options)
            | 'WriteStatsOutput[{}]'.format(split) >>
            stats_api.WriteStatisticsToTFRecord(output_path))
        absl.logging.info('Statistics for split {} written to {}.'.format(
            split, output_uri))
Beispiel #21
0
def generate_statistics_from_csv(
    data_location: Text,
    column_names: Optional[List[types.FeatureName]] = None,
    delimiter: Text = ',',
    output_path: Optional[bytes] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
    compression_type: Text = CompressionTypes.AUTO,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Compute data statistics from CSV files.

  Runs a Beam pipeline that reads the CSV data, decodes it, generates the
  statistics, writes them to `output_path` as a TFRecord containing one
  DatasetFeatureStatisticsList proto, and then loads that proto back.

  Convenience wrapper for CSV data; for other formats, or for custom
  pipelines, use the 'GenerateStatistics' PTransform directly.

  Args:
    data_location: The location of the input data files.
    column_names: Column names to use as the CSV header, in file order. When
      omitted, the first line of each input file is treated as the header.
    delimiter: A one-character string used to separate fields in a CSV file.
    output_path: Where to write the statistics TFRecord. Defaults to a file
      in a fresh temporary directory (readable via the 'load_statistics'
      API). On Google Cloud an explicit output_path is required; None may
      cause an error.
    stats_options: `tfdv.StatsOptions` for generating data statistics.
    pipeline_options: Optional beam pipeline options (runner, project id,
      etc.). See
      https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
      more details.
    compression_type: How to handle compressed inputs; the default
      CompressionTypes.AUTO infers compression from the file extension.

  Returns:
    A DatasetFeatureStatisticsList proto.
  """
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    stats_dir = os.path.dirname(output_path)
    if not tf.io.gfile.exists(stats_dir):
        tf.io.gfile.makedirs(stats_dir)

    # A positive caller-supplied batch size wins; anything else (None, 0,
    # negative) falls back to the library default.
    desired = stats_options.desired_batch_size
    batch_size = (desired if desired and desired > 0 else
                  constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # With no caller-supplied header, each file's first line is the
        # header: skip it during the read and parse it for column names.
        skip_header_lines = 0 if column_names is not None else 1
        if column_names is None:
            column_names = get_csv_header(data_location, delimiter)
        raw_lines = p | 'ReadData' >> beam.io.textio.ReadFromText(
            file_pattern=data_location,
            skip_header_lines=skip_header_lines,
            compression_type=compression_type)
        decoded = raw_lines | 'DecodeData' >> csv_decoder.DecodeCSV(
            column_names=column_names,
            delimiter=delimiter,
            schema=stats_options.schema
            if stats_options.infer_type_from_schema else None,
            desired_batch_size=batch_size)
        stats = decoded | 'GenerateStatistics' >> stats_api.GenerateStatistics(
            stats_options)
        _ = stats | 'WriteStatsOutput' >> stats_api.WriteStatisticsToTFRecord(
            output_path)
    return stats_util.load_statistics(output_path)
 def test_invalid_custom_generators(self):
     """A non-list value for the `generators` option must raise TypeError."""
     input_examples = [{'a': np.array([1.0, 2.0])}]
     with self.assertRaises(TypeError):
         with beam.Pipeline() as p:
             # Constructed inside assertRaises: the rejection may happen at
             # StatsOptions construction time.
             bad_options = stats_options.StatsOptions(generators={})
             _ = (p
                  | beam.Create(input_examples)
                  | stats_api.GenerateStatistics(bad_options))
    def test_custom_generators(self):
        """Stats from a custom TransformStatsGenerator are merged into output.

        A dummy PTransform emits two DatasetFeatureStatistics protos with
        custom_stats for features 'a' and 'b'; the expected result contains
        those custom_stats alongside the regular basic statistics.
        """

        # Dummy PTransform that returns two DatasetFeatureStatistics protos.
        class CustomPTransform(beam.PTransform):
            def expand(self, pcoll):
                stats_proto1 = statistics_pb2.DatasetFeatureStatistics()
                proto1_feat = stats_proto1.features.add()
                proto1_feat.name = 'a'
                custom_stat1 = proto1_feat.custom_stats.add()
                custom_stat1.name = 'my_stat_a'
                custom_stat1.str = 'my_val_a'

                stats_proto2 = statistics_pb2.DatasetFeatureStatistics()
                proto2_feat = stats_proto2.features.add()
                proto2_feat.name = 'b'
                custom_stat2 = proto2_feat.custom_stats.add()
                custom_stat2.name = 'my_stat_b'
                custom_stat2.str = 'my_val_b'
                return [stats_proto1, stats_proto2]

        # Single example with empty int features for 'a' and 'b'.
        examples = [{
            'a': np.array([], dtype=np.int32),
            'b': np.array([], dtype=np.int32)
        }]
        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 1
      features {
        name: 'a'
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            tot_num_values: 0
            num_values_histogram {
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
        }
        custom_stats {
          name: 'my_stat_a'
          str: 'my_val_a'
        }
      }
      features {
        name: 'b'
        type: INT
        num_stats {
          common_stats {
            num_non_missing: 1
            num_missing: 0
            tot_num_values: 0
            num_values_histogram {
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              buckets {
                low_value: 0
                high_value: 0
                sample_count: 0.5
              }
              type: QUANTILES
            }
          }
        }
        custom_stats {
          name: 'my_stat_b'
          str: 'my_val_b'
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        # Create a transform stats generator.
        transform_stats_gen = stats_generator.TransformStatsGenerator(
            name='CustomStatsGenerator', ptransform=CustomPTransform())
        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                generators=[transform_stats_gen],
                num_values_histogram_buckets=2)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
def generate_statistics_from_tfrecord(
    data_location: Text,
    output_path: Optional[Text] = None,
    stats_options: options.StatsOptions = options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
    compression_type: Text = CompressionTypes.AUTO,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Computes data statistics from TFRecord files containing TFExamples.

    Runs a Beam pipeline to compute the data statistics and returns the
    resulting data statistics proto.

    This is a convenience method for users with data in TFRecord format.
    Users with data in unsupported file/data formats, or users who wish
    to create their own Beam pipelines, need to use the 'GenerateStatistics'
    PTransform API directly instead.

    Args:
      data_location: The location of the input data files.
      output_path: The file path to output data statistics result to. If
        None, a temporary directory is used. The output will be a TFRecord
        file containing a single data statistics proto, and can be read with
        the 'load_statistics' API. If you run this function on Google Cloud,
        you must specify an output_path. Specifying None may cause an error.
      stats_options: `tfdv.StatsOptions` for generating data statistics.
        NOTE: the default is a single module-level instance shared across
        calls; this function does not mutate it.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline
        runner (DirectRunner or DataflowRunner), cloud dataflow service
        project id, etc. See
        https://cloud.google.com/dataflow/pipelines/specifying-exec-params
        for more details.
      compression_type: Used to handle compressed input files. Default value
        is CompressionTypes.AUTO, in which case the file_path's extension
        will be used to detect the compression.

    Returns:
      A DatasetFeatureStatisticsList proto.
    """
    if output_path is None:
        output_path = os.path.join(tempfile.mkdtemp(), 'data_stats.tfrecord')
    output_dir_path = os.path.dirname(output_path)
    # Ensure the output directory exists before the pipeline tries to write.
    if not tf.io.gfile.exists(output_dir_path):
        tf.io.gfile.makedirs(output_dir_path)

    # Use the caller-provided batch size only when it is a positive number;
    # otherwise fall back to the library default.
    if stats_options.desired_batch_size and stats_options.desired_batch_size > 0:
        batch_size = stats_options.desired_batch_size
    else:
        batch_size = constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE
    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter
    with beam.Pipeline(options=pipeline_options) as p:
        # Auto detect tfrecord file compression format based on input data
        # path suffix.
        _ = (
            p
            | 'ReadData' >> beam.io.ReadFromTFRecord(
                file_pattern=data_location, compression_type=compression_type)
            | 'DecodeData' >>
            tf_example_decoder.DecodeTFExample(desired_batch_size=batch_size)
            |
            'GenerateStatistics' >> stats_api.GenerateStatistics(stats_options)
            # TODO(b/112014711) Implement a custom sink to write the stats proto.
            | 'WriteStatsOutput' >>
            stats_api.WriteStatisticsToTFRecord(output_path))
    return stats_util.load_statistics(output_path)
    def test_stats_pipeline_with_feature_whitelist(self):
        """Tests that statistics are generated only for whitelisted features.

        The input carries features 'a', 'b' and 'c', but only 'b' is listed
        in `feature_whitelist`, so the expected statistics contain feature
        'b' alone.
        """
        # Input with three examples. Feature 'a' includes a NaN value; 'c'
        # is a large integer feature (whitelist should drop both).
        # NOTE: np.nan, not np.NaN -- the capitalized alias was removed in
        # NumPy 2.0.
        examples = [{
            'a': np.array([1.0, 2.0]),
            'b': np.array(['a', 'b', 'c', 'e']),
            'c': np.linspace(1, 500, 500, dtype=np.int32)
        }, {
            'a': np.array([3.0, 4.0, np.nan, 5.0]),
            'b': np.array(['a', 'c', 'd', 'a']),
            'c': np.linspace(501, 1250, 750, dtype=np.int32)
        }, {
            'a': np.array([1.0]),
            'b': np.array(['a', 'b', 'c', 'd']),
            'c': np.linspace(1251, 3000, 1750, dtype=np.int32)
        }]

        # Expected output: stats for feature 'b' only.
        expected_result = text_format.Parse(
            """
    datasets {
      num_examples: 3
      features {
        name: "b"
        type: STRING
        string_stats {
          common_stats {
            num_non_missing: 3
            min_num_values: 4
            max_num_values: 4
            avg_num_values: 4.0
            tot_num_values: 12
            num_values_histogram {
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              buckets {
                low_value: 4.0
                high_value: 4.0
                sample_count: 1.0
              }
              type: QUANTILES
            }
          }
          unique: 5
          top_values {
            value: "a"
            frequency: 4.0
          }
          top_values {
            value: "c"
            frequency: 3.0
          }
          avg_length: 1.0
          rank_histogram {
            buckets {
              low_rank: 0
              high_rank: 0
              label: "a"
              sample_count: 4.0
            }
            buckets {
              low_rank: 1
              high_rank: 1
              label: "c"
              sample_count: 3.0
            }
            buckets {
              low_rank: 2
              high_rank: 2
              label: "d"
              sample_count: 2.0
            }
          }
        }
      }
    }
    """, statistics_pb2.DatasetFeatureStatisticsList())

        with beam.Pipeline() as p:
            options = stats_options.StatsOptions(
                feature_whitelist=['b'],
                num_top_values=2,
                num_rank_histogram_buckets=3,
                num_values_histogram_buckets=3,
                num_histogram_buckets=3,
                num_quantiles_histogram_buckets=4)
            result = (p | beam.Create(examples)
                      | stats_api.GenerateStatistics(options))
            util.assert_that(
                result,
                test_util.make_dataset_feature_stats_list_proto_equal_fn(
                    self, expected_result))
# Example #26 (separator from the original code-sharing page; vote count dropped)
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - input_data: A list of type `standard_artifacts.Examples`. This
              should contain both 'train' and 'eval' split.
            - schema: Optionally, a list of type `standard_artifacts.Schema`.
              When the stats_options exec_property also contains a schema,
              this input should not be provided.
          output_dict: Output dict from output key to a list of Artifacts.
            - output: A list of type `standard_artifacts.ExampleStatistics`.
              This should contain both the 'train' and 'eval' splits.
          exec_properties: A dict of execution properties.
            - stats_options_json: Optionally, a JSON representation of
              StatsOptions. When a schema is provided as an input, the
              StatsOptions value should not also contain a schema.

        Raises:
          ValueError: When a schema is provided both as an input and as part
            of the StatsOptions exec_property.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Start from default options and overlay any JSON-serialized
        # StatsOptions passed through exec_properties.
        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        # A schema may come either from the schema input artifact or from the
        # deserialized stats options -- never from both at once.
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        # Collect one (split name, uri) pair per split of every input
        # Examples artifact.
        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        # One pipeline processes all splits; each split gets its own labeled
        # read/generate/write stages.
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                tfxio_kwargs = {'file_pattern': input_uri}
                # TODO(b/151624179): clean this up after tfx_bsl is released with the
                # below flag.
                if getattr(tfxio, 'TFXIO_HAS_TELEMETRY', False):
                    tfxio_kwargs[
                        'telemetry_descriptors'] = _TELEMETRY_DESCRIPTORS
                input_tfxio = tf_example_record.TFExampleRecord(**tfxio_kwargs)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                data = p | 'TFXIORead[{}]'.format(
                    split) >> input_tfxio.BeamSource()
                # TODO(b/153368237): Clean this up after a release post tfx 0.21.
                # Compat shim: older TFDV consumes pa.Table, not RecordBatch.
                if not getattr(tfdv, 'TFDV_ACCEPT_RECORD_BATCH', False):
                    data |= 'RecordBatchToTable[{}]'.format(split) >> beam.Map(
                        lambda rb: pa.Table.from_batches([rb]))
                # Write a single unsharded TFRecord file holding one
                # DatasetFeatureStatisticsList proto for this split.
                _ = (data
                     | 'GenerateStatistics[{}]'.format(split) >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput[{}]'.format(split) >>
                     beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
 def test_stats_pipeline_with_schema(self):
     """Tests stats generation with a schema marking feature 'a' categorical.

     The schema's int_domain sets is_categorical, so the integer feature
     'a' is expected to be summarized with string_stats (top values, rank
     histogram) rather than numeric stats.
     """
     # input with three examples.
     examples = [{
         'a': np.array([1, 3, 5, 7])
     }, {
         'a': np.array([2, 4, 6, 8])
     }, {
         'a': np.array([0, 3, 6, 9])
     }]
     # Schema that declares 'a' as a categorical INT feature.
     schema = text_format.Parse(
         """
     feature {
       name: "a"
       type: INT
       int_domain {
         is_categorical: true
       }
     }
     """, schema_pb2.Schema())
     expected_result = text_format.Parse(
         """
 datasets {
   num_examples: 3
   features {
     name: "a"
     type: INT
     string_stats {
       common_stats {
         num_non_missing: 3
         min_num_values: 4
         max_num_values: 4
         avg_num_values: 4.0
         tot_num_values: 12
         num_values_histogram {
           buckets {
             low_value: 4.0
             high_value: 4.0
             sample_count: 1.0
           }
           buckets {
             low_value: 4.0
             high_value: 4.0
             sample_count: 1.0
           }
           buckets {
             low_value: 4.0
             high_value: 4.0
             sample_count: 1.0
           }
           type: QUANTILES
         }
       }
       unique: 10
       top_values {
         value: "6"
         frequency: 2.0
       }
       top_values {
         value: "3"
         frequency: 2.0
       }
       avg_length: 1.0
       rank_histogram {
         buckets {
           low_rank: 0
           high_rank: 0
           label: "6"
           sample_count: 2.0
         }
         buckets {
           low_rank: 1
           high_rank: 1
           label: "3"
           sample_count: 2.0
         }
         buckets {
           low_rank: 2
           high_rank: 2
           label: "9"
           sample_count: 1.0
         }
       }
     }
   }
 }
 """, statistics_pb2.DatasetFeatureStatisticsList())
     with beam.Pipeline() as p:
         options = stats_options.StatsOptions(
             schema=schema,
             num_top_values=2,
             num_rank_histogram_buckets=3,
             num_values_histogram_buckets=3)
         result = (p | beam.Create(examples)
                   | stats_api.GenerateStatistics(options))
         util.assert_that(
             result,
             test_util.make_dataset_feature_stats_list_proto_equal_fn(
                 self, expected_result))
# Example #28 (separator from the original code-sharing page; vote count dropped)
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        """Computes stats for each split of input using tensorflow_data_validation.

        Each split is read with beam.io.ReadFromTFRecord, decoded as
        tf.Examples, run through GenerateStatistics, and written out as a
        single unsharded TFRecord file containing one
        DatasetFeatureStatisticsList proto.

        Args:
          input_dict: Input dict from input key to a list of Artifacts.
            - input_data: A list of type `standard_artifacts.Examples`. This
              should contain both 'train' and 'eval' split.
            - schema: Optionally, a list of type `standard_artifacts.Schema`.
              When the stats_options exec_property also contains a schema,
              this input should not be provided.
          output_dict: Output dict from output key to a list of Artifacts.
            - output: A list of type `standard_artifacts.ExampleStatistics`.
              This should contain both the 'train' and 'eval' splits.
          exec_properties: A dict of execution properties.
            - stats_options_json: Optionally, a JSON representation of
              StatsOptions. When a schema is provided as an input, the
              StatsOptions value should not also contain a schema.

        Raises:
          ValueError: When a schema is provided both as an input and as part
            of the StatsOptions exec_property.

        Returns:
          None
        """
        self._log_startup(input_dict, output_dict, exec_properties)

        # Start from default options and overlay any JSON-serialized
        # StatsOptions passed through exec_properties.
        stats_options = options.StatsOptions()
        if STATS_OPTIONS_JSON_KEY in exec_properties:
            stats_options_json = exec_properties[STATS_OPTIONS_JSON_KEY]
            if stats_options_json:
                # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
                # json_utils
                stats_options = options.StatsOptions.from_json(
                    stats_options_json)
        # A schema may come either from the schema input artifact or from the
        # deserialized stats options -- never from both at once.
        if input_dict.get(SCHEMA_KEY):
            if stats_options.schema:
                raise ValueError(
                    'A schema was provided as an input and the '
                    'stats_options exec_property also contains a schema '
                    'value. At most one of these may be set.')
            else:
                schema = io_utils.SchemaReader().read(
                    io_utils.get_only_uri_in_dir(
                        artifact_utils.get_single_uri(input_dict[SCHEMA_KEY])))
                stats_options.schema = schema

        # Collect one (split name, uri) pair per split of every input
        # Examples artifact.
        split_uris = []
        for artifact in input_dict[EXAMPLES_KEY]:
            for split in artifact_utils.decode_split_names(
                    artifact.split_names):
                uri = os.path.join(artifact.uri, split)
                split_uris.append((split, uri))
        # One pipeline processes all splits; each split gets its own labeled
        # read/decode/generate/write stages.
        with self._make_beam_pipeline() as p:
            for split, uri in split_uris:
                absl.logging.info(
                    'Generating statistics for split {}'.format(split))
                input_uri = io_utils.all_files_pattern(uri)
                output_uri = artifact_utils.get_split_uri(
                    output_dict[STATISTICS_KEY], split)
                output_path = os.path.join(output_uri, _DEFAULT_FILE_NAME)
                _ = (p
                     | 'ReadData.' + split >>
                     beam.io.ReadFromTFRecord(file_pattern=input_uri)
                     | 'DecodeData.' + split >>
                     tf_example_decoder.DecodeTFExample()
                     | 'GenerateStatistics.' + split >>
                     stats_api.GenerateStatistics(stats_options)
                     | 'WriteStatsOutput.' + split >> beam.io.WriteToTFRecord(
                         output_path,
                         shard_name_template='',
                         coder=beam.coders.ProtoCoder(
                             statistics_pb2.DatasetFeatureStatisticsList)))
                absl.logging.info(
                    'Statistics for split {} written to {}.'.format(
                        split, output_uri))
 def test_stats_pipeline_with_examples_with_no_values(self):
   """Tests stats generation when every feature value list is empty.

   Features 'a', 'b' and 'c' carry empty arrays in all three examples;
   only the weight feature 'w' has values. The expected stats therefore
   contain only common stats and weighted counts for each feature.
   """
   # NOTE: use builtin `object` and concrete `np.float64` dtypes. The
   # alias `np.object` was removed in NumPy 1.24 and the abstract
   # `np.floating` is deprecated as a dtype; both previously resolved to
   # the dtypes used below, so behavior is unchanged.
   examples = [{'a': np.array([], dtype=np.float64),
                'b': np.array([], dtype=object),
                'c': np.array([], dtype=np.int32),
                'w': np.array([2])},
               {'a': np.array([], dtype=np.float64),
                'b': np.array([], dtype=object),
                'c': np.array([], dtype=np.int32),
                'w': np.array([2])},
               {'a': np.array([], dtype=np.float64),
                'b': np.array([], dtype=object),
                'c': np.array([], dtype=np.int32),
                'w': np.array([2])}]
   expected_result = text_format.Parse(
       """
     datasets{
       num_examples: 3
       features {
         name: 'a'
         type: FLOAT
         num_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 1.5
               }
               buckets {
                 sample_count: 1.5
               }
               type: QUANTILES
             }
             weighted_common_stats {
               num_non_missing: 6
             }
           }
         }
       }
       features {
         name: 'b'
         type: STRING
         string_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 1.5
               }
               buckets {
                 sample_count: 1.5
               }
               type: QUANTILES
             }
             weighted_common_stats {
               num_non_missing: 6
             }
           }
         }
       }
       features {
         name: 'c'
         type: INT
         num_stats {
           common_stats {
             num_non_missing: 3
             num_values_histogram {
               buckets {
                 sample_count: 1.5
               }
               buckets {
                 sample_count: 1.5
               }
               type: QUANTILES
             }
             weighted_common_stats {
               num_non_missing: 6
             }
           }
         }
       }
     }
   """, statistics_pb2.DatasetFeatureStatisticsList())
   with beam.Pipeline() as p:
     options = stats_options.StatsOptions(
         weight_feature='w',
         num_top_values=1,
         num_rank_histogram_buckets=1,
         num_values_histogram_buckets=2,
         num_histogram_buckets=1,
         num_quantiles_histogram_buckets=1,
         epsilon=0.001)
     result = (
         p | beam.Create(examples) | stats_api.GenerateStatistics(options))
     util.assert_that(
         result,
         test_util.make_dataset_feature_stats_list_proto_equal_fn(
             self, expected_result))