Example #1
    def test_batch_examples(self):
        examples = [{
            'a': np.array([1.0, 2.0], dtype=np.float32),
            'b': np.array(['a', 'b', 'c', 'e'])
        }, {
            'a': np.array([3.0, 4.0, 5.0], dtype=np.float32),
        }, {
            'b': np.array(['d', 'e', 'f']),
            'd': np.array([10, 20, 30], dtype=np.int64),
        }, {
            'b': np.array(['a', 'b', 'c'])
        }, {
            'c': np.array(['d', 'e', 'f'])
        }]
        expected_tables = [
            pa.Table.from_arrays([
                pa.array([[1.0, 2.0], [3.0, 4.0, 5.0]],
                         type=pa.list_(pa.float32())),
                pa.array([['a', 'b', 'c', 'e'], None])
            ], ['a', 'b']),
            pa.Table.from_arrays([
                pa.array([['d', 'e', 'f'], ['a', 'b', 'c']]),
                pa.array([[10, 20, 30], None], type=pa.list_(pa.int64()))
            ], ['b', 'd']),
            pa.Table.from_arrays([pa.array([['d', 'e', 'f']])], ['c']),
        ]

        with beam.Pipeline() as p:
            result = (
                p
                | beam.Create(examples)
                | batch_util.BatchExamplesToArrowTables(desired_batch_size=2))
            util.assert_that(
                result,
                test_util.make_arrow_tables_equal_fn(self, expected_tables))
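The test above illustrates the batching contract: with desired_batch_size=2, five input dicts become three Arrow tables, and a feature missing from one example in a batch shows up as a null row. A minimal standalone sketch of the same transform, assuming (as in TFDV releases of this era) that batch_util is importable from tensorflow_data_validation.utils:

import apache_beam as beam
import numpy as np
from tensorflow_data_validation.utils import batch_util  # assumed import path

examples = [
    {'x': np.array([1.0], dtype=np.float32)},
    {'x': np.array([2.0, 3.0], dtype=np.float32)},
    {'y': np.array(['a'])},
]

with beam.Pipeline() as p:
    _ = (p
         | beam.Create(examples)
         | batch_util.BatchExamplesToArrowTables(desired_batch_size=2)
         # Each element is a pyarrow.Table with at most 2 rows.
         | beam.Map(print))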
Example #2
  def _ComputeTFDVStats(pcollection: beam.pvalue.PCollection,
                        schema: schema_pb2.Schema) -> beam.pvalue.PCollection:
    """Cmoputes Statistics with TFDV.

    Args:
      pcollection: A PCollection of examples.
      schema: A Schema proto describing the examples.

    Returns:
      PCollection of `DatasetFeatureStatisticsList`.
    """
    feature_specs_from_schema = schema_utils.schema_as_feature_spec(
        schema).feature_spec

    def EncodeTFDV(element, feature_specs):
      """Encodes element in an in-memory format that TFDV expects."""
      if _TRANSFORM_INTERNAL_FEATURE_FOR_KEY not in element:
        raise ValueError(
            'Expected _TRANSFORM_INTERNAL_FEATURE_FOR_KEY ({}) to exist in the '
            'input but not found.'.format(_TRANSFORM_INTERNAL_FEATURE_FOR_KEY))

      # TODO(b/123549935): Obviate the numpy array conversions by
      # allowing TFDV to accept primitives in general, and TFT's
      # input/output format in particular.
      result = {}
      for feature_name, feature_spec in six.iteritems(feature_specs):
        feature_value = element.get(feature_name)
        if feature_value is None:
          result[feature_name] = None
        elif isinstance(feature_value, (np.ndarray, list)):
          result[feature_name] = np.asarray(
              feature_value, feature_spec.dtype.as_numpy_dtype)
        else:
          result[feature_name] = np.asarray(
              [feature_value], dtype=feature_spec.dtype.as_numpy_dtype)

      return result

    result = (pcollection
              # TODO(kestert): Remove encoding and batching steps once TFT
              # supports Arrow tables.
              | 'EncodeTFDV' >> beam.Map(
                  EncodeTFDV, feature_specs=feature_specs_from_schema))

    # TODO(pachristopher): Remove this once TFDV 0.14 is released.
    (major, minor, _) = tfdv.__version__.split('.')
    if int(major) > 0 or int(minor) >= 14:
      result |= ('BatchExamplesToArrowTables' >>
                 batch_util.BatchExamplesToArrowTables())

    return (result
            | 'ComputeFeatureStatisticsTFDV' >> tfdv.GenerateStatistics(
                tfdv.StatsOptions(schema=schema)))
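For illustration, a minimal sketch of what the EncodeTFDV helper above produces for one element; the feature names and specs here are hypothetical:

import numpy as np
import tensorflow as tf

# Hypothetical specs and element, exercising the three branches above.
feature_specs = {
    'f1': tf.io.FixedLenFeature([], tf.float32),
    'f2': tf.io.VarLenFeature(tf.float32),
    'f3': tf.io.FixedLenFeature([], tf.float32),
}
element = {'f1': 1.5, 'f2': [2.0, 3.0]}  # 'f3' is absent

# Applying the same rules as EncodeTFDV:
#   missing feature   -> None
#   list / np.ndarray -> np.asarray(value, spec dtype)
#   bare scalar       -> np.asarray([value], spec dtype)
# yields:
#   {'f1': np.array([1.5], dtype=np.float32),
#    'f2': np.array([2.0, 3.0], dtype=np.float32),
#    'f3': None}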
Example #3
def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None):
    """Computes statistics on the input data.

    Args:
      input_handle: BigQuery table name to process, specified as DATASET.TABLE,
        or path to a csv file with input data.
      stats_path: Directory in which stats are materialized.
      max_rows: Number of rows to query from BigQuery.
      for_eval: Query for eval set rows from BigQuery.
      pipeline_args: Additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """

    with beam.Pipeline(argv=pipeline_args) as pipeline:
        if input_handle.lower().endswith('csv'):
            raw_data = (pipeline
                        | 'ReadData' >> beam.io.textio.ReadFromText(
                            file_pattern=input_handle, skip_header_lines=1)
                        | 'DecodeData' >> csv_decoder.DecodeCSV(
                            column_names=taxi.CSV_COLUMN_NAMES))
        else:
            query = taxi.make_sql(table_name=input_handle,
                                  max_rows=max_rows,
                                  for_eval=for_eval)
            raw_data = (
                pipeline
                | 'ReadBigQuery' >> beam.io.Read(
                    beam.io.BigQuerySource(query=query, use_standard_sql=True))
                | 'ConvertToTFDVInput' >> beam.Map(
                    lambda x: {
                        key: np.asarray([x[key]])  # pylint: disable=g-long-lambda
                        for key in x if x[key] is not None
                    }))
            # TODO(pachristopher): Remove this once TFDV 0.14 is released.
            (major, minor, _) = tfdv.__version__.split('.')
            if int(major) > 0 or int(minor) >= 14:
                raw_data |= ('BatchExamplesToArrowTables' >>
                             batch_util.BatchExamplesToArrowTables())

        _ = (raw_data
             | 'GenerateStatistics' >> tfdv.GenerateStatistics()
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 stats_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))
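A hypothetical invocation of compute_stats; the input and output paths are placeholders:

compute_stats(
    input_handle='/tmp/taxi/train.csv',       # or a BigQuery 'DATASET.TABLE'
    stats_path='/tmp/taxi/train_stats.tfrecord',
    pipeline_args=['--runner=DirectRunner'])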
Example #4
    def expand(self, lines):
        """Decodes the input CSV records into Arrow tables.

        Args:
          lines: A PCollection of strings representing the lines in the CSV
            file.

        Returns:
          A PCollection of Arrow tables.
        """
        return (lines
                | 'DecodeCSVToDict' >> DecodeCSVToDict(
                    column_names=self._column_names,
                    delimiter=self._delimiter,
                    skip_blank_lines=self._skip_blank_lines,
                    schema=self._schema,
                    infer_type_from_schema=self._infer_type_from_schema)
                | 'BatchExamplesToArrowTables' >>
                batch_util.BatchExamplesToArrowTables(
                    desired_batch_size=self._desired_batch_size))
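A hypothetical usage sketch; the enclosing transform is assumed here to be a DecodeCSV-style PTransform whose constructor takes the attributes referenced above (column_names, delimiter, and so on):

import apache_beam as beam

with beam.Pipeline() as p:
    tables = (p
              | 'ReadLines' >> beam.io.ReadFromText(
                  'data.csv', skip_header_lines=1)  # hypothetical input file
              | 'DecodeCSV' >> DecodeCSV(column_names=['f1', 'f2']))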
Example #5
def DecodeTFExample(
    examples,
    desired_batch_size=constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE):  # pylint: disable=invalid-name
    """Decodes serialized TF examples into an Arrow table.

  Args:
    examples: A PCollection of strings representing serialized TF examples.
    desired_batch_size: Batch size. The output Arrow tables will have as many
      rows as the `desired_batch_size`.

  Returns:
    A PCollection of Arrow tables.
  """
    decoder = TFExampleDecoder()
    return (examples
            | 'ParseTFExamples' >> beam.Map(decoder.decode)
            | 'BatchExamplesToArrowTables' >>
            batch_util.BatchExamplesToArrowTables(
                desired_batch_size=desired_batch_size))
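The trailing pylint: disable=invalid-name hint suggests this function is wrapped with @beam.ptransform_fn, so the examples argument is bound to the input PCollection. Under that assumption, it composes like any other transform:

import apache_beam as beam

with beam.Pipeline() as p:
    tables = (p
              | 'ReadExamples' >> beam.io.ReadFromTFRecord(
                  'examples.tfrecord')  # hypothetical file of serialized tf.Examples
              | 'DecodeTFExamples' >> DecodeTFExample())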
Example #6
def generate_drift_reports(
        request_response_log_table: str,
        instance_type: InstanceType,
        feature_names: List[str],
        start_time: datetime.datetime,
        end_time: datetime.datetime,
        output_path: GCSPath,
        schema: schema_pb2.Schema,
        baseline_stats: statistics_pb2.DatasetFeatureStatisticsList,
        stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
        pipeline_options: Optional[PipelineOptions] = None,
):
    """Computes statistics and anomalies for a time window in AI Platform Prediction
    request-response log.
  
    Args:
      request_response_log_table: A full name of a BigQuery table
        with the request_response_log
      instance_type: The type of instances logged in the request_response_log_table.
        Currently, the only supported instance types are: a simple list (InstanceType.SIMPLE_LIST)
        and a JSON object (InstanceType(JSON_OBJECT))
      feature_names: A list of feature names. Must be provided if the instance_type is
        InstanceType(SIMPLE_LIST)
      start_time: The beginning of a time window.
      end_time: The end of a time window.
      output_path: The GCS location to output the statistics and anomalies
        proto buffers to. The file names will be `stats.pb` and `anomalies.pbtxt`. 
      schema: A Schema protobuf describing the expected schema.
      stats_options: `tfdv.StatsOptions` for generating data statistics.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.
    """

    query = _generate_query(request_response_log_table, start_time, end_time)
    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    with beam.Pipeline(options=pipeline_options) as p:
        raw_examples = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True)))

        if instance_type == InstanceType.SIMPLE_LIST:
            examples = (
                raw_examples
                | 'SimpleInstancesToBeamExamples' >> beam.ParDo(
                    SimpleListCoder(feature_names)))
        elif instance_type == InstanceType.JSON_OBJECT:
            examples = (
                raw_examples
                | 'JSONObjectInstancesToBeamExamples' >> beam.ParDo(
                    JSONObjectCoder()))
        else:
            raise TypeError('Unsupported instance type')

        stats = (
            examples
            | 'BeamExamplesToArrow' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics(stats_options))

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

        _ = (stats
             | 'ValidateStatistics' >> beam.Map(
                 tfdv.validate_statistics, schema=schema)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))
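A hypothetical call; the table name, bucket path, and time window are placeholders, and schema and baseline_stats are assumed to be loaded elsewhere:

import datetime

generate_drift_reports(
    request_response_log_table='my_project.logging.request_response_log',
    instance_type=InstanceType.JSON_OBJECT,
    feature_names=[],                        # only needed for SIMPLE_LIST
    start_time=datetime.datetime(2020, 6, 1),
    end_time=datetime.datetime(2020, 6, 2),
    output_path='gs://my-bucket/drift_reports',
    schema=schema,
    baseline_stats=baseline_stats)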
Example #7
def generate_statistics_from_bq(
    query: Text,
    output_path: Text,
    schema: schema_pb2.Schema,
    stats_options: stats_options.StatsOptions = stats_options.StatsOptions(),
    pipeline_options: Optional[PipelineOptions] = None,
) -> statistics_pb2.DatasetFeatureStatisticsList:
    """Computes data statistics from a BigQuery query result.
  
    Args:
      query: The BigQuery query.
      output_path: The file path to output data statistics result to. 
        It will be a TFRecord file containing a single
        data statistics proto, and can be read with the 'load_statistics' API.
        If you run this function on Google Cloud, you must specify an
        output_path. Specifying None may cause an error.
      schema: A Schema protobuf to use for data validation
      stats_options: `tfdv.StatsOptions` for generating data statistics.
      pipeline_options: Optional beam pipeline options. This allows users to
        specify various beam pipeline execution parameters like pipeline runner
        (DirectRunner or DataflowRunner), cloud dataflow service project id, etc.
        See https://cloud.google.com/dataflow/pipelines/specifying-exec-params for
        more details.
    Returns:
      A DatasetFeatureStatisticsList proto.
    """

    column_specs = _get_column_specs(query)
    if not validate_bq_types(column_specs.values()):
        raise ValueError('Unsupported BigQuery data types.')

    batch_size = (stats_options.desired_batch_size
                  if stats_options.desired_batch_size
                  and stats_options.desired_batch_size > 0 else
                  tfdv.constants.DEFAULT_DESIRED_INPUT_BATCH_SIZE)
    # PyLint doesn't understand Beam PTransforms.
    # pylint: disable=no-value-for-parameter

    stats_output_path = os.path.join(output_path, _STATS_FILENAME)
    anomalies_output_path = os.path.join(output_path, _ANOMALIES_FILENAME)

    with beam.Pipeline(options=pipeline_options) as p:
        stats = (
            p
            | 'GetData' >> beam.io.Read(
                beam.io.BigQuerySource(query=query, use_standard_sql=True))
            # | 'DecodeData' >> DecodeBigQuery(column_specs,
            #                                  desired_batch_size=batch_size)
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics())

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))
        _ = (stats
             | 'ValidateStatistics' >> beam.Map(tfdv.validate_statistics,
                                                schema=schema)
             | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
                 file_path_prefix=anomalies_output_path,
                 shard_name_template='',
                 append_trailing_newlines=False))

    # Read back the statistics written above so the function returns the
    # DatasetFeatureStatisticsList promised by its signature.
    return tfdv.load_statistics(stats_output_path)
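A hypothetical invocation; the query and bucket path are placeholders, and schema is a schema_pb2.Schema loaded elsewhere:

stats = generate_statistics_from_bq(
    query='SELECT * FROM `my_project.my_dataset.my_table`',
    output_path='gs://my-bucket/validation',
    schema=schema)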
Example #8
    }, {
        'f1': [3],
        'f3': ['bbb'],
        'f4': [1]
    }]

    with beam.Pipeline(options=pipeline_options) as p:
        stats = (
            p
            | 'GetData' >> beam.Create(instances)
            # | 'BatchDictionaries' >> beam.BatchElements(
            #       min_batch_size=desired_batch_size,
            #       max_batch_size=desired_batch_size)
            # | 'ConvertToArrowTables' >> beam.ParDo(
            #       BatchedDictsToArrowTable())
            | 'DecodeExamples' >> batch_util.BatchExamplesToArrowTables()
            | 'GenerateStatistics' >> tfdv.GenerateStatistics())

        _ = (stats
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 file_path_prefix=stats_output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))

    # _ = (stats
    #      | 'ValidateStatistics' >> beam.Map(
    #          tfdv.validate_statistics, schema=schema)
    #      | 'WriteAnomaliesOutput' >> beam.io.textio.WriteToText(
    #          file_path_prefix=anomalies_output_path,
    #          shard_name_template='',
    #          append_trailing_newlines=True))