def compute_stats(
    input_handle,
    stats_path,
    max_rows=None,
    for_eval=False,
    pipeline_args=None,
    publish_to_bq=None,
    metrics_dataset=None,
    metrics_table=None,
    project=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery.
    for_eval: Whether to query for the eval set rows from BigQuery.
    pipeline_args: Additional DataflowRunner or DirectRunner args passed to the
      Beam pipeline.
    publish_to_bq: Whether to publish metrics to BigQuery.
    metrics_dataset: BigQuery dataset in which metrics are published.
    metrics_table: BigQuery table in which metrics are published; also used as
      the metrics namespace.
    project: GCP project used for the BigQuery query and for publishing
      metrics.
  """
  namespace = metrics_table
  pipeline = beam.Pipeline(argv=pipeline_args)
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace),
    )

  query = taxi.make_sql(
      table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(namespace))
      | 'ConvertToTFDVInput' >> beam.Map(
          lambda x:
          {key: np.asarray([x[key]])
           for key in x if x[key] is not None}))

  _ = (
      raw_data
      | 'GenerateStatistics' >> tfdv.GenerateStatistics()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(namespace))
      | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
          stats_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList)))
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
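
# Usage sketch (not part of the original snippet): the table name, output path,
# and runner flag below are hypothetical placeholders, and the taxi helper
# module plus the Beam/TFDV imports are assumed to be available.
compute_stats(
    input_handle='chicago_taxi.taxi_trips',
    stats_path='/tmp/tfdv_output/stats.tfrecord',
    max_rows=10000,
    for_eval=False,
    pipeline_args=['--runner=DirectRunner'],
    publish_to_bq=False,
    project='my-gcp-project')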
Example #2
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')

    self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    self.metrics_monitor = MetricsReader(
        publish_to_bq=self.pipeline.get_option('publish_to_big_query') ==
        'true',
        project_name=self.project_id,
        bq_table=self.metrics_namespace,
        bq_dataset=self.metrics_dataset,
        # Apply filter to prevent system metrics from being published
        filters=MetricsFilter().with_namespace(self.metrics_namespace)
    )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    self.metrics_monitor.publish_metrics(result)

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
    option = self.pipeline.get_option(opt_name)
    try:
      return int(option)
    except TypeError:
      return default
    except ValueError as exc:
      self.fail(str(exc))
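
# Worked example (illustrative values only): given the JSON below passed via
# --input_options, the parseTestPipelineOptions method above returns a
# SyntheticSource-style spec in which unspecified fields fall back to their
# defaults.
#
#   input_options = {'num_records': 1000, 'key_size': 10, 'value_size': 90}
#   parseTestPipelineOptions(input_options) ==
#   {
#       'numRecords': 1000,
#       'keySizeBytes': 10,
#       'valueSizeBytes': 90,
#       'hotKeyFraction': 0,
#       'numHotKeys': 0,
#       'bundleSizeDistribution': {'type': 'const', 'param': 0},
#       'forceNumInitialBundles': 0
#   }
Example #3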
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')

    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
Example #4
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options

    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get(
                'bundle_size_distribution_type', 'const'
            ),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get(
            'force_initial_num_bundles', 0
        )
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))

    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()

    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
Example #5
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: Additional DataflowRunner or DirectRunner args passed to
      the Beam pipeline.
    publish_to_bq: Whether to publish metrics to BigQuery.
    project: GCP project used for the BigQuery query and for publishing
      metrics.
    metrics_dataset: BigQuery dataset in which metrics are published.
    metrics_table: BigQuery table in which metrics are published; also used as
      the metrics namespace.

  Raises:
    ValueError: if big_query_table is not specified.
  """

  if big_query_table is None:
    raise ValueError(
        '--big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace)
    )

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(query=query, project=project,
                                           use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >> beam.Map(lambda x: (
          taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)
  # Prepare arguments for Extract, Evaluate and Write steps
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)

  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)
  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors,
          evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(
          MeasureTime(metrics_namespace))
  )
  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
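
# Usage sketch (hypothetical paths, table name, and project): the eval model
# directory is expected to hold an eval saved model exported for TFMA, and the
# schema file is the same text-serialized Schema used elsewhere in these
# examples.
process_tfma(
    schema_file='/tmp/schema.pbtxt',
    big_query_table='chicago_taxi.taxi_trips',
    eval_model_dir='/tmp/eval_model_dir',
    max_eval_rows=10000,
    pipeline_args=['--runner=DirectRunner'],
    project='my-gcp-project')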
Example #6
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
    """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples.
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery.
    pipeline_args: Additional DataflowRunner or DirectRunner args passed to the
      Beam pipeline.
    publish_to_bq: Whether to publish metrics to BigQuery.
    project: GCP project used for the BigQuery query and for publishing
      metrics.
    metrics_table: BigQuery table in which metrics are published; also used as
      the metrics namespace.
    metrics_dataset: BigQuery dataset in which metrics are published.
  """
    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(
                key)] = transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE)

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                    tf.int64))

        return outputs

    namespace = metrics_table
    metrics_monitor = None
    if publish_to_bq:
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            namespace=namespace,
            filters=MetricsFilter().with_namespace(namespace))
    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    pipeline = beam.Pipeline(argv=pipeline_args)
    with tft_beam.Context(temp_dir=working_dir):
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> ReadFromBigQuery(
                query=query, project=project, use_standard_sql=True)
            | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
        decode_transform = beam.Map(taxi.clean_raw_data_dict,
                                    raw_feature_spec=raw_feature_spec)

        if transform_dir is None:
            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata) |
                ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

            _ = (
                transform_fn |
                ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
        else:
            transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

        # Shuffling the data before materialization will improve Training
        # effectiveness downstream. Here we shuffle the raw_data (as opposed to
        # decoded data) since it has a compact representation.
        shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle(
        )

        decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
        (transformed_data, transformed_metadata) = (
            ((decoded_data, raw_data_metadata), transform_fn)
            | 'Transform' >> tft_beam.TransformDataset())

        coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (transformed_data
             | 'SerializeExamples' >> beam.Map(coder.encode)
             | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
             | 'WriteExamples' >> beam.io.WriteToTFRecord(
                 os.path.join(working_dir, outfile_prefix),
                 file_name_suffix='.gz'))
    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
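
# The preprocessing_fn above calls a _fill_in_missing helper that is not shown
# in this snippet. A typical implementation (modeled on the TFX taxi example,
# included here only as a hedged sketch) densifies a [batch, 1] SparseTensor,
# filling missing entries with '' or 0 and squeezing away the second dimension.
def _fill_in_missing(x):
    """Replaces missing values in a SparseTensor with '' or 0."""
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse_to_dense(x.indices, [x.dense_shape[0], 1], x.values,
                           default_value),
        axis=1)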
Example #7
class LoadTest(unittest.TestCase):
    def parseTestPipelineOptions(self, options=None):
        if not options:
            options = self.input_options

        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'hotKeyFraction': options.get('hot_key_fraction', 0),
            'numHotKeys': options.get('num_hot_keys', 0),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': options.get('force_initial_num_bundles',
                                                  0)
        }

    def setUp(self):
        self.pipeline = TestPipeline()
        self.input_options = json.loads(
            self.pipeline.get_option('input_options'))
        self.project_id = self.pipeline.get_option('project')

        self.publish_to_big_query = self.pipeline.get_option(
            'publish_to_big_query')
        self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
        self.metrics_namespace = self.pipeline.get_option('metrics_table')

        if not self.are_metrics_collected():
            logging.info('Metrics will not be collected')
            self.metrics_monitor = None
        else:
            self.metrics_monitor = MetricsReader(
                project_name=self.project_id,
                bq_table=self.metrics_namespace,
                bq_dataset=self.metrics_dataset,
            )

    def tearDown(self):
        result = self.pipeline.run()
        result.wait_until_finish()

        if self.metrics_monitor:
            self.metrics_monitor.publish_metrics(result)

    def apply_filter(self, allowed):
        """Prevents metrics from namespaces other than specified in the argument
    from being published."""
        if allowed:
            self.metrics_monitor.filters = MetricsFilter().with_namespaces(
                allowed)

    def get_option_or_default(self, opt_name, default=0):
        """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
        option = self.pipeline.get_option(opt_name)
        try:
            return int(option)
        except TypeError:
            return default
        except ValueError as exc:
            self.fail(str(exc))

    def are_metrics_collected(self):
        return self.publish_to_big_query == 'true' and None not in (
            self.project_id, self.metrics_dataset, self.metrics_namespace)
Example #8
class LoadTest(object):
    """Base class for all integration and performance tests which export
  metrics to external databases: BigQuery and/or InfluxDB.

  Refer to :class:`~apache_beam.testing.load_tests.LoadTestOptions` for more
  information on the required pipeline options.

  If using InfluxDB with Basic HTTP authentication enabled, provide the
  following environment variables: `INFLUXDB_USER` and `INFLUXDB_USER_PASSWORD`.
  """
    def __init__(self, metrics_namespace=None):
        # Be sure to set blocking to false for timeout_ms to work properly
        self.pipeline = TestPipeline(is_integration_test=True, blocking=False)
        assert not self.pipeline.blocking

        options = self.pipeline.get_pipeline_options().view_as(LoadTestOptions)
        self.timeout_ms = options.timeout_ms
        self.input_options = options.input_options

        if metrics_namespace:
            self.metrics_namespace = metrics_namespace
        else:
            self.metrics_namespace = options.metrics_table \
              if options.metrics_table else 'default'

        publish_to_bq = options.publish_to_big_query
        if publish_to_bq is None:
            logging.info(
                'Missing --publish_to_big_query option. Metrics will not '
                'be published to BigQuery.')
        if options.input_options is None:
            logging.error('--input_options argument is required.')
            sys.exit(1)

        gcloud_options = self.pipeline.get_pipeline_options().view_as(
            GoogleCloudOptions)
        self.project_id = gcloud_options.project

        self._metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=self.project_id,
            bq_table=options.metrics_table,
            bq_dataset=options.metrics_dataset,
            namespace=self.metrics_namespace,
            influxdb_options=InfluxDBMetricsPublisherOptions(
                options.influx_measurement, options.influx_db_name,
                options.influx_hostname, os.getenv('INFLUXDB_USER'),
                os.getenv('INFLUXDB_USER_PASSWORD')),
            # Apply filter to prevent system metrics from being published
            filters=MetricsFilter().with_namespace(self.metrics_namespace))

    def test(self):
        """An abstract method where the pipeline definition should be put."""
        pass

    def cleanup(self):
        """An abstract method that executes after the test method."""
        pass

    def run(self):
        try:
            self.test()
            if not hasattr(self, 'result'):
                self.result = self.pipeline.run()
                # Defaults to waiting forever, unless timeout_ms has been set
                self.result.wait_until_finish(duration=self.timeout_ms)
            self._metrics_monitor.publish_metrics(self.result)
        finally:
            self.cleanup()

    def parse_synthetic_source_options(self, options=None):
        if not options:
            options = self.input_options
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'hotKeyFraction': options.get('hot_key_fraction', 0),
            'numHotKeys': options.get('num_hot_keys', 0),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': options.get('force_initial_num_bundles',
                                                  0)
        }

    def get_option_or_default(self, opt_name, default=0):
        """Returns a testing option or a default value if it was not provided.

    The returned value is cast to the type of the default value.
    """
        option = self.pipeline.get_option(opt_name,
                                          bool_option=type(default) == bool)
        if option is None:
            return default
        return type(default)(option)
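
# A hedged sketch of a concrete test built on the LoadTest base class above.
# SyntheticSource and MeasureTime are assumed to come from Beam's testing
# utilities (apache_beam.testing.synthetic_pipeline and
# apache_beam.testing.load_tests.load_test_metrics_utils); the transform chain
# is illustrative, not one of the real Beam load tests.
import apache_beam as beam
from apache_beam.testing.load_tests.load_test_metrics_utils import MeasureTime
from apache_beam.testing.synthetic_pipeline import SyntheticSource


class ExampleParDoLoadTest(LoadTest):
    def test(self):
        _ = (
            self.pipeline
            | 'Read synthetic input' >> beam.io.Read(
                SyntheticSource(self.parse_synthetic_source_options()))
            | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace)))


if __name__ == '__main__':
    ExampleParDoLoadTest().run()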
Example #9
class LoadTest(object):
    def __init__(self):
        self.pipeline = TestPipeline(is_integration_test=True)

        load_test_options = self.pipeline.get_pipeline_options().view_as(
            LoadTestOptions)
        self.input_options = load_test_options.input_options
        self.metrics_namespace = load_test_options.metrics_table or 'default'
        publish_to_bq = load_test_options.publish_to_big_query
        if publish_to_bq is None:
            logging.info(
                'Missing --publish_to_big_query option. Metrics will not '
                'be published to BigQuery.')
        if load_test_options.input_options is None:
            logging.error('--input_options argument is required.')
            sys.exit(1)

        gcloud_options = self.pipeline.get_pipeline_options().view_as(
            GoogleCloudOptions)
        self.project_id = gcloud_options.project

        self._metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=self.project_id,
            bq_table=load_test_options.metrics_table,
            bq_dataset=load_test_options.metrics_dataset,
            # Apply filter to prevent system metrics from being published
            filters=MetricsFilter().with_namespace(self.metrics_namespace))

    def test(self):
        """An abstract method where the pipeline definition should be put."""
        pass

    def cleanup(self):
        """An abstract method that executes after the test method."""
        pass

    def run(self):
        try:
            self.test()
            if not hasattr(self, 'result'):
                self.result = self.pipeline.run()
                self.result.wait_until_finish()
            self._metrics_monitor.publish_metrics(self.result)
        finally:
            self.cleanup()

    def parse_synthetic_source_options(self, options=None):
        if not options:
            options = self.input_options
        return {
            'numRecords': options.get('num_records'),
            'keySizeBytes': options.get('key_size'),
            'valueSizeBytes': options.get('value_size'),
            'hotKeyFraction': options.get('hot_key_fraction', 0),
            'numHotKeys': options.get('num_hot_keys', 0),
            'bundleSizeDistribution': {
                'type': options.get('bundle_size_distribution_type', 'const'),
                'param': options.get('bundle_size_distribution_param', 0)
            },
            'forceNumInitialBundles': options.get('force_initial_num_bundles',
                                                  0)
        }

    def get_option_or_default(self, opt_name, default=0):
        """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
        option = self.pipeline.get_option(opt_name)
        try:
            return int(option)
        except TypeError:
            return default