def compute_stats(input_handle,
                  stats_path,
                  max_rows=None,
                  for_eval=False,
                  pipeline_args=None,
                  publish_to_bq=None,
                  metrics_dataset=None,
                  metrics_table=None,
                  project=None):
  """Computes statistics on the input data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    stats_path: Directory in which stats are materialized.
    max_rows: Number of rows to query from BigQuery.
    for_eval: Query for eval set rows from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: Whether to publish benchmark metrics to BigQuery.
    metrics_dataset: BigQuery dataset in which metrics are stored.
    metrics_table: BigQuery table in which metrics are stored; also used as the
      metrics namespace.
    project: GCP project that owns the input data and the metrics tables.
  """
  namespace = metrics_table
  pipeline = beam.Pipeline(argv=pipeline_args)

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace),
    )

  query = taxi.make_sql(
      table_name=input_handle, max_rows=max_rows, for_eval=for_eval)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(namespace))
      | 'ConvertToTFDVInput' >> beam.Map(
          lambda x:
          {key: np.asarray([x[key]]) for key in x if x[key] is not None}))

  _ = (
      raw_data
      | 'GenerateStatistics' >> tfdv.GenerateStatistics()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(namespace))
      | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
          stats_path,
          shard_name_template='',
          coder=beam.coders.ProtoCoder(
              statistics_pb2.DatasetFeatureStatisticsList)))

  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
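# compute_stats above tags the pipeline with MeasureTime(namespace), a DoFn
# defined elsewhere in this codebase. As a rough, hypothetical sketch (an
# assumption, not the module's actual helper), such a DoFn could record
# wall-clock timestamps into a Beam distribution metric:
import time

import apache_beam as beam
from apache_beam.metrics.metric import Metrics


class MeasureTimeSketch(beam.DoFn):
  """Hypothetical timing DoFn: records bundle start/end times as a metric."""

  def __init__(self, namespace):
    self.namespace = namespace
    self.runtime = Metrics.distribution(self.namespace, 'runtime')

  def start_bundle(self):
    # Record when processing of a bundle begins.
    self.runtime.update(time.time())

  def finish_bundle(self):
    # Record when processing of a bundle ends.
    self.runtime.update(time.time())

  def process(self, element):
    # Pass elements through unchanged; this DoFn only measures time.
    yield element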
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')
    self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    self.metrics_monitor = MetricsReader(
        publish_to_bq=self.pipeline.get_option('publish_to_big_query') ==
        'true',
        project_name=self.project_id,
        bq_table=self.metrics_namespace,
        bq_dataset=self.metrics_dataset,
        # Apply filter to prevent system metrics from being published.
        filters=MetricsFilter().with_namespace(self.metrics_namespace))

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()
    self.metrics_monitor.publish_metrics(result)

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
    option = self.pipeline.get_option(opt_name)
    try:
      return int(option)
    except TypeError:
      return default
    except ValueError as exc:
      self.fail(str(exc))
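# A hypothetical example of how a concrete test might build on the base class
# above: it reads records from Beam's synthetic source using the parsed
# input_options and groups them by key. The class and step names are
# illustrative only; the pipeline is run (and metrics are published) by the
# base class's tearDown.
import apache_beam as beam
from apache_beam.testing.synthetic_pipeline import SyntheticSource


class GroupByKeyLoadTestSketch(LoadTest):
  def testGroupByKey(self):
    (
        self.pipeline
        | 'Read synthetic' >> beam.io.Read(
            SyntheticSource(self.parseTestPipelineOptions()))
        | 'GroupByKey' >> beam.GroupByKey()
        | 'Ungroup' >> beam.FlatMap(lambda kv: [(kv[0], v) for v in kv[1]]))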
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')
    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if not self.publish_to_big_query or self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if not self.publish_to_big_query or self.publish_to_big_query != 'true':
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.pipeline.get_option('project'),
          bq_table=self.metrics_namespace,
          bq_dataset=self.pipeline.get_option('metrics_dataset'),
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)
def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
  """Runs a batch job to evaluate the eval_model against the given input.

  Args:
    schema_file: A file containing a text-serialized Schema that describes the
      eval data.
    big_query_table: A BigQuery table name specified as DATASET.TABLE which
      should be the input for evaluation. This can only be set if input_csv is
      None.
    eval_model_dir: A directory where the eval model is located.
    max_eval_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: Whether to publish benchmark metrics to BigQuery.
    project: GCP project that owns the input data and the metrics tables.
    metrics_dataset: BigQuery dataset in which metrics are stored.
    metrics_table: BigQuery table in which metrics are stored; also used as the
      metrics namespace.

  Raises:
    ValueError: if input_csv and big_query_table are not specified correctly.
  """
  if big_query_table is None:
    raise ValueError('--big_query_table should be provided.')

  slice_spec = [
      tfma.slicer.SingleSliceSpec(),
      tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
  ]
  metrics_namespace = metrics_table

  schema = taxi.read_schema(schema_file)

  eval_shared_model = tfma.default_eval_shared_model(
      eval_saved_model_path=eval_model_dir,
      add_metrics_callbacks=[
          tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
          tfma.post_export_metrics.auc_plots()
      ])

  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        filters=MetricsFilter().with_namespace(metrics_namespace))

  pipeline = beam.Pipeline(argv=pipeline_args)

  query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_data = (
      pipeline
      | 'ReadBigQuery' >> ReadFromBigQuery(
          query=query, project=project, use_standard_sql=True)
      | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
      | 'CleanData' >>
      beam.Map(lambda x: (taxi.clean_raw_data_dict(x, raw_feature_spec))))

  # Examples must be in clean tf-example format.
  coder = taxi.make_proto_coder(schema)
  # Prepare arguments for the Extract, Evaluate and Write steps.
  extractors = tfma.default_extractors(
      eval_shared_model=eval_shared_model,
      slice_spec=slice_spec,
      desired_batch_size=None,
      materialize=False)
  evaluators = tfma.default_evaluators(
      eval_shared_model=eval_shared_model,
      desired_batch_size=None,
      num_bootstrap_samples=1)

  _ = (
      raw_data
      | 'ToSerializedTFExample' >> beam.Map(coder.encode)
      | 'Extract Results' >> tfma.InputsToExtracts()
      | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
          extractors=extractors, evaluators=evaluators)
      | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
      | 'Measure time: End' >> beam.ParDo(MeasureTime(metrics_namespace)))

  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
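# A hypothetical invocation sketch for process_tfma (not part of the original
# module): the flag names and defaults below are assumptions chosen for
# illustration. Unrecognized flags are forwarded to Beam as pipeline_args via
# parse_known_args.
import argparse


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('--schema_file', required=True)
  parser.add_argument('--big_query_table', required=True)
  parser.add_argument('--eval_model_dir', required=True)
  parser.add_argument('--max_eval_rows', type=int, default=None)
  parser.add_argument('--publish_to_bq', action='store_true')
  parser.add_argument('--project', default=None)
  parser.add_argument('--metrics_table', default=None)
  parser.add_argument('--metrics_dataset', default=None)
  known_args, pipeline_args = parser.parse_known_args()

  process_tfma(
      schema_file=known_args.schema_file,
      big_query_table=known_args.big_query_table,
      eval_model_dir=known_args.eval_model_dir,
      max_eval_rows=known_args.max_eval_rows,
      pipeline_args=pipeline_args,
      publish_to_bq=known_args.publish_to_bq,
      project=known_args.project,
      metrics_table=known_args.metrics_table,
      metrics_dataset=known_args.metrics_dataset)


if __name__ == '__main__':
  main()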
def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
  """The main tf.transform method which analyzes and transforms data.

  Args:
    input_handle: BigQuery table name to process specified as DATASET.TABLE or
      path to csv file with input data.
    outfile_prefix: Filename prefix for emitted transformed examples.
    working_dir: Directory in which transformed examples and transform function
      will be emitted.
    schema_file: A file path that contains a text-serialized TensorFlow
      metadata schema of the input data.
    transform_dir: Directory in which the transform output is located. If
      provided, this will load the transform_fn from disk instead of computing
      it over the data. Hint: this is useful for transforming eval data.
    max_rows: Number of rows to query from BigQuery.
    pipeline_args: additional DataflowRunner or DirectRunner args passed to the
      beam pipeline.
    publish_to_bq: Whether to publish benchmark metrics to BigQuery.
    project: GCP project that owns the input data and the metrics tables.
    metrics_table: BigQuery table in which metrics are stored; also used as the
      metrics namespace.
    metrics_dataset: BigQuery dataset in which metrics are stored.
  """

  def preprocessing_fn(inputs):
    """tf.transform's callback function for preprocessing inputs.

    Args:
      inputs: map from feature keys to raw not-yet-transformed features.

    Returns:
      Map from string feature key to transformed feature operations.
    """
    outputs = {}
    for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
      # Preserve this feature as a dense float, setting nan's to the mean.
      outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
          _fill_in_missing(inputs[key]))

    for key in taxi.VOCAB_FEATURE_KEYS:
      # Build a vocabulary for this feature.
      outputs[taxi.transformed_name(
          key)] = transform.compute_and_apply_vocabulary(
              _fill_in_missing(inputs[key]),
              top_k=taxi.VOCAB_SIZE,
              num_oov_buckets=taxi.OOV_SIZE)

    for key in taxi.BUCKET_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = transform.bucketize(
          _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

    for key in taxi.CATEGORICAL_FEATURE_KEYS:
      outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

    # Was this passenger a big tipper?
    taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
    tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
    outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
        tf.is_nan(taxi_fare),
        tf.cast(tf.zeros_like(taxi_fare), tf.int64),
        # Test if the tip was > 20% of the fare.
        tf.cast(
            tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
            tf.int64))

    return outputs

  namespace = metrics_table
  metrics_monitor = None
  if publish_to_bq:
    metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=project,
        bq_table=metrics_table,
        bq_dataset=metrics_dataset,
        namespace=namespace,
        filters=MetricsFilter().with_namespace(namespace))

  schema = taxi.read_schema(schema_file)
  raw_feature_spec = taxi.get_raw_feature_spec(schema)
  raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
  raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

  pipeline = beam.Pipeline(argv=pipeline_args)
  with tft_beam.Context(temp_dir=working_dir):
    query = taxi.make_sql(input_handle, max_rows, for_eval=False)
    raw_data = (
        pipeline
        | 'ReadBigQuery' >> ReadFromBigQuery(
            query=query, project=project, use_standard_sql=True)
        | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
    decode_transform = beam.Map(
        taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

    if transform_dir is None:
      decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
      transform_fn = (
          (decoded_data, raw_data_metadata)
          | ('Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn)))

      _ = (
          transform_fn
          | ('WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir)))
    else:
      transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

    # Shuffling the data before materialization will improve training
    # effectiveness downstream. Here we shuffle the raw_data (as opposed to
    # decoded data) since it has a compact representation.
    shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()

    decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
    (transformed_data, transformed_metadata) = (
        ((decoded_data, raw_data_metadata), transform_fn)
        | 'Transform' >> tft_beam.TransformDataset())

    coder = example_proto_coder.ExampleProtoCoder(transformed_metadata.schema)
    _ = (
        transformed_data
        | 'SerializeExamples' >> beam.Map(coder.encode)
        | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
        | 'WriteExamples' >> beam.io.WriteToTFRecord(
            os.path.join(working_dir, outfile_prefix), file_name_suffix='.gz'))

  result = pipeline.run()
  result.wait_until_finish()
  if metrics_monitor:
    metrics_monitor.publish_metrics(result)
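# preprocessing_fn above relies on a _fill_in_missing helper that is not shown
# in this excerpt. A plausible sketch (an assumption, not the module's actual
# helper) is a function that converts a SparseTensor with at most one value per
# row into a dense tensor, substituting a default for missing rows:
import tensorflow as tf


def _fill_in_missing_sketch(x):
  """Replaces missing values in a SparseTensor and converts it to dense.

  Assumes `x` has dense shape [None, 1] with at most one value per row; missing
  rows get '' for string features and 0 otherwise.
  """
  default_value = '' if x.dtype == tf.string else 0
  return tf.squeeze(
      tf.sparse.to_dense(
          tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
          default_value),
      axis=1)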
class LoadTest(unittest.TestCase):
  def parseTestPipelineOptions(self, options=None):
    if not options:
      options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def setUp(self):
    self.pipeline = TestPipeline()
    self.input_options = json.loads(self.pipeline.get_option('input_options'))
    self.project_id = self.pipeline.get_option('project')
    self.publish_to_big_query = self.pipeline.get_option('publish_to_big_query')
    self.metrics_dataset = self.pipeline.get_option('metrics_dataset')
    self.metrics_namespace = self.pipeline.get_option('metrics_table')

    if not self.are_metrics_collected():
      logging.info('Metrics will not be collected')
      self.metrics_monitor = None
    else:
      self.metrics_monitor = MetricsReader(
          project_name=self.project_id,
          bq_table=self.metrics_namespace,
          bq_dataset=self.metrics_dataset,
      )

  def tearDown(self):
    result = self.pipeline.run()
    result.wait_until_finish()
    if self.metrics_monitor:
      self.metrics_monitor.publish_metrics(result)

  def apply_filter(self, allowed):
    """Prevents metrics from namespaces other than those specified in the
    argument from being published."""
    if allowed:
      self.metrics_monitor.filters = MetricsFilter().with_namespaces(allowed)

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
    option = self.pipeline.get_option(opt_name)
    try:
      return int(option)
    except TypeError:
      return default
    except ValueError as exc:
      self.fail(str(exc))

  def are_metrics_collected(self):
    # Metrics are collected only when publishing to BigQuery was requested and
    # all of the required BigQuery options were provided.
    return self.publish_to_big_query == 'true' and None not in (
        self.project_id, self.metrics_dataset, self.metrics_namespace)
class LoadTest(object):
  """Base class for all integration and performance tests which export
  metrics to external databases: BigQuery or/and InfluxDB.

  Refer to :class:`~apache_beam.testing.load_tests.LoadTestOptions` for more
  information on the required pipeline options.

  If using InfluxDB with Basic HTTP authentication enabled, provide the
  following environment variables: `INFLUXDB_USER` and
  `INFLUXDB_USER_PASSWORD`.
  """
  def __init__(self, metrics_namespace=None):
    # Be sure to set blocking to False for timeout_ms to work properly.
    self.pipeline = TestPipeline(is_integration_test=True, blocking=False)
    assert not self.pipeline.blocking

    options = self.pipeline.get_pipeline_options().view_as(LoadTestOptions)
    self.timeout_ms = options.timeout_ms
    self.input_options = options.input_options

    if metrics_namespace:
      self.metrics_namespace = metrics_namespace
    else:
      self.metrics_namespace = (
          options.metrics_table if options.metrics_table else 'default')

    publish_to_bq = options.publish_to_big_query
    if publish_to_bq is None:
      logging.info(
          'Missing --publish_to_big_query option. Metrics will not '
          'be published to BigQuery.')
    if options.input_options is None:
      logging.error('--input_options argument is required.')
      sys.exit(1)

    gcloud_options = self.pipeline.get_pipeline_options().view_as(
        GoogleCloudOptions)
    self.project_id = gcloud_options.project

    self._metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=self.project_id,
        bq_table=options.metrics_table,
        bq_dataset=options.metrics_dataset,
        namespace=self.metrics_namespace,
        influxdb_options=InfluxDBMetricsPublisherOptions(
            options.influx_measurement,
            options.influx_db_name,
            options.influx_hostname,
            os.getenv('INFLUXDB_USER'),
            os.getenv('INFLUXDB_USER_PASSWORD')),
        # Apply filter to prevent system metrics from being published.
        filters=MetricsFilter().with_namespace(self.metrics_namespace))

  def test(self):
    """An abstract method where the pipeline definition should be put."""
    pass

  def cleanup(self):
    """An abstract method that executes after the test method."""
    pass

  def run(self):
    try:
      self.test()
      if not hasattr(self, 'result'):
        self.result = self.pipeline.run()
        # Defaults to waiting forever, unless timeout_ms has been set.
        self.result.wait_until_finish(duration=self.timeout_ms)
      self._metrics_monitor.publish_metrics(self.result)
    finally:
      self.cleanup()

  def parse_synthetic_source_options(self, options=None):
    if not options:
      options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def get_option_or_default(self, opt_name, default=0):
    """Returns a testing option or a default value if it was not provided.

    The returned value is cast to the type of the default value.
    """
    option = self.pipeline.get_option(
        opt_name, bool_option=type(default) == bool)
    if option is None:
      return default
    return type(default)(option)
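# A hypothetical usage sketch for the base class above (class, namespace and
# step names are illustrative only): a subclass defines its pipeline in test(),
# and a __main__ block calls run(), which executes the pipeline, waits up to
# timeout_ms and publishes metrics.
import logging

import apache_beam as beam
from apache_beam.testing.synthetic_pipeline import SyntheticSource


class CombineLoadTestSketch(LoadTest):
  def __init__(self):
    super(CombineLoadTestSketch, self).__init__(
        metrics_namespace='combine_sketch')

  def test(self):
    (
        self.pipeline
        | 'Read synthetic' >> beam.io.Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Count per key' >> beam.combiners.Count.PerKey())


if __name__ == '__main__':
  logging.basicConfig(level=logging.INFO)
  CombineLoadTestSketch().run()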
class LoadTest(object):
  def __init__(self):
    self.pipeline = TestPipeline(is_integration_test=True)

    load_test_options = self.pipeline.get_pipeline_options().view_as(
        LoadTestOptions)
    self.input_options = load_test_options.input_options
    self.metrics_namespace = load_test_options.metrics_table or 'default'
    publish_to_bq = load_test_options.publish_to_big_query
    if publish_to_bq is None:
      logging.info(
          'Missing --publish_to_big_query option. Metrics will not '
          'be published to BigQuery.')
    if load_test_options.input_options is None:
      logging.error('--input_options argument is required.')
      sys.exit(1)

    gcloud_options = self.pipeline.get_pipeline_options().view_as(
        GoogleCloudOptions)
    self.project_id = gcloud_options.project

    self._metrics_monitor = MetricsReader(
        publish_to_bq=publish_to_bq,
        project_name=self.project_id,
        bq_table=load_test_options.metrics_table,
        bq_dataset=load_test_options.metrics_dataset,
        # Apply filter to prevent system metrics from being published.
        filters=MetricsFilter().with_namespace(self.metrics_namespace))

  def test(self):
    """An abstract method where the pipeline definition should be put."""
    pass

  def cleanup(self):
    """An abstract method that executes after the test method."""
    pass

  def run(self):
    try:
      self.test()
      if not hasattr(self, 'result'):
        self.result = self.pipeline.run()
        self.result.wait_until_finish()
      self._metrics_monitor.publish_metrics(self.result)
    finally:
      self.cleanup()

  def parse_synthetic_source_options(self, options=None):
    if not options:
      options = self.input_options
    return {
        'numRecords': options.get('num_records'),
        'keySizeBytes': options.get('key_size'),
        'valueSizeBytes': options.get('value_size'),
        'hotKeyFraction': options.get('hot_key_fraction', 0),
        'numHotKeys': options.get('num_hot_keys', 0),
        'bundleSizeDistribution': {
            'type': options.get('bundle_size_distribution_type', 'const'),
            'param': options.get('bundle_size_distribution_param', 0)
        },
        'forceNumInitialBundles': options.get('force_initial_num_bundles', 0)
    }

  def get_option_or_default(self, opt_name, default=0):
    """Returns a pipeline option or a default value if it was not provided.

    The returned value is converted to an integer.
    """
    option = self.pipeline.get_option(opt_name)
    try:
      return int(option)
    except TypeError:
      return default