def main(argv=None):
  options = PipelineOptions(argv)
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  input1 = p | 'Input1' >> beam.Create([1, 2, 3], reshuffle=False)
  input2 = p | 'Input2' >> beam.Create([4, 5, 6], reshuffle=False)

  output_a, output_b = (
      (input1, input2)
      | 'Flatten' >> beam.Flatten()
      | 'Split' >> beam.ParDo(MultiOutputDoFn()).with_outputs(
          MultiOutputDoFn.OUTPUT_TAG_B, main=MultiOutputDoFn.OUTPUT_TAG_A))

  # IdentityA and IdentityB set output types so that the correct coders are
  # used with the Dataflow runner. Without them, you may see a type inference
  # error (BEAM-4132).
  (output_a
   | 'IdentityA' >> beam.Map(lambda x: x).with_output_types(Tuple[str, int])
   | 'PrintA' >> beam.ParDo(StatefulPrintDoFn('PrintA')))
  (output_b
   | 'IdentityB' >> beam.Map(lambda x: x).with_output_types(Tuple[str, int])
   | 'PrintB' >> beam.ParDo(StatefulPrintDoFn('PrintB')))

  p.run()
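# The snippet above references MultiOutputDoFn and StatefulPrintDoFn, which are
# defined elsewhere in the original module. A minimal, hypothetical sketch of a
# DoFn with a main output and one tagged output (the tag names and the
# (str, int) element shape are assumptions taken from the pipeline above):
import apache_beam as beam
from apache_beam import pvalue


class MultiOutputDoFn(beam.DoFn):
  OUTPUT_TAG_A = 'a'
  OUTPUT_TAG_B = 'b'

  def process(self, element):
    # Untagged yields go to the main output, which the pipeline above maps to
    # OUTPUT_TAG_A via with_outputs(..., main=OUTPUT_TAG_A).
    yield ('a', element)
    # Tagged yields go to the corresponding additional output.
    yield pvalue.TaggedOutput(MultiOutputDoFn.OUTPUT_TAG_B, ('b', element))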
def main(argv=None):
  options = PipelineOptions(argv)
  p = Pipeline(options=options)
  (p
   | GenerateSequence(
       0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
   | Map(lambda x: logging.info(x)))
  p.run()
def main(argv=None): options = PipelineOptions(argv) p = Pipeline(options=options) (p | Create(["a", "b", "c", "d", "e"], reshuffle=False) | Print("hello", expansion_service(options))) p.run()
def handle_return(self, pipeline: beam.Pipeline) -> None:
  """Appends a beam.io.WriteToParquet at the end of a beam pipeline
  and therefore persists the results.

  Args:
    pipeline: A beam.Pipeline object.
  """
  # TODO [ENG-139]: Implement beam writing
  super().handle_return(pipeline)
  # Placeholder step; the actual write transform is not implemented yet.
  pipeline | beam.ParDo()
  pipeline.run()
def main(argv=None):
  options = PipelineOptions(argv)
  p = Pipeline(options=options)

  input = p | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)
  output1 = input | 'Output1' >> beam.Map(
      lambda x, side: (x, side), AsList(input))
  input | 'Output2' >> beam.Map(
      lambda x, side: logging.info('x: %s, side: %s', x, side),
      AsList(output1))

  p.run()
def main(argv=None):
  options = PipelineOptions(argv)
  kafka_options = options.view_as(KafkaReadOptions)

  p = Pipeline(options=options)
  (p
   | ReadFromKafka(
       consumer_config={
           'bootstrap.servers': kafka_options.bootstrap_servers
       },
       topics=[kafka_options.topic])
   | Map(lambda x: logging.info('kafka element: %s', x)))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  project = options.view_as(GoogleCloudOptions).project

  p = Pipeline(options=options)
  (p
   | Create(EN_TEXTS)
   | ParDo(TranslateDoFn(project, SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE))
   | Map(print_translation))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  (p
   | Create(list(range(NUM_SHARDS)))
   | FlatMap(
       lambda _: (bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
   | WithKeys('')
   | ParDo(BigBagDoFn()))
  p.run()
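# BigBagDoFn is not defined in this snippet. A minimal, hypothetical sketch of
# a stateful DoFn that accumulates every element into a bag state (a plausible
# intent, given that the pipeline keys all elements to '' first); the state
# name and coder are assumptions:
import apache_beam as beam
from apache_beam.coders import BytesCoder
from apache_beam.transforms import userstate


class BigBagDoFn(beam.DoFn):
  BAG_STATE = userstate.BagStateSpec('big_bag', BytesCoder())

  def process(self, kv, bag=beam.DoFn.StateParam(BAG_STATE)):
    _, value = kv
    # Append the element to the per-key bag; reading it back would
    # materialize the (potentially very large) accumulated state.
    bag.add(value)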
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')

  # [END pipeline_options_define_custom]

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
def test_job_python_from_python_it(self):
  @ptransform.PTransform.register_urn('simple', None)
  class SimpleTransform(ptransform.PTransform):
    def expand(self, pcoll):
      return pcoll | beam.Map(lambda x: 'Simple(%s)' % x)

    def to_runner_api_parameter(self, unused_context):
      return 'simple', None

    @staticmethod
    def from_runner_api_parameter(_0, _1, _2):
      return SimpleTransform()

  pipeline = TestPipeline(is_integration_test=True)

  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto_pipeline, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto_pipeline, pipeline.runner, pipeline._options)
  pipeline_from_proto.run().wait_until_finish()
def main(): # bq_source = BigQuerySource(query=""" # SELECT created_at, text # FROM got_sentiment.got_tweets # """, # validate=False, coder=None, # use_standard_sql=True, flatten_results=True, # kms_key=None) # Removed attributes from ReadFromPubSub: # with_attributes=False, # timestamp_attribute='created_at' # Create the Pipeline with the specified options. with Pipeline(options=options) as p: results = ( p | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC) | 'Window' >> WindowInto(window.FixedWindows(60)) | 'Emit_needed_values' >> FlatMap(emit_values, entity_map) | 'Combine' >> CombinePerKey(EntityScoreCombine()) | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn()) | 'FormatForWrite' >> Map(format_for_write) | 'Write' >> WriteToBigQuery('streaming_scores', dataset=BQ_DATASET, project=PROJECT_ID, create_disposition='CREATE_IF_NEEDED', write_disposition='WRITE_APPEND', batch_size=20))
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input',
          help='Input for the pipeline',
          default='gs://my-bucket/input')
      parser.add_argument(
          '--output',
          help='Output for the pipeline',
          default='gs://my-bucket/output')

  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  with Pipeline(options=options) as p:
    # [END pipeline_options_local]
    with TestPipeline() as p:  # Use TestPipeline for testing.
      lines = p | beam.io.ReadFromText(my_input)
      lines | beam.io.WriteToText(my_output)
def test_pipeline_generation(self):
  @ptransform.PTransform.register_urn('simple', None)
  class SimpleTransform(ptransform.PTransform):
    def expand(self, pcoll):
      return pcoll | 'TestLabel' >> beam.Map(lambda x: 'Simple(%s)' % x)

    def to_runner_api_parameter(self, unused_context):
      return 'simple', None

    @staticmethod
    def from_runner_api_parameter(unused_parameter, unused_context):
      return SimpleTransform()

  pipeline = beam.Pipeline()
  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # New pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(simple)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  start = 1
  end = 100
  (p
   | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
   | 'Sum' >> CombineGlobally(sum)
   | 'Print' >> ParDo(
       lambda total: logging.info('Sum from 1 to 100 is %s', total)))
  p.run()
def run(argv=None):
  options = PipelineOptions(argv)
  options.view_as(SetupOptions).save_main_session = True
  with Pipeline(options=options) as p:
    (p
     | beam.Create([None])
     | beam.ParDo(connect_and_query))
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  project = options.view_as(GoogleCloudOptions).project
  assert project is not None, '"project" is not specified.'

  source_code = 'en-US'
  target_code = 'ja'
  texts = ['Hello', 'Thank you', 'Goodbye']

  p = Pipeline(options=options)
  (p
   | 'Texts' >> Create(texts)
   | 'Translate' >> ParDo(Translate(project, source_code, target_code))
   | 'Print' >> Map(lambda pair: logging.info('%s -> %s', pair[0], pair[1])))
  p.run()
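# The Translate DoFn used above is defined elsewhere in the original sample.
# As a rough, hypothetical sketch (an assumption, not the original
# implementation), a DoFn wrapping the Cloud Translation v3 client could look
# roughly like this; the client is created in setup() so it is not pickled
# with the DoFn:
import apache_beam as beam
from google.cloud import translate


class Translate(beam.DoFn):
  def __init__(self, project, source_code, target_code):
    self.project = project
    self.source_code = source_code
    self.target_code = target_code

  def setup(self):
    self.client = translate.TranslationServiceClient()

  def process(self, text):
    response = self.client.translate_text(
        request={
            'parent': f'projects/{self.project}/locations/global',
            'contents': [text],
            'mime_type': 'text/plain',
            'source_language_code': self.source_code,
            'target_language_code': self.target_code,
        })
    # Emit (original, translated) pairs, matching the Print step above.
    yield text, response.translations[0].translated_text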
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # ~128 KiB each
   | WithKeys('')
   # Large batch size with a 10-minute (600 s) buffering limit.
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)
   | Map(lambda kv: logging.info(
       'key: %s, value count: %s', kv[0], len(kv[1]))))
  run = p.run()
  run.wait_until_finish()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  start = 1
  end = 10
  (p
   | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
   | 'ToXml' >> ParDo(ToXmlDoFn())
   # If a job finishes too quickly, worker VMs can be shut down before they
   # ship their local log files to Cloud Logging. Sleep 30 s to avoid this.
   | 'Sleep30s' >> ParDo(Sleep(30))
   | 'Print' >> ParDo(lambda xml: logging.info(xml)))
  p.run()
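# ToXmlDoFn and Sleep are defined elsewhere in the original module. The Sleep
# DoFn referenced above is simple to sketch; this hypothetical version just
# delays each element by a fixed number of seconds before passing it through:
import time

import apache_beam as beam


class Sleep(beam.DoFn):
  def __init__(self, secs):
    self.secs = secs

  def process(self, element):
    # Delay processing so worker VMs stay up long enough to flush logs.
    time.sleep(self.secs)
    yield element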
def main(argv=None):
  options = PipelineOptions(argv)
  topic = options.view_as(PubSubTopicOptions).topic

  p = Pipeline(options=options)
  (p
   # This is the external transform
   # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`, which is different
   # from `apache_beam.io.gcp.pubsub.ReadFromPubSub`, the native transform
   # used in most cases.
   #
   # If you set expansion_service to BeamJarExpansionService(
   # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'), it
   # fails because the Beam jar has no dependency for DirectRunner. As a
   # workaround, this project specifies a custom expansion service jar.
   | ReadFromPubSub(
       topic=topic,
       with_attributes=True,
       expansion_service=expansion_service(options))
   | Map(lambda message: logging.info("message: %s", message)))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  opt = options.view_as(_Options)
  inputs = opt.inputs
  output_prefix = opt.output_prefix or os.path.join(
      options.view_as(GoogleCloudOptions).temp_location, 'output')
  shards = opt.shards

  p = Pipeline(options=options)

  def generate(n):
    yield from range(n * _ELEMENTS_PER_INPUT, (n + 1) * _ELEMENTS_PER_INPUT)

  (p
   | Create(range(inputs))
   | ParDo(generate).with_output_types(int)
   | WriteToText(output_prefix, num_shards=shards))
  p.run()
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      'input_topic', type=str, help="Input Pub/Sub topic name.")
  parser.add_argument(
      'output_table',
      type=str,
      help="Output BigQuery table name. Example: project.db.name")
  parser.add_argument(
      '--model_project', type=str, help="Google Project ID with model.")
  parser.add_argument(
      '--model_name', type=str, help="Name of the Google AI Platform model.")
  parser.add_argument(
      '--model_region', type=str, help="AI Platform region name.")
  parser.add_argument(
      '--model_version', type=str, help="AI Platform model version.")

  known_args, pipeline_args = parser.parse_known_args(argv)

  _topic_comp = known_args.input_topic.split('/')
  if len(_topic_comp) != 4 or _topic_comp[0] != 'projects' or _topic_comp[
      2] != 'topics':
    raise ValueError("Topic name has inappropriate format.")

  if len(known_args.output_table.split('.')) != 2:
    raise ValueError("Table name has inappropriate format.")

  inf_args = [
      known_args.model_project,
      known_args.model_name,
      known_args.model_region,
      known_args.model_version
  ]

  options = PipelineOptions(pipeline_args)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  p = Pipeline(options=options)
  _ = (
      p
      | 'read from pub/sub' >> ReadFromPubSub(
          known_args.input_topic).with_output_types(bytes)
      | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
      | 'convert to dict' >> Map(json.loads)
      | 'pre processing' >> PreProcessing()
      | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
      | 'format message' >> Map(formatter)
      | 'write to BQ' >> WriteToBigQuery(
          table=known_args.output_table,
          schema=build_bq_schema(),
          create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
          write_disposition=BigQueryDisposition.WRITE_APPEND))

  if os.environ.get('DEPLOY'):
    # Use p.run() instead of the `with Pipeline() as p` context manager so the
    # process can exit right after submitting the job.
    p.run()
  else:
    p.run().wait_until_finish()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # ~128 KiB each
   | WithKeys('')
   | WindowInto(
       GlobalWindows(),
       trigger=Repeatedly(
           AfterAny(AfterCount(BATCH_SIZE),
                    AfterProcessingTime(BUFFERING_SECS))),
       accumulation_mode=AccumulationMode.DISCARDING)
   | GroupByKey()
   | Map(lambda kv: logging.info(
       'key: %s, value count: %s', kv[0], len(kv[1]))))
  run = p.run()
  run.wait_until_finish()
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution."""

  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input',
          help='Input for the pipeline',
          default='gs://my-bucket/input')
      parser.add_argument(
          '--output',
          help='Output for the pipeline',
          default='gs://my-bucket/output')

  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  p = TestPipeline()  # Use TestPipeline for testing.
  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)
  p.run()
def run(input_topic, num_shards, window_size):
  # Set `save_main_session` to True so DoFns can access globally imported
  # modules.
  pipeline_options = PipelineOptions(
      pipeline_args, streaming=True, save_main_session=True)
  custom_options = pipeline_options.view_as(CustomPipelineOptions)

  with Pipeline(options=custom_options) as pipeline:
    (pipeline
     # Because `timestamp_attribute` is unspecified in `ReadFromPubSub`, Beam
     # binds the publish time returned by the Pub/Sub server for each message
     # to the element's timestamp parameter, accessible via
     # `DoFn.TimestampParam`.
     # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
     | "Read from Pub/Sub" >> io.ReadFromPubSub(topic=input_topic)
     | "Window into" >> GroupMessagesByFixedWindows(window_size, num_shards)
     | "Write to GCS" >> ParDo(WriteToGCS(custom_options.output_path)))
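# GroupMessagesByFixedWindows and WriteToGCS come from the surrounding sample.
# A simplified, hypothetical sketch of the windowing PTransform (window_size
# is assumed to be in minutes, and shard keys are assigned randomly):
import random

import apache_beam as beam
from apache_beam.transforms.window import FixedWindows


class GroupMessagesByFixedWindows(beam.PTransform):
  def __init__(self, window_size, num_shards=5):
    self.window_size = int(window_size * 60)  # minutes -> seconds
    self.num_shards = num_shards

  def expand(self, pcoll):
    return (
        pcoll
        | 'Window into fixed intervals' >> beam.WindowInto(
            FixedWindows(self.window_size))
        # Assign a random shard key so downstream writes can run in parallel.
        | 'Add shard key' >> beam.WithKeys(
            lambda _: random.randint(0, self.num_shards - 1))
        | 'Group by shard key' >> beam.GroupByKey())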
def test_pipeline_generation(self):
  pipeline = beam.Pipeline()
  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # New pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(simple)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)
def tensorize_sql_fields(pipeline: Pipeline, output_path: str,
                         sql_dataset: str, tensor_type: str):
  if tensor_type == 'categorical':
    query = _get_categorical_query(sql_dataset)
  elif tensor_type == 'continuous':
    query = _get_continuous_query(sql_dataset)
  elif tensor_type == 'icd':
    query = _get_icd_query(sql_dataset)
  elif tensor_type == 'disease':
    query = _get_disease_query(sql_dataset)
  elif tensor_type == 'phecode_disease':
    query = _get_phecode_query(sql_dataset)
  elif tensor_type == 'death':
    query = _get_death_and_censor_query(sql_dataset)
  else:
    raise ValueError("Cannot tensorize unknown tensor_type: ", tensor_type)

  bigquery_source = beam.io.BigQuerySource(query=query, use_standard_sql=True)

  steps = (
      pipeline
      # Query table in BQ
      | 'QueryTables' >> beam.io.Read(bigquery_source)
      # Each row is a dictionary where the keys are the BigQuery columns
      | 'CreateKey' >> beam.Map(lambda row: (row['sample_id'], row))
      # Group by key
      | 'GroupByKey' >> beam.GroupByKey()
      # Format into hd5 files and upload to GCS
      | 'CreateHd5sAndUploadToGCS' >> beam.Map(
          write_tensor_from_sql, output_path, tensor_type))

  result = pipeline.run()
  result.wait_until_finish()
def test_pipeline_generation(self):
  pipeline = beam.Pipeline()
  _ = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'beam:transforms:xlang:test:prefix',
          ImplicitSchemaPayloadBuilder({'data': u'0'}),
          expansion_service.ExpansionServiceServicer()))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # New pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(beam:transforms:xlang:test:prefix)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)
def test_metrics(self): """Run a simple DoFn that increments a counter, and verify that its expected value is written to a temporary file by the FileReporter""" counter_name = 'elem_counter' class DoFn(beam.DoFn): def __init__(self): self.counter = Metrics.counter(self.__class__, counter_name) logging.info('counter: %s' % self.counter.metric_name) def process(self, v): self.counter.inc() options = self.create_options() # Test only supports parallelism of 1 options._all_options['parallelism'] = 1 n = 100 with Pipeline(self.get_runner(), options) as p: # pylint: disable=expression-not-assigned (p | beam.Create(list(range(n))) | beam.ParDo(DoFn())) with open(self.test_metrics_path, 'r') as f: lines = [line for line in f.readlines() if counter_name in line] self.assertEqual( len(lines), 1, msg='Expected 1 line matching "{}":\n{}'.format( counter_name, '\n'.join(lines)) ) line = lines[0] self.assertTrue( '{}: {}'.format(counter_name in line, n), msg='Failed to find expected counter {} in line {}'.format( counter_name, line) )
def test_pipeline_generation(self):
  pipeline = beam.Pipeline()
  res = (
      pipeline
      | beam.Create(['a', 'b'])
      | beam.ExternalTransform(
          'simple', None, expansion_service.ExpansionServiceServicer()))
  assert_that(res, equal_to(['Simple(a)', 'Simple(b)']))

  proto, _ = pipeline.to_runner_api(return_context=True)
  pipeline_from_proto = Pipeline.from_runner_api(
      proto, pipeline.runner, pipeline._options)

  # Original pipeline has the un-expanded external transform
  self.assertEqual([], pipeline.transforms_stack[0].parts[1].parts)

  # New pipeline has the expanded external transform
  self.assertNotEqual(
      [], pipeline_from_proto.transforms_stack[0].parts[1].parts)
  self.assertEqual(
      u'ExternalTransform(simple)/TestLabel',
      pipeline_from_proto.transforms_stack[0].parts[1].parts[0].full_label)
def test_metrics(self): """Run a simple DoFn that increments a counter and verifies state caching metrics. Verifies that its expected value is written to a temporary file by the FileReporter""" counter_name = 'elem_counter' state_spec = userstate.BagStateSpec('state', VarIntCoder()) class DoFn(beam.DoFn): def __init__(self): self.counter = Metrics.counter(self.__class__, counter_name) _LOGGER.info('counter: %s' % self.counter.metric_name) def process(self, kv, state=beam.DoFn.StateParam(state_spec)): # Trigger materialization list(state.read()) state.add(1) self.counter.inc() options = self.create_options() # Test only supports parallelism of 1 options._all_options['parallelism'] = 1 # Create multiple bundles to test cache metrics options._all_options['max_bundle_size'] = 10 options._all_options['max_bundle_time_millis'] = 95130590130 experiments = options.view_as(DebugOptions).experiments or [] experiments.append('state_cache_size=123') options.view_as(DebugOptions).experiments = experiments with Pipeline(self.get_runner(), options) as p: # pylint: disable=expression-not-assigned (p | "create" >> beam.Create(list(range(0, 110))) | "mapper" >> beam.Map(lambda x: (x % 10, 'val')) | "stateful" >> beam.ParDo(DoFn())) lines_expected = {'counter: 110'} if streaming: lines_expected.update([ # Gauges for the last finished bundle 'stateful.beam.metric:statecache:capacity: 123', # These are off by 10 because the first bundle contains all the keys # once. Caching is only initialized after the first bundle. Caching # depends on the cache token which is lazily initialized by the # Runner's StateRequestHandlers. 'stateful.beam.metric:statecache:size: 10', 'stateful.beam.metric:statecache:get: 10', 'stateful.beam.metric:statecache:miss: 0', 'stateful.beam.metric:statecache:hit: 10', 'stateful.beam.metric:statecache:put: 0', 'stateful.beam.metric:statecache:extend: 10', 'stateful.beam.metric:statecache:evict: 0', # Counters # (total of get/hit will be off by 10 due to the caching # only getting initialized after the first bundle. # Caching depends on the cache token which is lazily # initialized by the Runner's StateRequestHandlers). 'stateful.beam.metric:statecache:get_total: 100', 'stateful.beam.metric:statecache:miss_total: 10', 'stateful.beam.metric:statecache:hit_total: 90', 'stateful.beam.metric:statecache:put_total: 10', 'stateful.beam.metric:statecache:extend_total: 100', 'stateful.beam.metric:statecache:evict_total: 0', ]) else: # Batch has a different processing model. All values for # a key are processed at once. lines_expected.update([ # Gauges 'stateful).beam.metric:statecache:capacity: 123', # For the first key, the cache token will not be set yet. 
# It's lazily initialized after first access in StateRequestHandlers 'stateful).beam.metric:statecache:size: 9', # We have 11 here because there are 110 / 10 elements per key 'stateful).beam.metric:statecache:get: 11', 'stateful).beam.metric:statecache:miss: 1', 'stateful).beam.metric:statecache:hit: 10', # State is flushed back once per key 'stateful).beam.metric:statecache:put: 1', 'stateful).beam.metric:statecache:extend: 1', 'stateful).beam.metric:statecache:evict: 0', # Counters 'stateful).beam.metric:statecache:get_total: 99', 'stateful).beam.metric:statecache:miss_total: 9', 'stateful).beam.metric:statecache:hit_total: 90', 'stateful).beam.metric:statecache:put_total: 9', 'stateful).beam.metric:statecache:extend_total: 9', 'stateful).beam.metric:statecache:evict_total: 0', ]) lines_actual = set() with open(self.test_metrics_path, 'r') as f: line = f.readline() while line: for metric_str in lines_expected: if metric_str in line: lines_actual.add(metric_str) line = f.readline() self.assertSetEqual(lines_actual, lines_expected)
def _create_pipeline(self, options):
  return Pipeline(options=options)
def run(argv=None):
  options = PipelineOptions(argv)
  options.view_as(SetupOptions).save_main_session = True
  with Pipeline(options=options) as p:
    (p
     | beam.Impulse()
     | beam.ParDo(MysqlDoFn()))