def main(argv=None):
  options = PipelineOptions(argv)
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)

  input1 = p | 'Input1' >> beam.Create([1, 2, 3], reshuffle=False)
  input2 = p | 'Input2' >> beam.Create([4, 5, 6], reshuffle=False)

  output_a, output_b = (
      (input1, input2)
      | 'Flatten' >> beam.Flatten()
      | 'Split' >> beam.ParDo(MultiOutputDoFn()).with_outputs(
          MultiOutputDoFn.OUTPUT_TAG_B, main=MultiOutputDoFn.OUTPUT_TAG_A))

  # IdentityA and IdentityB set the output types so that the right coders are
  # used on the Dataflow runner. Without them you may see a type inference
  # error (BEAM-4132).
  (output_a
   | 'IdentityA' >> beam.Map(lambda x: x).with_output_types(Tuple[str, int])
   | 'PrintA' >> beam.ParDo(StatefulPrintDoFn('PrintA')))
  (output_b
   | 'IdentityB' >> beam.Map(lambda x: x).with_output_types(Tuple[str, int])
   | 'PrintB' >> beam.ParDo(StatefulPrintDoFn('PrintB')))

  p.run()
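# MultiOutputDoFn and StatefulPrintDoFn are referenced above but not defined in
# this snippet. Below is a minimal sketch of what a multi-output DoFn such as
# MultiOutputDoFn could look like, assuming it routes even numbers to the main
# output (OUTPUT_TAG_A) and odd numbers to OUTPUT_TAG_B; the routing rule and
# the (str, int) payloads are assumptions, not the original implementation.
import apache_beam as beam
from apache_beam import pvalue


class MultiOutputDoFnSketch(beam.DoFn):
  OUTPUT_TAG_A = 'tag_a'
  OUTPUT_TAG_B = 'tag_b'

  def process(self, element):
    if element % 2 == 0:
      # Untagged yields go to the main output, declared as OUTPUT_TAG_A above.
      yield ('even', element)
    else:
      # Tagged yields go to the additional OUTPUT_TAG_B output.
      yield pvalue.TaggedOutput(self.OUTPUT_TAG_B, ('odd', element))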
def run(argv=None):
  parser = argparse.ArgumentParser()
  parser.add_argument(
      'input_topic', type=str, help="Input Pub/Sub topic name.")
  parser.add_argument(
      'output_table',
      type=str,
      help="Output BigQuery table name. Example: dataset.table_name")
  parser.add_argument(
      '--model_project',
      type=str,
      help="Google Cloud project ID that hosts the model.")
  parser.add_argument(
      '--model_name', type=str, help="Name of the AI Platform model.")
  parser.add_argument(
      '--model_region', type=str, help="AI Platform region name.")
  parser.add_argument(
      '--model_version', type=str, help="AI Platform model version.")
  known_args, pipeline_args = parser.parse_known_args(argv)

  _topic_comp = known_args.input_topic.split('/')
  if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
      or _topic_comp[2] != 'topics'):
    raise ValueError(
        "Topic name must have the format projects/<project>/topics/<topic>.")

  if len(known_args.output_table.split('.')) != 2:
    raise ValueError("Table name must have the format <dataset>.<table>.")

  inf_args = [
      known_args.model_project,
      known_args.model_name,
      known_args.model_region,
      known_args.model_version
  ]
  options = PipelineOptions(pipeline_args)
  options.view_as(SetupOptions).save_main_session = True
  options.view_as(StandardOptions).streaming = True

  p = Pipeline(options=options)
  _ = (p
       | 'read from pub/sub' >> ReadFromPubSub(
           known_args.input_topic).with_output_types(bytes)
       | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
       | 'convert to dict' >> Map(json.loads)
       | 'pre processing' >> PreProcessing()
       | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
       | 'format message' >> Map(formatter)
       | 'write to BQ' >> WriteToBigQuery(
           table=known_args.output_table,
           schema=build_bq_schema(),
           create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=BigQueryDisposition.WRITE_APPEND))

  if os.environ.get('DEPLOY'):
    # Use p.run() instead of the `with Pipeline() as p` context manager
    # because the process needs to exit right after submitting the job.
    p.run()
  else:
    p.run().wait_until_finish()
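# build_bq_schema() and formatter() are used above but not shown in this
# snippet. A minimal sketch of what they might look like, assuming the
# inference step emits a dict carrying the original input and a numeric
# prediction; the column names and types are assumptions.
def build_bq_schema_sketch():
  # WriteToBigQuery also accepts a schema string of the form
  # 'field_name:TYPE,other_field:TYPE'.
  return 'input:STRING,prediction:FLOAT'


def formatter_sketch(element):
  # Map the inference output onto the BigQuery columns declared above.
  return {
      'input': str(element.get('input')),
      'prediction': float(element.get('prediction', 0.0)),
  }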
def main(argv=None):
  options = PipelineOptions(argv)

  p = Pipeline(options=options)
  (p
   | GenerateSequence(
       0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
   | Map(lambda x: logging.info(x)))
  p.run()
def main(argv=None):
  options = PipelineOptions(argv)

  p = Pipeline(options=options)
  (p
   | Create(["a", "b", "c", "d", "e"], reshuffle=False)
   | Print("hello", expansion_service(options)))
  p.run()
def handle_return(self, pipeline: beam.Pipeline) -> None:
  """Appends a beam.io.WriteToParquet to the end of a Beam pipeline and
  thereby persists the results.

  Args:
      pipeline: A beam.Pipeline object.
  """
  # TODO [ENG-139]: Implement beam writing
  super().handle_return(pipeline)
  pipeline | beam.ParDo()  # Placeholder until the write step is implemented.
  pipeline.run()
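# A minimal sketch of what the TODO above might eventually look like, assuming
# the pipeline produces dict-like records matching a known pyarrow schema; the
# helper name, output path argument, and schema are placeholders, not part of
# the original code.
import apache_beam as beam
import pyarrow


def append_parquet_write_sketch(pcoll, output_path):
  # Assumed single-column schema; a real implementation would derive it from
  # the records being persisted.
  schema = pyarrow.schema([('value', pyarrow.string())])
  return pcoll | 'WriteResults' >> beam.io.WriteToParquet(
      file_path_prefix=output_path, schema=schema)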
def main(argv=None):
  options = PipelineOptions(argv)

  p = Pipeline(options=options)
  input = p | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)
  output1 = input | 'Output1' >> beam.Map(
      lambda x, side: (x, side), AsList(input))
  input | 'Output2' >> beam.Map(
      lambda x, side: logging.info('x: %s, side: %s', x, side),
      AsList(output1))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  project = options.view_as(GoogleCloudOptions).project

  p = Pipeline(options=options)
  (p
   | Create(EN_TEXTS)
   | ParDo(TranslateDoFn(project, SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE))
   | Map(print_translation))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  (p
   | Create(list(range(NUM_SHARDS)))
   | FlatMap(lambda _: (
       bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
   | WithKeys('')
   | ParDo(BigBagDoFn()))
  p.run()
def main(argv=None):
  options = PipelineOptions(argv)
  kafka_options = options.view_as(KafkaReadOptions)

  p = Pipeline(options=options)
  (p
   | ReadFromKafka(
       consumer_config={
           'bootstrap.servers': kafka_options.bootstrap_servers
       },
       topics=[kafka_options.topic])
   | Map(lambda x: logging.info('kafka element: %s', x)))
  p.run()
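# KafkaReadOptions is a custom options view used above but not defined in this
# snippet. A minimal sketch of how such an options class could be declared;
# the flag names match the attributes read above, while the help text and
# defaults are assumptions.
from apache_beam.options.pipeline_options import PipelineOptions


class KafkaReadOptionsSketch(PipelineOptions):
  @classmethod
  def _add_argparse_args(cls, parser):
    parser.add_argument(
        '--bootstrap_servers',
        default='localhost:9092',
        help='Kafka bootstrap servers, e.g. host1:9092,host2:9092.')
    parser.add_argument(
        '--topic', default='my-topic', help='Kafka topic to read from.')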
def pipeline_options_remote(argv):
  """Creating a Pipeline using a PipelineOptions object for remote execution."""
  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  # [START pipeline_options_create]
  options = PipelineOptions(flags=argv)
  # [END pipeline_options_create]

  # [START pipeline_options_define_custom]
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument('--input')
      parser.add_argument('--output')

  # [END pipeline_options_define_custom]

  from apache_beam.options.pipeline_options import GoogleCloudOptions
  from apache_beam.options.pipeline_options import StandardOptions

  # [START pipeline_options_dataflow_service]
  # Create and set your PipelineOptions.
  options = PipelineOptions(flags=argv)

  # For Cloud execution, set the Cloud Platform project, job_name,
  # staging location, temp_location and specify DataflowRunner.
  google_cloud_options = options.view_as(GoogleCloudOptions)
  google_cloud_options.project = 'my-project-id'
  google_cloud_options.job_name = 'myjob'
  google_cloud_options.staging_location = 'gs://my-bucket/binaries'
  google_cloud_options.temp_location = 'gs://my-bucket/temp'
  options.view_as(StandardOptions).runner = 'DataflowRunner'

  # Create the Pipeline with the specified options.
  p = Pipeline(options=options)
  # [END pipeline_options_dataflow_service]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  p = TestPipeline()  # Use TestPipeline for testing.

  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)

  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  start = 1
  end = 100
  (p
   | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
   | 'Sum' >> CombineGlobally(sum)
   | 'Print' >> ParDo(
       lambda total: logging.info('Sum from 1 to 100 is %s', total)))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  project = options.view_as(GoogleCloudOptions).project
  assert project is not None, '"project" is not specified.'

  source_code = 'en-US'
  target_code = 'ja'
  texts = ['Hello', 'Thank you', 'Goodbye']

  p = Pipeline(options=options)
  (p
   | 'Texts' >> Create(texts)
   | 'Translate' >> ParDo(Translate(project, source_code, target_code))
   | 'Print' >> Map(lambda pair: logging.info('%s -> %s', pair[0], pair[1])))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  p = Pipeline(options=options)
  start = 1
  end = 10
  (p
   | 'From {} to {}'.format(start, end) >> Create(list(range(start, end + 1)))
   | 'ToXml' >> ParDo(ToXmlDoFn())
   # If a job finishes too quickly, worker VMs can be shut down before they
   # send the logs in local files to Cloud Logging. Add a 30-second sleep to
   # avoid this.
   | 'Sleep30s' >> ParDo(Sleep(30))
   | 'Print' >> ParDo(lambda xml: logging.info(xml)))
  p.run()
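# Sleep is referenced above but not defined in this snippet. A minimal sketch
# of a DoFn that delays each element by a fixed number of seconds; this is an
# assumed implementation, not the original.
import time

import apache_beam as beam


class SleepSketch(beam.DoFn):
  def __init__(self, seconds):
    self._seconds = seconds

  def process(self, element):
    # Block the worker briefly, then pass the element through unchanged.
    time.sleep(self._seconds)
    yield element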
def main(argv=None):
  options = PipelineOptions(argv)
  topic = options.view_as(PubSubTopicOptions).topic

  p = Pipeline(options=options)
  (p
   # This is the external transform
   # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`, which is different
   # from `apache_beam.io.gcp.pubsub.ReadFromPubSub`, the native transform
   # used in most cases.
   #
   # If you set expansion_service to BeamJarExpansionService(
   # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'), it
   # fails because the Beam jar has no dependency for the DirectRunner. As a
   # workaround, specify the custom expansion service jar in this project.
   | ReadFromPubSub(
       topic=topic,
       with_attributes=True,
       expansion_service=expansion_service(options))
   | Map(lambda message: logging.info("message: %s", message)))
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  opt = options.view_as(_Options)
  inputs = opt.inputs
  output_prefix = opt.output_prefix or os.path.join(
      options.view_as(GoogleCloudOptions).temp_location, 'output')
  shards = opt.shards

  p = Pipeline(options=options)

  def generate(n):
    yield from range(n * _ELEMENTS_PER_INPUT, (n + 1) * _ELEMENTS_PER_INPUT)

  (p
   | Create(range(inputs))
   | ParDo(generate).with_output_types(int)
   | WriteToText(output_prefix, num_shards=shards))
  p.run()
def pipeline_options_local(argv):
  """Creating a Pipeline using a PipelineOptions object for local execution."""
  from apache_beam import Pipeline
  from apache_beam.options.pipeline_options import PipelineOptions

  options = PipelineOptions(flags=argv)

  # [START pipeline_options_define_custom_with_help_and_default]
  class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input',
          help='Input for the pipeline',
          default='gs://my-bucket/input')
      parser.add_argument(
          '--output',
          help='Output for the pipeline',
          default='gs://my-bucket/output')

  # [END pipeline_options_define_custom_with_help_and_default]

  my_options = options.view_as(MyOptions)
  my_input = my_options.input
  my_output = my_options.output

  # [START pipeline_options_local]
  # Create and set your Pipeline Options.
  options = PipelineOptions()
  p = Pipeline(options=options)
  # [END pipeline_options_local]

  p = TestPipeline()  # Use TestPipeline for testing.
  lines = p | beam.io.ReadFromText(my_input)
  lines | beam.io.WriteToText(my_output)
  p.run()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   # Large batch size with a 10-minute buffering trigger.
   | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)
   | Map(lambda kv: logging.info(
       'key: %s, value count: %s', kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
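# GroupIntoBatchesWithMultiBags is a custom transform that is not defined in
# this snippet. A rough sketch of the general technique it appears to build
# on: a stateful DoFn that buffers values in bag state and flushes either when
# the element count reaches the batch size or when a processing-time timer
# fires. This sketch uses a single bag plus a counter rather than multiple
# bags, so it is an assumption about the approach, not the original code.
import apache_beam as beam
from apache_beam.coders import PickleCoder, VarIntCoder
from apache_beam.transforms.combiners import CountCombineFn
from apache_beam.transforms.timeutil import TimeDomain
from apache_beam.transforms.userstate import (
    BagStateSpec, CombiningValueStateSpec, TimerSpec, on_timer)
from apache_beam.utils.timestamp import Duration, Timestamp


class GroupIntoBatchesSketchDoFn(beam.DoFn):
  BUFFER = BagStateSpec('buffer', PickleCoder())
  COUNT = CombiningValueStateSpec('count', VarIntCoder(), CountCombineFn())
  FLUSH_TIMER = TimerSpec('flush', TimeDomain.REAL_TIME)

  def __init__(self, batch_size, buffering_secs):
    self._batch_size = batch_size
    self._buffering_secs = buffering_secs

  def process(self,
              element,
              buffer=beam.DoFn.StateParam(BUFFER),
              count=beam.DoFn.StateParam(COUNT),
              flush_timer=beam.DoFn.TimerParam(FLUSH_TIMER)):
    key, value = element
    if count.read() == 0:
      # First element for this key: arm the processing-time flush timer.
      flush_timer.set(Timestamp.now() + Duration(seconds=self._buffering_secs))
    buffer.add(value)
    count.add(1)
    if count.read() >= self._batch_size:
      # Batch is full: emit it and reset the buffered state.
      yield key, list(buffer.read())
      buffer.clear()
      count.clear()

  @on_timer(FLUSH_TIMER)
  def flush(self,
            key=beam.DoFn.KeyParam,
            buffer=beam.DoFn.StateParam(BUFFER),
            count=beam.DoFn.StateParam(COUNT)):
    # Buffering deadline reached: emit whatever has accumulated so far.
    values = list(buffer.read())
    buffer.clear()
    count.clear()
    if values:
      yield key, values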
def tensorize_sql_fields(
    pipeline: Pipeline, output_path: str, sql_dataset: str, tensor_type: str):
  if tensor_type == 'categorical':
    query = _get_categorical_query(sql_dataset)
  elif tensor_type == 'continuous':
    query = _get_continuous_query(sql_dataset)
  elif tensor_type == 'icd':
    query = _get_icd_query(sql_dataset)
  elif tensor_type == 'disease':
    query = _get_disease_query(sql_dataset)
  elif tensor_type == 'phecode_disease':
    query = _get_phecode_query(sql_dataset)
  elif tensor_type == 'death':
    query = _get_death_and_censor_query(sql_dataset)
  else:
    raise ValueError("Unsupported tensor type: {}".format(tensor_type))

  bigquery_source = beam.io.BigQuerySource(query=query, use_standard_sql=True)

  steps = (
      pipeline
      # Query the table in BigQuery.
      | 'QueryTables' >> beam.io.Read(bigquery_source)
      # Each row is a dictionary where the keys are the BigQuery columns.
      | 'CreateKey' >> beam.Map(lambda row: (row['sample_id'], row))
      # Group by key.
      | 'GroupByKey' >> beam.GroupByKey()
      # Format into hd5 files and upload to GCS.
      | 'CreateHd5sAndUploadToGCS' >> beam.Map(
          write_tensor_from_sql, output_path, tensor_type))

  result = pipeline.run()
  result.wait_until_finish()
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True

  BATCH_SIZE = 1000000
  BUFFERING_SECS = 600

  p = Pipeline(options=options)
  (p
   | Create(range(100), reshuffle=True)
   | ParDo(make_large_elements)  # 128 KiB
   | WithKeys('')
   | WindowInto(
       GlobalWindows(),
       trigger=Repeatedly(
           AfterAny(AfterCount(BATCH_SIZE),
                    AfterProcessingTime(BUFFERING_SECS))),
       accumulation_mode=AccumulationMode.DISCARDING)
   | GroupByKey()
   | Map(lambda kv: logging.info(
       'key: %s, value count: %s', kv[0], len(kv[1]))))

  run = p.run()
  run.wait_until_finish()
class PerformanceRuntimeTypeCheckTest(unittest.TestCase):
  def setUp(self):
    self.p = Pipeline(
        options=PipelineOptions(
            performance_runtime_type_check=True, pipeline_type_check=False))

  def assertStartswith(self, msg, prefix):
    self.assertTrue(
        msg.startswith(prefix),
        '"%s" does not start with "%s"' % (msg, prefix))

  def test_simple_input_error(self):
    with self.assertRaises(TypeCheckError) as e:
      (self.p
       | beam.Create([1, 1])
       | beam.FlatMap(lambda x: [int(x)]).with_input_types(
           str).with_output_types(int))
      self.p.run()

    self.assertIn(
        "Type-hint for argument: 'x' violated. "
        "Expected an instance of {}, "
        "instead found 1, an instance of {}".format(str, int),
        e.exception.args[0])

  def test_simple_output_error(self):
    with self.assertRaises(TypeCheckError) as e:
      (self.p
       | beam.Create(['1', '1'])
       | beam.FlatMap(lambda x: [int(x)]).with_input_types(
           int).with_output_types(int))
      self.p.run()

    self.assertIn(
        "Type-hint for argument: 'x' violated. "
        "Expected an instance of {}, "
        "instead found 1, an instance of {}.".format(int, str),
        e.exception.args[0])

  def test_simple_input_error_with_kwarg_typehints(self):
    @with_input_types(element=int)
    @with_output_types(int)
    class ToInt(beam.DoFn):
      def process(self, element, *args, **kwargs):
        yield int(element)

    with self.assertRaises(TypeCheckError) as e:
      (self.p
       | beam.Create(['1', '1'])
       | beam.ParDo(ToInt()))
      self.p.run()

    self.assertStartswith(
        e.exception.args[0],
        "Runtime type violation detected within "
        "ParDo(ToInt): Type-hint for argument: "
        "'element' violated. Expected an instance of "
        "{}, instead found 1, "
        "an instance of {}.".format(int, str))

  def test_do_fn_returning_non_iterable_throws_error(self):
    # This function is incorrect because it returns a non-iterable object
    def incorrect_par_do_fn(x):
      return x + 5

    with self.assertRaises(TypeError) as cm:
      (self.p
       | beam.Create([1, 1])
       | beam.FlatMap(incorrect_par_do_fn))
      self.p.run()

    self.assertStartswith(
        cm.exception.args[0], "'int' object is not iterable ")

  def test_simple_type_satisfied(self):
    @with_input_types(int, int)
    @with_output_types(int)
    class AddWithNum(beam.DoFn):
      def process(self, element, num):
        return [element + num]

    results = (
        self.p
        | 'T' >> beam.Create([1, 2, 3]).with_output_types(int)
        | 'Add' >> beam.ParDo(AddWithNum(), 1))

    assert_that(results, equal_to([2, 3, 4]))
    self.p.run()

  def test_simple_type_violation(self):
    self.p._options.view_as(TypeOptions).pipeline_type_check = False

    @with_output_types(str)
    @with_input_types(x=int)
    def int_to_string(x):
      return str(x)

    (self.p
     | 'Create' >> beam.Create(['some_string'])
     | 'ToStr' >> beam.Map(int_to_string))

    with self.assertRaises(TypeCheckError) as e:
      self.p.run()

    self.assertStartswith(
        e.exception.args[0],
        "Runtime type violation detected within ParDo(ToStr): "
        "Type-hint for argument: 'x' violated. "
        "Expected an instance of {}, "
        "instead found some_string, an instance of {}.".format(int, str))

  def test_pipeline_checking_satisfied_but_run_time_types_violate(self):
    self.p._options.view_as(TypeOptions).pipeline_type_check = False

    @with_output_types(Tuple[bool, int])
    @with_input_types(a=int)
    def is_even_as_key(a):
      # Simulate a programming error: this should be `return (a % 2 == 0, a)`.
      # Instead it returns Tuple[int, int].
      return (a % 2, a)

    (self.p
     | 'Nums' >> beam.Create(range(1)).with_output_types(int)
     | 'IsEven' >> beam.Map(is_even_as_key)
     | 'Parity' >> beam.GroupByKey())

    with self.assertRaises(TypeCheckError) as e:
      self.p.run()

    self.assertStartswith(
        e.exception.args[0],
        "Runtime type violation detected within ParDo(IsEven): "
        "Type-hint for return type violated: "
        "Tuple[bool, int] hint type-constraint violated. "
        "The type of element #0 in the passed tuple is incorrect. "
        "Expected an instance of type bool, "
        "instead received an instance of type int. ")

  def test_pipeline_runtime_checking_violation_composite_type_output(self):
    self.p._options.view_as(TypeOptions).pipeline_type_check = False

    # The type hint applied via 'with_output_types()' indicates that the
    # ParDo should return an instance of type int. However, an instance of
    # float will be generated instead.
    with self.assertRaises(TypeCheckError) as e:
      (self.p
       | beam.Create([(1, 3.0)])
       | ('Swap' >> beam.FlatMap(lambda x_y1: [x_y1[0] + x_y1[1]]).
          with_input_types(Tuple[int, float]).with_output_types(int)))
      self.p.run()

    self.assertStartswith(
        e.exception.args[0],
        "Runtime type violation detected within ParDo(Swap): "
        "Type-hint for return type violated. "
        "Expected an instance of {}, "
        "instead found 4.0, an instance of {}.".format(int, float))

  def test_downstream_input_type_hint_error_has_descriptive_error_msg(self):
    @with_input_types(int)
    @with_output_types(int)
    class IntToInt(beam.DoFn):
      def process(self, element, *args, **kwargs):
        yield element

    @with_input_types(str)
    @with_output_types(int)
    class StrToInt(beam.DoFn):
      def process(self, element, *args, **kwargs):
        yield int(element)

    # This will raise a type check error in IntToInt even though the actual
    # type check error won't happen until StrToInt. The user will be told that
    # StrToInt's input type hints were not satisfied while running IntToInt.
    with self.assertRaises(TypeCheckError) as e:
      (self.p
       | beam.Create([9])
       | beam.ParDo(IntToInt())
       | beam.ParDo(StrToInt()))
      self.p.run()

    self.assertStartswith(
        e.exception.args[0],
        "Runtime type violation detected within ParDo(StrToInt): "
        "Type-hint for argument: 'element' violated. "
        "Expected an instance of {}, "
        "instead found 9, an instance of {}. "
        "[while running 'ParDo(IntToInt)']".format(str, int))
def run_pipeline(self, context: JobContext, pipeline: Pipeline):
  logger.info("Run pipeline, context %s", context)
  return pipeline.run()