def main():
    """Read from Pub/Sub, score entities per fixed window, append to BigQuery.

    Uses names defined outside this function: ``options``, ``PUBSUB_TOPIC``,
    ``entity_map``, ``BQ_DATASET`` and ``PROJECT_ID``. The pipeline is run
    when the ``with`` block exits.
    """
    # Disabled BigQuery source, kept for reference:
    # bq_source = BigQuerySource(query="""
    #     SELECT created_at, text
    #     FROM got_sentiment.got_tweets
    #     """,
    #     validate=False, coder=None,
    #     use_standard_sql=True, flatten_results=True,
    #     kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #     with_attributes=False,
    #     timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as pipeline:
        messages = pipeline | 'read_from_topic' >> ReadFromPubSub(
            topic=PUBSUB_TOPIC)
        windowed = messages | 'Window' >> WindowInto(window.FixedWindows(60))
        keyed_values = windowed | 'Emit_needed_values' >> FlatMap(
            emit_values, entity_map)
        combined = keyed_values | 'Combine' >> CombinePerKey(
            EntityScoreCombine())
        stamped = combined | 'Add Window Timestamp' >> beam.ParDo(
            AddWindowTimestampFn())
        rows = stamped | 'FormatForWrite' >> Map(format_for_write)
        results = rows | 'Write' >> WriteToBigQuery(
            'streaming_scores',
            dataset=BQ_DATASET,
            project=PROJECT_ID,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND',
            batch_size=20)
def pipeline_options_local(argv):
    """Create a Pipeline from a PipelineOptions object for local execution.

    Args:
        argv: Command-line flags passed to ``PipelineOptions(flags=argv)``.

    NOTE(review): the ``[START ...]`` / ``[END ...]`` comments look like
    documentation snippet region markers; presumably the code between them is
    extracted verbatim, so keep statement order and text intact -- confirm.
    """
    from apache_beam import Pipeline
    from apache_beam.options.pipeline_options import PipelineOptions

    options = PipelineOptions(flags=argv)

    # [START pipeline_options_define_custom_with_help_and_default]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input',
                                help='Input for the pipeline',
                                default='gs://my-bucket/input')
            parser.add_argument('--output',
                                help='Output for the pipeline',
                                default='gs://my-bucket/output')
    # [END pipeline_options_define_custom_with_help_and_default]

    # Read the custom --input/--output values from the argv-based options
    # before `options` is rebound below.
    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    # [START pipeline_options_local]
    # Create and set your Pipeline Options.
    options = PipelineOptions()
    with Pipeline(options=options) as p:
        # [END pipeline_options_local]

        # `p` is rebound here: the transforms below are applied to the
        # TestPipeline, not to the Pipeline opened by the outer `with`.
        with TestPipeline() as p:  # Use TestPipeline for testing.
            lines = p | beam.io.ReadFromText(my_input)
            lines | beam.io.WriteToText(my_output)
def main(argv=None):
    """Flatten two small inputs, split them into two tagged outputs, print both.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.

    The pipeline is started with ``run()``; this function does not wait for
    it to finish.
    """
    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    first = pipeline | 'Input1' >> beam.Create([1, 2, 3], reshuffle=False)
    second = pipeline | 'Input2' >> beam.Create([4, 5, 6], reshuffle=False)

    merged = (first, second) | 'Flatten' >> beam.Flatten()
    split = beam.ParDo(MultiOutputDoFn()).with_outputs(
        MultiOutputDoFn.OUTPUT_TAG_B, main=MultiOutputDoFn.OUTPUT_TAG_A)
    output_a, output_b = merged | 'Split' >> split

    # IdentityA and IdentityB are to set output types and set right coders for
    # Dataflow Runner. You may see type inference error (BEAM-4132) without
    # them.
    typed_a = output_a | 'IdentityA' >> beam.Map(
        lambda x: x).with_output_types(Tuple[str, int])
    _ = typed_a | 'PrintA' >> beam.ParDo(StatefulPrintDoFn('PrintA'))

    typed_b = output_b | 'IdentityB' >> beam.Map(
        lambda x: x).with_output_types(Tuple[str, int])
    _ = typed_b | 'PrintB' >> beam.ParDo(StatefulPrintDoFn('PrintB'))

    pipeline.run()
def run(argv=None):
    """Run a one-element pipeline that applies ``connect_and_query``.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as pipeline:
        # A single None element is created so the ParDo is invoked once.
        seed = pipeline | beam.Create([None])
        _ = seed | beam.ParDo(connect_and_query)
def run(argv=None):
    """Build and start the streaming Pub/Sub -> inference -> BigQuery pipeline.

    Args:
        argv: Optional argument list. Two positionals are required:
            ``input_topic`` (``projects/<project>/topics/<topic>``) and
            ``output_table`` (exactly two dot-separated parts). The
            ``--model_*`` flags are passed to ``MakeRemoteInferenceDoFn``;
            unrecognised flags are forwarded to ``PipelineOptions``.

    Raises:
        ValueError: If the topic or the table name has an unexpected format.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input_topic', type=str, help="Input Pub/Sub topic name.")
    # The help example matches the two-part check enforced below.
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: dataset.table")
    parser.add_argument(
        '--model_project', type=str, help="Google Project ID with model.")
    parser.add_argument(
        '--model_name',
        type=str,
        help="Name of the Google AI Platform model name.")
    parser.add_argument(
        '--model_region', type=str, help="AI Platform region name.")
    parser.add_argument(
        '--model_version', type=str, help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    # Expected topic shape: projects/<project>/topics/<topic>
    _topic_comp = known_args.input_topic.split('/')
    if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
            or _topic_comp[2] != 'topics'):
        raise ValueError("Input topic name has inappropriate format.")

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Table name has inappropriate format.")

    inf_args = [
        known_args.model_project,
        known_args.model_name,
        known_args.model_region,
        known_args.model_version,
    ]

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)

    _ = (
        p
        | 'read from pub/sub' >> ReadFromPubSub(
            known_args.input_topic).with_output_types(bytes)
        | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
        | 'convert to dict' >> Map(json.loads)
        | 'pre processing' >> PreProcessing()
        | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
        | 'format message' >> Map(formatter)
        | 'write to BQ' >> WriteToBigQuery(
            table=known_args.output_table,
            schema=build_bq_schema(),
            create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=BigQueryDisposition.WRITE_APPEND))

    if os.environ.get('DEPLOY'):
        # p.run() is used instead of `with Pipeline() as p` because the
        # process needs to exit right after submitting the job.
        p.run()
    else:
        p.run().wait_until_finish()
def main(argv=None):
    """Log the values produced by ``GenerateSequence(0, stop=100)``.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.

    The sequence transform is expanded through ``BEAM_IO_EXPANSION_SERVICE``.
    The pipeline is started with ``run()`` without waiting for completion.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    sequence = pipeline | GenerateSequence(
        0, stop=100, expansion_service=BEAM_IO_EXPANSION_SERVICE)
    _ = sequence | Map(lambda x: logging.info(x))

    pipeline.run()
def main(argv=None):
    """Apply the ``Print`` transform to five one-letter strings.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.

    ``Print`` is given the expansion service returned by
    ``expansion_service(options)``.
    """
    options = PipelineOptions(argv)
    pipeline = Pipeline(options=options)

    letters = pipeline | Create(["a", "b", "c", "d", "e"], reshuffle=False)
    _ = letters | Print("hello", expansion_service(options))

    pipeline.run()
def main(argv=None):
    """Exercise chained list side inputs on a three-element collection.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.

    'Output1' pairs every element with the whole input as a list side input.
    'Output2' logs every element together with the 'Output1' results as a
    list side input.
    """
    pipeline = Pipeline(options=PipelineOptions(argv))

    source = pipeline | 'Input' >> beam.Create([1, 2, 3], reshuffle=False)

    paired = source | 'Output1' >> beam.Map(
        lambda x, side: (x, side), AsList(source))

    _ = source | 'Output2' >> beam.Map(
        lambda x, side: logging.info('x: %s, side: %s', x, side),
        AsList(paired))

    pipeline.run()
def main():
    """Translate ``EN_TEXTS`` with ``TranslateDoFn`` and print the results.

    The project is read from ``GoogleCloudOptions``; the language codes come
    from ``SOURCE_LANGUAGE_CODE`` and ``TARGET_LANGUAGE_CODE``.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True
    project = pipeline_options.view_as(GoogleCloudOptions).project

    pipeline = Pipeline(options=pipeline_options)

    translator = TranslateDoFn(
        project, SOURCE_LANGUAGE_CODE, TARGET_LANGUAGE_CODE)
    texts = pipeline | Create(EN_TEXTS)
    translated = texts | ParDo(translator)
    _ = translated | Map(print_translation)

    pipeline.run()
def main():
    """Feed many byte-string elements under a single key into ``BigBagDoFn``.

    One element is created per shard (``NUM_SHARDS``). Each is expanded into
    ``NUM_ELEMENTS_PER_SHARD`` values of ``bytes(ELEMENT_BYTES)``, and all
    values are keyed with the empty string.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    shards = pipeline | Create(list(range(NUM_SHARDS)))
    payloads = shards | FlatMap(
        lambda _: (bytes(ELEMENT_BYTES)
                   for _ in range(NUM_ELEMENTS_PER_SHARD)))
    keyed = payloads | WithKeys('')
    _ = keyed | ParDo(BigBagDoFn())

    pipeline.run()
def main(argv=None):
    """Read from a Kafka topic and log every element.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``;
            ``KafkaReadOptions`` supplies ``bootstrap_servers`` and ``topic``.
    """
    options = PipelineOptions(argv)
    kafka_options = options.view_as(KafkaReadOptions)

    pipeline = Pipeline(options=options)

    consumer_config = {'bootstrap.servers': kafka_options.bootstrap_servers}
    records = pipeline | ReadFromKafka(
        consumer_config=consumer_config, topics=[kafka_options.topic])
    _ = records | Map(lambda x: logging.info('kafka element: %s', x))

    pipeline.run()
def pipeline_options_remote(argv):
    """Create a Pipeline from a PipelineOptions object for remote execution.

    Args:
        argv: Command-line flags passed to ``PipelineOptions(flags=argv)``.

    NOTE(review): the ``[START ...]`` / ``[END ...]`` comments look like
    documentation snippet region markers; presumably the code between them is
    extracted verbatim, so keep statement order and text intact -- confirm.
    """
    from apache_beam import Pipeline
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START pipeline_options_create]
    options = PipelineOptions(flags=argv)
    # [END pipeline_options_create]

    # [START pipeline_options_define_custom]
    class MyOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_argument('--input')
            parser.add_argument('--output')
    # [END pipeline_options_define_custom]

    from apache_beam.options.pipeline_options import GoogleCloudOptions
    from apache_beam.options.pipeline_options import StandardOptions

    # [START pipeline_options_dataflow_service]
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=argv)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'my-project-id'
    google_cloud_options.job_name = 'myjob'
    google_cloud_options.staging_location = 'gs://my-bucket/binaries'
    google_cloud_options.temp_location = 'gs://my-bucket/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    # [END pipeline_options_dataflow_service]

    my_options = options.view_as(MyOptions)
    my_input = my_options.input
    my_output = my_options.output

    # `p` is rebound: the Pipeline built above is discarded and the
    # transforms below run on the TestPipeline instead.
    p = TestPipeline()  # Use TestPipeline for testing.

    lines = p | beam.io.ReadFromText(my_input)
    lines | beam.io.WriteToText(my_output)

    p.run()
def main():
    """Sum the integers from 1 to 100 in a pipeline and log the total."""
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    start = 1
    end = 100

    numbers = pipeline | 'From {} to {}'.format(start, end) >> Create(
        list(range(start, end + 1)))
    total_sum = numbers | 'Sum' >> CombineGlobally(sum)
    _ = total_sum | 'Print' >> ParDo(
        lambda total: logging.info('Sum from 1 to 100 is %s', total))

    pipeline.run()
def main():
    """Batch large elements under one key and log the size of each batch.

    Blocks until the pipeline finishes.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    pipeline = Pipeline(options=pipeline_options)

    indices = pipeline | Create(range(100), reshuffle=True)
    # Each element is 128 KiB according to the original note.
    large = indices | ParDo(make_large_elements)
    keyed = large | WithKeys('')
    # Big batch size with a time-based limit.
    # NOTE(review): the earlier comment called this a "1 minute trigger", but
    # BUFFERING_SECS is 600 -- confirm which one is intended.
    batches = keyed | GroupIntoBatchesWithMultiBags(BATCH_SIZE, BUFFERING_SECS)
    _ = batches | Map(
        lambda kv: logging.info(
            'key: %s, value count: %s', kv[0], len(kv[1])))

    result = pipeline.run()
    result.wait_until_finish()
def main():
    """Translate three English phrases to Japanese and log each pair.

    The project is read from ``GoogleCloudOptions`` and must be set;
    otherwise the assertion below fails.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    assert project is not None, '"project" is not specified.'

    source_code = 'en-US'
    target_code = 'ja'
    texts = ['Hello', 'Thank you', 'Goodbye']

    pipeline = Pipeline(options=pipeline_options)

    phrases = pipeline | 'Texts' >> Create(texts)
    translated = phrases | 'Translate' >> ParDo(
        Translate(project, source_code, target_code))
    _ = translated | 'Print' >> Map(
        lambda pair: logging.info('%s -> %s', pair[0], pair[1]))

    pipeline.run()
def main():
    """Convert the integers 1..10 with ``ToXmlDoFn`` and log each result."""
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    pipeline = Pipeline(options=pipeline_options)

    start = 1
    end = 10

    numbers = pipeline | 'From {} to {}'.format(start, end) >> Create(
        list(range(start, end + 1)))
    documents = numbers | 'ToXml' >> ParDo(ToXmlDoFn())
    # If a job finishes too quickly, worker VMs can be shut down before they
    # send logs in local files to Cloud Logging. Adding a 30s sleep avoids
    # this.
    delayed = documents | 'Sleep30s' >> ParDo(Sleep(30))
    _ = delayed | 'Print' >> ParDo(lambda xml: logging.info(xml))

    pipeline.run()
def run(input_topic, num_shards, window_size):
    """Read Pub/Sub messages, group them per fixed window, write them to GCS.

    Args:
        input_topic: Pub/Sub topic passed to ``ReadFromPubSub``.
        num_shards: Forwarded to ``GroupMessagesByFixedWindows``.
        window_size: Forwarded to ``GroupMessagesByFixedWindows``.

    NOTE(review): ``pipeline_args`` is not a parameter; it is presumably a
    module-level name -- confirm.
    """
    # Set `save_main_session` to True so DoFns can access globally imported
    # modules.
    base_options = PipelineOptions(
        pipeline_args, streaming=True, save_main_session=True)
    custom_options = base_options.view_as(CustomPipelineOptions)

    with Pipeline(options=custom_options) as pipeline:
        # Because `timestamp_attribute` is unspecified in `ReadFromPubSub`,
        # Beam binds the publish time returned by the Pub/Sub server for each
        # message to the element's timestamp parameter, accessible via
        # `DoFn.TimestampParam`.
        # https://beam.apache.org/releases/pydoc/current/apache_beam.io.gcp.pubsub.html#apache_beam.io.gcp.pubsub.ReadFromPubSub
        messages = pipeline | "Read from Pub/Sub" >> io.ReadFromPubSub(
            topic=input_topic)
        grouped = messages | "Window into" >> GroupMessagesByFixedWindows(
            window_size, num_shards)
        _ = grouped | "Write to GCS" >> ParDo(
            WriteToGCS(custom_options.output_path))
def main(argv=None):
    """Read Pub/Sub messages through the external transform and log them.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``;
            ``PubSubTopicOptions`` supplies the topic.
    """
    options = PipelineOptions(argv)
    topic = options.view_as(PubSubTopicOptions).topic

    pipeline = Pipeline(options=options)

    # This is an external transform
    # `apache_beam.io.external.gcp.pubsub.ReadFromPubSub`. This is different
    # from `apache_beam.io.gcp.pubsub.ReadFromPubSub`, which is the native
    # transform used for most cases.
    #
    # If you set expansion_service as BeamJarExpansionService(
    # 'sdks:java:io:google-cloud-platform:expansion-service:shadowJar'), it
    # will fail as the beam jar has no dependency for DirectRunner. As a
    # workaround, specify a custom expansion service jar in this project.
    messages = pipeline | ReadFromPubSub(
        topic=topic,
        with_attributes=True,
        expansion_service=expansion_service(options))
    _ = messages | Map(lambda message: logging.info("message: %s", message))

    pipeline.run()
def main():
    """Generate integers for each input and write them to sharded text files.

    ``_Options`` supplies ``inputs``, ``output_prefix`` and ``shards``. When
    ``output_prefix`` is unset, ``<temp_location>/output`` is used.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    opt = pipeline_options.view_as(_Options)
    inputs = opt.inputs
    output_prefix = opt.output_prefix or os.path.join(
        pipeline_options.view_as(GoogleCloudOptions).temp_location, 'output')
    shards = opt.shards

    pipeline = Pipeline(options=pipeline_options)

    def generate(n):
        # Emit the n-th consecutive block of _ELEMENTS_PER_INPUT integers.
        first = n * _ELEMENTS_PER_INPUT
        last = (n + 1) * _ELEMENTS_PER_INPUT
        for value in range(first, last):
            yield value

    seeds = pipeline | Create(range(inputs))
    values = seeds | ParDo(generate).with_output_types(int)
    _ = values | WriteToText(output_prefix, num_shards=shards)

    pipeline.run()
def main():
    """Group large elements under one key with a count-or-time trigger.

    The global window fires repeatedly after ``BATCH_SIZE`` elements or
    ``BUFFERING_SECS`` of processing time, whichever comes first, discarding
    fired panes. Blocks until the pipeline finishes.
    """
    pipeline_options = PipelineOptions()
    pipeline_options.view_as(SetupOptions).save_main_session = True

    BATCH_SIZE = 1000000
    BUFFERING_SECS = 600

    pipeline = Pipeline(options=pipeline_options)

    batch_trigger = Repeatedly(
        AfterAny(AfterCount(BATCH_SIZE), AfterProcessingTime(BUFFERING_SECS)))

    indices = pipeline | Create(range(100), reshuffle=True)
    # Each element is 128 KiB according to the original note.
    large = indices | ParDo(make_large_elements)
    keyed = large | WithKeys('')
    windowed = keyed | WindowInto(
        GlobalWindows(),
        trigger=batch_trigger,
        accumulation_mode=AccumulationMode.DISCARDING)
    grouped = windowed | GroupByKey()
    _ = grouped | Map(
        lambda kv: logging.info(
            'key: %s, value count: %s', kv[0], len(kv[1])))

    result = pipeline.run()
    result.wait_until_finish()
def test_metrics(self):
    """Run a simple DoFn that increments a counter, and verify that its
    expected value is written to a temporary file by the FileReporter.

    The metrics file at ``self.test_metrics_path`` must contain exactly one
    line mentioning the counter, and that line must contain
    ``"<counter_name>: <n>"``.
    """
    counter_name = 'elem_counter'

    class DoFn(beam.DoFn):
        def __init__(self):
            self.counter = Metrics.counter(self.__class__, counter_name)
            logging.info('counter: %s', self.counter.metric_name)

        def process(self, v):
            self.counter.inc()

    options = self.create_options()
    # Test only supports parallelism of 1
    options._all_options['parallelism'] = 1

    n = 100
    with Pipeline(self.get_runner(), options) as p:
        # pylint: disable=expression-not-assigned
        (p | beam.Create(list(range(n))) | beam.ParDo(DoFn()))

    with open(self.test_metrics_path, 'r') as f:
        lines = [line for line in f if counter_name in line]

    self.assertEqual(
        len(lines),
        1,
        msg='Expected 1 line matching "{}":\n{}'.format(
            counter_name, '\n'.join(lines)))

    line = lines[0]
    # The previous check asserted the truthiness of a formatted, non-empty
    # string and therefore could never fail; check the line content instead.
    self.assertIn(
        '{}: {}'.format(counter_name, n),
        line,
        msg='Failed to find expected counter {} in line {}'.format(
            counter_name, line))
def _create_pipeline(self, options):
    """Return a new ``Pipeline`` built from the given options."""
    pipeline = Pipeline(options=options)
    return pipeline
def run(argv=None):
    """Run a pipeline that applies ``MysqlDoFn`` to a single impulse element.

    Args:
        argv: Optional command-line flags forwarded to ``PipelineOptions``.
    """
    pipeline_options = PipelineOptions(argv)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as pipeline:
        impulse = pipeline | beam.Impulse()
        _ = impulse | beam.ParDo(MysqlDoFn())
def setUp(self):
    """Create ``self.p`` with the performance runtime type check enabled.

    The pipeline-construction type check is turned off.
    """
    type_check_options = PipelineOptions(
        performance_runtime_type_check=True, pipeline_type_check=False)
    self.p = Pipeline(options=type_check_options)
def run():
    """Entry point: define and run the patent onboarding pipeline.

    Database connection settings and the application-numbers file path are
    parsed from the command line; unrecognised flags are forwarded to
    ``PipelineOptions``.

    Returns:
        The PCollection produced by the 'UpsertPatentsToDB' step.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--db-user',
        dest='db_user',
        help='Username for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--db-password',
        dest='db_password',
        help='Password for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--db-name',
        dest='db_name',
        help='Patents database name.',
        required=True
    )
    parser.add_argument(
        '--db-host',
        dest='db_host',
        help='Hostname for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--db-port',
        dest='db_port',
        help='Port number for Postgresql instance.',
        required=True
    )
    parser.add_argument(
        '--application-numbers-filepath',
        dest='application_numbers_filepath',
        help='Local or ``gs://`` path to file with application numbers.',
        required=True
    )

    known_args, pipeline_args = parser.parse_known_args()

    # Never write the database password to the logs: redact it first.
    loggable_args = {
        name: '***' if name == 'db_password' else value
        for name, value in vars(known_args).items()
    }
    logging.info('Starting onboarding pipeline (args=%s)', loggable_args)

    # We use the `save_main_session` option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with Pipeline(options=pipeline_options) as pipeline:
        # Read the text file[pattern] into a PCollection.
        lines = pipeline | 'ReadTextFile' >> ReadFromText(
            known_args.application_numbers_filepath
        )

        # Collapse the PCollection of text lines into a PCollection holding
        # one element: the list of application numbers.
        app_numbers = lines | 'CombineApplicationNumbers' >> ToList()

        publications = (
            app_numbers | 'QueryPublications' >> ParDo(QueryPublications())
        )

        patents = (
            publications | 'MapPublicationsToPatents' >> Map(
                lambda pb: {
                    'application_number': pb.application_number,
                    'application_kind': pb.application_kind,
                    'grant_date': datetime.datetime.fromtimestamp(
                        pb.grant_date),
                },
            )
        )

        # Collapse the PCollection of patents into a PCollection holding one
        # element: the list of patents.
        batch = patents | 'CombinePatentsToBatch' >> ToList()

        result = batch | 'UpsertPatentsToDB' >> ParDo(
            UpsertPatentsToDB(
                known_args.db_user,
                known_args.db_password,
                known_args.db_name,
                known_args.db_host,
                known_args.db_port,
            ),
        )

    return result
def test_metrics(self):
    """Run a stateful DoFn that increments a counter and check the metrics.

    The expected counter value and state-cache metric lines (which differ
    between streaming and batch) must all appear in the file at
    ``self.test_metrics_path``.
    """
    counter_name = 'elem_counter'
    state_spec = userstate.BagStateSpec('state', VarIntCoder())

    class DoFn(beam.DoFn):
        def __init__(self):
            self.counter = Metrics.counter(self.__class__, counter_name)
            _LOGGER.info('counter: %s' % self.counter.metric_name)

        def process(self, kv, state=beam.DoFn.StateParam(state_spec)):
            # Trigger materialization
            list(state.read())
            state.add(1)
            self.counter.inc()

    options = self.create_options()
    raw_options = options._all_options
    # Test only supports parallelism of 1
    raw_options['parallelism'] = 1
    # Create multiple bundles to test cache metrics
    raw_options['max_bundle_size'] = 10
    raw_options['max_bundle_time_millis'] = 95130590130

    experiments = options.view_as(DebugOptions).experiments or []
    experiments.append('state_cache_size=123')
    options.view_as(DebugOptions).experiments = experiments

    with Pipeline(self.get_runner(), options) as p:
        # pylint: disable=expression-not-assigned
        (
            p
            | "create" >> beam.Create(list(range(0, 110)))
            | "mapper" >> beam.Map(lambda x: (x % 10, 'val'))
            | "stateful" >> beam.ParDo(DoFn()))

    if streaming:
        cache_metrics = [
            # Gauges for the last finished bundle
            'stateful.beam.metric:statecache:capacity: 123',
            # These are off by 10 because the first bundle contains all the
            # keys once. Caching is only initialized after the first bundle.
            # Caching depends on the cache token which is lazily initialized
            # by the Runner's StateRequestHandlers.
            'stateful.beam.metric:statecache:size: 10',
            'stateful.beam.metric:statecache:get: 10',
            'stateful.beam.metric:statecache:miss: 0',
            'stateful.beam.metric:statecache:hit: 10',
            'stateful.beam.metric:statecache:put: 0',
            'stateful.beam.metric:statecache:extend: 10',
            'stateful.beam.metric:statecache:evict: 0',
            # Counters
            # (total of get/hit will be off by 10 due to the caching
            # only getting initialized after the first bundle.
            # Caching depends on the cache token which is lazily
            # initialized by the Runner's StateRequestHandlers).
            'stateful.beam.metric:statecache:get_total: 100',
            'stateful.beam.metric:statecache:miss_total: 10',
            'stateful.beam.metric:statecache:hit_total: 90',
            'stateful.beam.metric:statecache:put_total: 10',
            'stateful.beam.metric:statecache:extend_total: 100',
            'stateful.beam.metric:statecache:evict_total: 0',
        ]
    else:
        # Batch has a different processing model. All values for
        # a key are processed at once.
        cache_metrics = [
            # Gauges
            'stateful).beam.metric:statecache:capacity: 123',
            # For the first key, the cache token will not be set yet.
            # It's lazily initialized after first access in
            # StateRequestHandlers
            'stateful).beam.metric:statecache:size: 9',
            # We have 11 here because there are 110 / 10 elements per key
            'stateful).beam.metric:statecache:get: 11',
            'stateful).beam.metric:statecache:miss: 1',
            'stateful).beam.metric:statecache:hit: 10',
            # State is flushed back once per key
            'stateful).beam.metric:statecache:put: 1',
            'stateful).beam.metric:statecache:extend: 1',
            'stateful).beam.metric:statecache:evict: 0',
            # Counters
            'stateful).beam.metric:statecache:get_total: 99',
            'stateful).beam.metric:statecache:miss_total: 9',
            'stateful).beam.metric:statecache:hit_total: 90',
            'stateful).beam.metric:statecache:put_total: 9',
            'stateful).beam.metric:statecache:extend_total: 9',
            'stateful).beam.metric:statecache:evict_total: 0',
        ]

    lines_expected = {'counter: 110'}
    lines_expected.update(cache_metrics)

    # Collect every expected metric string that occurs in some reported line.
    lines_actual = set()
    with open(self.test_metrics_path, 'r') as f:
        for line in f:
            lines_actual.update(
                metric_str for metric_str in lines_expected
                if metric_str in line)

    self.assertSetEqual(lines_actual, lines_expected)
import apache_beam as beam
from apache_beam import Pipeline
from apache_beam.options.pipeline_options import PipelineOptions
from pipelines.bitcoin.service.blocks import ReadBitcoinBlocks
import time

# Script: read blocks with ReadBitcoinBlocks(100), write them to "test.txt",
# and report the elapsed wall-clock time.
start_time = time.time()
print("Reading Blocks")

with Pipeline(options=PipelineOptions()) as pipeline:
    blocks = pipeline | 'GetBlocks' >> ReadBitcoinBlocks(100)
    _ = blocks | "WriteToText" >> beam.io.textio.WriteToText("test.txt")

# The pipeline runs when the `with` block exits, so the timing is taken here.
elapsed_seconds = time.time() - start_time
print("Writing Blocks done. Total Time " + str(elapsed_seconds) + " seconds")
def new_pipeline(self, context: JobContext) -> Pipeline:
    """Build a ``Pipeline`` for ``context``.

    The options come from ``self.create_pipeline_options(context)``.
    """
    logger.debug("Create new pipline for context %s", context)
    pipeline_options = self.create_pipeline_options(context)
    pipeline = Pipeline(options=pipeline_options)
    return pipeline