def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    window_duration = 1 * 60  # 1 minute windows.
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = None
    if not known_args.topic:
        events = (p
                  | 'read' >> ReadFromText(known_args.input)
                  | 'parse' >> beam.FlatMap(ParseEventFn())
                  | 'add_event_timestamps' >> beam.Map(
                      lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute='timestamp_ms')
                  | 'decode' >> beam.ParDo(ParseEventFn()))

    _ = (events
         | 'windowed_team_score' >> WindowedTeamScore(window_duration)
         | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
         | beam.io.WriteToBigQuery(known_args.output_tablename,
                                   known_args.output_dataset, project,
                                   SCHEMA))
    p.run().wait_until_finish()
def run(argv=None, save_main_session=True):
    """Process CLI args and run the pipeline.

    :param argv: command-line arguments
    :param save_main_session: whether to pickle the main session for workers
    :return: None
    """
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    job_options = pipeline_options.view_as(JobOptions)

    logging.info("-----------------------------------------------------------")
    logging.info("          Streaming with Pub/Sub emulator                   ")
    logging.info("-----------------------------------------------------------")

    source = ReadFromPubSub(subscription=str(job_options.input))

    # STREAMING BEAM: add the necessary pipeline stages, along with whatever
    # functions you require, in this file.
    p = beam.Pipeline(options=pipeline_options)
    lines = (p
             | "read" >> source
             | beam.Map(print))
    result = p.run()
    result.wait_until_finish()
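# A minimal sketch for the "STREAMING BEAM" section above, assuming the
# Pub/Sub emulator is running locally and PUBSUB_EMULATOR_HOST (e.g.
# "localhost:8085") is exported so the Pub/Sub client connects to the
# emulator rather than the real service. These stages only decode and log
# each message; they are an illustration, not the intended solution:
#
#     lines = (p
#              | "read" >> source
#              | "decode" >> beam.Map(lambda data: data.decode("utf-8"))
#              | "log" >> beam.Map(logging.info))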
def run(pipeline_args):
    logging.basicConfig(format="%(asctime)s - %(message)s",
                        stream=sys.stdout,
                        level=logging.INFO,
                        datefmt="%Y-%m-%d %H:%M.%S")
    logging.getLogger().setLevel(logging.INFO)

    options = PipelineOptions(pipeline_args)
    user_options = options.view_as(UserOptions)
    standard_options = options.view_as(StandardOptions)
    setup_options = options.view_as(SetupOptions)
    standard_options.streaming = True
    setup_options.save_main_session = True

    logging.info("Start pipeline")
    with beam.Pipeline(options=options) as p:
        (p
         | 'read pub/sub topic' >> ReadFromPubSub(
             subscription=user_options.subscription.get(),
             with_attributes=False)
         | 'Parse JSON' >> beam.Map(json.loads)
         | 'Add timestamps' >> beam.Map(
             lambda x: TimestampedValue(x, x["timestamp"]))
         | 'Keyed on key attribute' >> beam.Map(
             lambda x: (x["key"], x["data"]))
         | 'Setup the timer' >> beam.ParDo(TimerExample()))
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    job_options = pipeline_options.view_as(JobOptions)

    p = beam.Pipeline(options=pipeline_options)
    schema = parse_schema(raw_schema)

    logging.info("-----------------------------------------------------------")
    logging.info("          Dataflow AVRO Streaming with Pub/Sub              ")
    logging.info("-----------------------------------------------------------")

    avroRW = avroReadWrite(schema)
    source = ReadFromPubSub(subscription=str(job_options.input))
    sink = WriteToPubSub(str(job_options.output))

    lines = (p
             | "read" >> source
             | "deserialize" >> beam.Map(lambda x: avroRW.deserialize(x))
             | "process" >> beam.ParDo(TransformerDoFn(_schema=schema))
             | "serialize" >> beam.Map(lambda x: avroRW.serialize(x))
             | "write" >> sink)
    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic', type=str,
                        help="Input Pub/Sub topic name.")
    parser.add_argument('output_table', type=str,
                        help="Output BigQuery table name. "
                             "Example: dataset.table_name")
    parser.add_argument('--model_project', type=str,
                        help="Google Cloud project ID that hosts the model.")
    parser.add_argument('--model_name', type=str,
                        help="Name of the AI Platform model.")
    parser.add_argument('--model_region', type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version', type=str,
                        help="AI Platform model version.")
    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
            or _topic_comp[2] != 'topics'):
        raise ValueError("Pub/Sub topic name has an inappropriate format.")
    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Table name has an inappropriate format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p
         | 'read from pub/sub' >> ReadFromPubSub(
             known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))

    if os.environ.get('DEPLOY'):
        # Use p.run() rather than the `with Pipeline() as p` context manager
        # so the process can exit right after submitting the streaming job.
        p.run()
    else:
        p.run().wait_until_finish()
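# Hypothetical invocation of the pipeline above; every value below is a
# placeholder, not taken from the original source:
#
#     python pipeline.py \
#         projects/my-project/topics/events \
#         my_dataset.predictions \
#         --model_project=my-project \
#         --model_name=my_model \
#         --model_region=us-central1 \
#         --model_version=v1 \
#         --runner=DataflowRunner \
#         --project=my-project \
#         --temp_location=gs://my-bucket/tmp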
def run(argv=None):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser(
        description='Run Apache Beam to process the logs')
    parser.add_argument('--input', dest='input',
                        help='Input file to process')
    parser.add_argument('--output', dest='output',
                        help='Output file to write results to')
    parser.add_argument(
        '--input_subscription', dest='input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    parser.add_argument(
        '--output_table', dest='output_table',
        help=('BigQuery table to write results to, with the form '
              '<PROJECT>:<DATASET>.<TABLE>'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    print('pipeline options:', pipeline_options)

    # Specification for the destination table in BigQuery.
    table_spec = known_args.output_table
    table_schema = ('host:STRING, utc_timestamp:TIMESTAMP, action:STRING, '
                    'uri:STRING, protocol:STRING, status:STRING, '
                    'size:INTEGER')

    with beam.Pipeline(options=pipeline_options) as p:
        # Read from Pub/Sub if a subscription is given, otherwise read the
        # text file[pattern] into a PCollection.
        if known_args.input_subscription:
            lines = (p
                     | ReadFromPubSub(
                         subscription=known_args.input_subscription
                     ).with_output_types(bytes))
        else:
            lines = (p
                     | ReadFromText(known_args.input,
                                    coder=coders.BytesCoder()))

        output = (lines
                  | 'parse_filter' >> beam.ParDo(ParseAndFilterDoFn()))
        # | 'parse' >> (beam.Map(parse_one_record)))
        # output | WriteToText(known_args.output)

        output | WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
def run(argv=None):
    """Pipeline for reading data from a PubSub topic, redacting the data
    using Cloud DLP and writing the results to BigQuery."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output', dest='output',
        help='BigQuery output dataset and table name in the format '
             'dataset.tablename')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        if 'streaming' in p.options.display_data():
            # Streaming mode: read messages from the Pub/Sub topic.
            lines = (p
                     | 'ReadFromPubSub' >> ReadFromPubSub(
                         topic=known_args.input).with_output_types(bytes)
                     | 'DecodeMessage' >> beam.Map(
                         lambda x: x.decode('utf-8'))
                     | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))
        else:
            # Batch mode: read in the CSV file.
            lines = (p
                     | 'ReadFromGCS' >> ReadFromText(known_args.input)
                     | 'ParseFileFn' >> beam.ParDo(ParseFileFn()))

        # Redact PII from the 'text' column.
        redacted_rows = (
            lines
            | 'IdentifyAndRedactText' >> IdentifyAndRedactText(
                p.options.display_data()['project'], ['ALL_BASIC']))

        # Format rows and write to BigQuery.
        (redacted_rows
         | 'MapToTableRows' >> beam.Map(lambda row: {
             'id': row['id'],
             'text': row['text']
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, text:STRING',
             project=p.options.display_data()['project'],
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    job_options = pipeline_options.view_as(JobOptions)
    start = time.time()

    # The `with` block runs the pipeline and waits for it to finish on exit,
    # so no explicit p.run() is needed.
    with beam.Pipeline(options=pipeline_options) as p:
        ten_second_combine_fn = (
            utils.get_CalculateFeaturesPerFeatureCombineFn('10'))
        thirty_second_combine_fn = (
            utils.get_CalculateFeaturesPerFeatureCombineFn('30'))
        schema = parse_schema(raw_schema)
        avroRW = avroReadWrite(schema)
        source = ReadFromPubSub(subscription=str(job_options.input))
        sink = WriteToPubSub(str(job_options.output))

        lines = (p
                 | "read" >> source
                 | "deserialize" >> beam.Map(lambda x: avroRW.deserialize(x))
                 | "process" >> beam.ParDo(
                     TransformerDoFn(_schema=schema, root=start))
                 | "key" >> beam.Map(lambda e: (e['gh6'], e)))

        fixed_windows = (
            lines
            | "10 second window" >> beam.WindowInto(
                beam.window.FixedWindows(10),
                trigger=AfterWatermark(),
                accumulation_mode=(
                    beam.transforms.trigger.AccumulationMode.DISCARDING))
            | "Combine 10 second fixed windows" >> beam.CombinePerKey(
                ten_second_combine_fn))

        windows = (
            lines
            | beam.WindowInto(
                beam.window.SlidingWindows(30, 10),
                trigger=AfterWatermark(),
                accumulation_mode=(
                    beam.transforms.trigger.AccumulationMode.DISCARDING))
            | "Combine 30 second windows" >> beam.CombinePerKey(
                thirty_second_combine_fn))

        run_models = (
            fixed_windows
            | beam.ParDo(process_with_side_input,
                         side=beam.pvalue.AsDict(windows))
            | beam.GroupByKey()
            | beam.ParDo(RunModel()))
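# The real process_with_side_input is defined elsewhere; the sketch below
# only illustrates the shape implied by the AsDict side input above: the
# main input carries (key, 10-second-features) pairs, and `side` is a dict
# of the 30-second sliding-window results keyed by the same 'gh6' value.
def process_with_side_input(element, side):
    """Pair each 10-second window result with its 30-second counterpart."""
    key, ten_second_features = element
    thirty_second_features = side.get(key)  # None if no 30s window has fired
    yield key, {'10s': ten_second_features, '30s': thirty_second_features}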
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--schema_registry',
        dest='schema_registry',
        default='http://127.0.0.1:8081',
        help='Schema registry endpoint. Defaults to local endpoint.')
    parser.add_argument('--failed-bq-inserts',
                        dest='failed_bq_inserts',
                        required=True,
                        help='Bucket for writing failed inserts')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--job_name=dbz-test-example',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True

    project_id = 'crafty-apex-264713'
    kafka_topic = 'dbserver1.inventory.customers'
    pubsub_topic = f'projects/{project_id}/topics/{kafka_topic}'

    with beam.Pipeline(options=pipeline_options) as p:
        bq = (
            p
            | 'Read from PubSub' >> ReadFromPubSub(topic=pubsub_topic)
            | '2 Second Window' >> beam.WindowInto(window.FixedWindows(2))
            | 'Avro to Row' >> beam.FlatMap(
                avro_to_row(known_args.schema_registry))
            # | 'Write to File' >> beam.io.WriteToText('args.output')
            | 'Write to BigQuery' >> WriteToBigQuery(
                'crafty-apex-264713:inventory.customers',
                schema='id:INT64,'
                       'first_name:STRING,'
                       'last_name:STRING,'
                       'email:STRING,'
                       '__op:STRING,'
                       '__source_ts_ms:INT64,'
                       '__lsn:INT64',
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_APPEND))

    # Couldn't get this to run on Dataflow - it produces a job graph that is
    # not updatable. On the direct runner it doesn't emit any errors either.
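# Hedged sketch: the --failed-bq-inserts flag above is parsed but never
# used. With streaming inserts, the WriteToBigQuery result exposes rows
# that BigQuery rejected under the FAILED_ROWS tag (given a retry strategy
# such as RETRY_ON_TRANSIENT_ERROR), so the flag could be wired up inside
# the `with` block, right after the write step:
#
#     from apache_beam.io import fileio
#     from apache_beam.io.gcp.bigquery import BigQueryWriteFn
#
#     _ = (bq[BigQueryWriteFn.FAILED_ROWS]
#          | 'WindowFailures' >> beam.WindowInto(window.FixedWindows(60))
#          | 'FormatFailures' >> beam.Map(json.dumps)
#          | 'WriteFailures' >> fileio.WriteToFiles(
#              known_args.failed_bq_inserts))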
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = None
    if not known_args.topic:
        events = (p
                  | 'read' >> ReadFromText(known_args.input)
                  | 'parse' >> beam.FlatMap(ParseEventFn())
                  | 'add_event_timestamps' >> beam.Map(
                      lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute='timestamp_ms')
                  | 'parse' >> beam.ParDo(ParseEventFn()))

    # [START EXERCISE 6]
    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         # Extract sessions of user data, using known_args.session_gap as
         # the gap duration.
         # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
         | 'sessionize' >> ChangeMe()
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         # Re-window into fixed windows of size user_activity_window in
         # order to compute the mean session duration for that window of
         # activity.
         | 'window_of_sessions' >> ChangeMe()
         | 'session_mean' >> ChangeMe()
         # [END EXERCISE 6]
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA))
    p.run().wait_until_finish()
def main(pipeline_options, args):
    pipe = beam.Pipeline(options=pipeline_options)

    import google.auth
    _, project_id = google.auth.default()
    subscriber = pubsub_v1.SubscriberClient()
    subscription_path = subscriber.subscription_path(
        project_id, 'pubsub-test')
    # Delete any stale subscription from a previous run before recreating it.
    try:
        subscriber.delete_subscription(subscription_path)
    except Exception:
        pass
    subscription = get_subscription(INPUT_TOPIC, 'pubsub-test')

    (pipe
     | 'PubSubInflow' >> ReadFromPubSub(
         subscription=subscription,
         with_attributes=True,
         timestamp_attribute='timestamp',
     )
     | 'Inspect' >> beam.ParDo(TSInspect())
     | Log(color='cyan'))

    result = pipe.run()  # type: PipelineResult
    time.sleep(5)
    while result.state != PipelineState.RUNNING:
        time.sleep(10)

    print()
    cprint('Starting streaming graph forever. Kill with ctrl+c', 'red',
           attrs=['bold'])
    print()
    send()
    try:
        result.wait_until_finish()
    except KeyboardInterrupt:
        print()
        cprint('Shutting down...', 'yellow')
        result.cancel()
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage
    bucket, redacting the data using Cloud DLP and writing the results to
    BigQuery."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output', dest='output',
        help='BigQuery output dataset and table name in the format '
             'dataset.tablename')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read messages from Pub/Sub and parse them into (key, value) pairs.
        lines = (p
                 | 'ReadFromPubSub' >> ReadFromPubSub(
                     topic=known_args.input).with_output_types(bytes)
                 | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
                 | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))

        # Sum the values per key over 30-second fixed windows.
        windows = (lines
                   | 'WindowInto' >> beam.WindowInto(FixedWindows(30, 0))
                   | 'SumValues' >> beam.CombinePerKey(sum))

        # Format rows and write to BigQuery.
        (windows
         | 'ConvertToDictionary' >> beam.Map(lambda row: {
             'id': row[0],
             'total': row[1]
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, total:INTEGER',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage
    bucket, redacting the data using Cloud DLP and writing the results to
    BigQuery."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output', dest='output',
        help='BigQuery output dataset and table name in the format '
             'dataset.tablename')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = (p
                 # 1. Read in the file from PubSub.
                 | 'ReadFromPubSub' >> ReadFromPubSub()
                 # 2. Process the JSON message from PubSub.
                 | 'ParseMessage'
                 )

        average = (lines
                   | 'ApplyWindow'
                   # 3. For each key, sum up the values.
                   )

        (lines
         # 4. Format the results as Python dictionaries for writing to
         #    BigQuery.
         | 'ConvertToDictionary'
         # 5. Write the output to BigQuery.
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, total:INTEGER',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
def main():
    df_options = None
    with open("config.json") as f:
        df_options = load(f)

    sink = bq_sink_creation(
        df_options.get("big-query").get("project"),
        df_options.get("big-query").get("output_table"))
    print(sink)

    print("Creating Pipeline Object...")
    pipeline_options = PipelineOptions(flags=[], **df_options["data-flow"])
    # Needed for streaming:
    # pipeline_options.view_as(StandardOptions).streaming = True
    pipeline = beam.Pipeline(options=pipeline_options)

    print("Executing pipeline:")
    subscription_name = "projects/{0}/subscriptions/{1}".format(
        df_options.get("PROJECT_ID"), df_options.get("PS_SUBSCRIPTION_NAME"))
    print(subscription_name)

    (pipeline
     | "Reading Pub/Sub" >> ReadFromPubSub(
         subscription=subscription_name, id_label="id")
     | "json conv" >> beam.Map(parse_pubsub)
     | "extract data" >> beam.Map(lambda x: {
         "timeCreated": x.get("timeCreated"),
         "bucket": x.get("bucket"),
         "name": x.get("name"),
     })
     | "Writing to BigQuery" >> beam.io.Write(sink))
    # | "Writing to GCS"
    # >> beam.io.Write(beam.io.WriteToText(
    #     "gs://etl_accelerator/results/pubsub_data"
    # ))

    pipeline.run().wait_until_finish(duration=600000)  # 10 minutes
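# For reference, the config.json consumed above implies roughly this shape
# (keys inferred from the .get() calls; all values are placeholders):
#
#     {
#       "big-query": {"project": "my-project",
#                     "output_table": "dataset.table"},
#       "data-flow": {"runner": "DataflowRunner",
#                     "project": "my-project",
#                     "temp_location": "gs://my-bucket/tmp"},
#       "PROJECT_ID": "my-project",
#       "PS_SUBSCRIPTION_NAME": "my-subscription"
#     }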
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = None
    if not known_args.topic:
        events = (p
                  | 'read' >> ReadFromText(known_args.input)
                  | 'parse' >> beam.FlatMap(ParseEventFn())
                  | 'add_event_timestamps' >> beam.Map(
                      lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute='timestamp_ms')
                  | 'parse' >> beam.ParDo(ParseEventFn()))

    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         | 'sessionize' >> beam.WindowInto(
             window.Sessions(float(known_args.session_gap)))
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         | 'window_of_sessions' >> beam.WindowInto(
             window.FixedWindows(int(known_args.user_activity_window)))
         | 'session_mean' >> beam.CombineGlobally(
             beam.combiners.MeanCombineFn()).without_defaults()
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA))
    p.run().wait_until_finish()
def test(self):
    _ = (
        self.pipeline
        | 'Read from pubsub' >> ReadFromPubSub(
            subscription=self.read_sub_name,
            with_attributes=True,
            id_label='id',
        )
        | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterCount(self.num_of_messages)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'Count messages' >> beam.CombineGlobally(
            beam.combiners.CountCombineFn()).without_defaults()
            .with_output_types(int)
        | 'Convert to bytes' >> beam.Map(
            lambda count: str(count).encode('utf-8'))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(
            self.matcher_topic_name))
def run(argv=None):
    """This function contains the pipeline logic."""
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project=' + project_id,
        '--job_name=streampipeline',
        '--staging_location=gs://' + project_id + '-dataflow/staging',
        '--temp_location=gs://' + project_id + '-dataflow/temp',
        '--region=europe-west1',
        '--streaming',
    ])
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    subscription = ('projects/' + project_id +
                    '/subscriptions/process_stream_data')
    dataset = 'my_dataset'
    table = 'stream_data'

    # General format:
    # (p | 'name of input step' >> InputClass(args)
    #    | 'name of processing step' >> ProcessClass(args)
    #    | 'name of output step' >> OutputClass(args))
    #
    # Test pipeline input:
    # (p | '...' >> ReadFromPubSub(subscription=subscription)
    #    | '...' >> beam.ParDo(PrintElement()))
    #
    # Write to BigQuery (one possible completion is sketched below this
    # function):
    # (p | '...' >> ReadFromPubSub(subscription=subscription)
    #    | '...' >> beam.ParDo(FormatStreamData())
    #    | '...' >> WriteToBigQuery(dataset=dataset, table=table))
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'stream_data_ingestion' >> ReadFromPubSub(
             subscription=subscription)
         | '...')
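# A sketch of one possible completion of the '...' stage above, following
# the commented "Write to BigQuery" template (FormatStreamData and the
# destination schema are assumed to be defined elsewhere in the exercise):
#
#     with beam.Pipeline(options=pipeline_options) as p:
#         (p
#          | 'stream_data_ingestion' >> ReadFromPubSub(
#              subscription=subscription)
#          | 'format_stream_data' >> beam.ParDo(FormatStreamData())
#          | 'write_to_bigquery' >> WriteToBigQuery(table=table,
#                                                   dataset=dataset))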
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    events = None
    if not known_args.topic or not known_args.play_topic:
        logging.fatal('topic and play_topic are required.')

    events = (p
              | 'read_events' >> ReadFromPubSub(
                  topic=known_args.topic,
                  timestamp_attribute='timestamp_ms')
              | 'parse_events' >> beam.ParDo(ParseEventFn()))
    play_events = (
        p
        | 'read_play_events' >> ReadFromPubSub(
            topic=known_args.play_topic, timestamp_attribute='timestamp_ms')
        | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))

    sessionized_events = (
        events
        | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))
    sessionized_plays = (
        play_events
        | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_plays' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    per_user_latency = ({'plays': sessionized_plays,
                         'events': sessionized_events}
                        | 'cbk' >> beam.CoGroupByKey()
                        | 'compute_latency' >> beam.ParDo(ComputeLatency()))

    mean_latency = (
        per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).with_fanout(16)
            .as_singleton_view())

    _ = (per_user_latency
         | 'detect_bad_users' >> beam.ParDo(DetectBadUsers(),
                                            mean_latency=mean_latency)
         | 'filter_duplicates' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.AfterCount(1),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
         | 'write_bad_users' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             ('user:string')))
    p.run().wait_until_finish()