def Run(argv=None):
  known_args, pipeline_args = ParseArgs(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  window_duration = 1 * 60  # 1 minute windows.
  if known_args.topic:
    pipeline_options.view_as(StandardOptions).streaming = True
  project = pipeline_options.view_as(GoogleCloudOptions).project
  timestamp_attribute = 'timestamp_ms'
  events = None
  if not known_args.topic:
    # Batch mode: read events from a file and stamp each element with the
    # event timestamp carried in the parsed record.
    events = (p
              | 'read' >> ReadFromText(known_args.input)
              | 'parse' >> beam.ParDo(ParseEventFn())
              | 'add_event_timestamps' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(x, x.timestamp)))
  else:
    # Streaming mode: read from Pub/Sub, taking event timestamps from the
    # 'timestamp_ms' message attribute.
    events = (p
              | 'read' >> ReadFromPubSub(
                  topic=known_args.topic,
                  timestamp_attribute=timestamp_attribute)
              | 'decode' >> beam.ParDo(ParseEventFn()))
  _ = (events
       | 'windowed_team_score' >> WindowedTeamScore(window_duration)
       | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
       | beam.io.WriteToBigQuery(known_args.output_tablename,
                                 known_args.output_dataset, project, SCHEMA))
  p.run().wait_until_finish()
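# Hypothetical invocations of the pipeline above (flag names inferred from
# the known_args fields it uses; the script name and ParseArgs defaults are
# assumptions, not taken from this repo):
#
#   # Batch: read historical events from a file.
#   python exercise.py \
#       --input=gs://$BUCKET/events.txt \
#       --output_dataset=games --output_tablename=team_scores
#
#   # Streaming: read live events from Pub/Sub.
#   python exercise.py \
#       --topic=projects/$PROJECT/topics/game_events \
#       --output_dataset=games --output_tablename=team_scores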
def Run(argv=None):
  known_args, pipeline_args = ParseArgs(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  window_duration = 1 * 60  # 1 minute windows.
  if known_args.topic:
    pipeline_options.view_as(StandardOptions).streaming = True
  project = pipeline_options.view_as(GoogleCloudOptions).project
  timestamp_attribute = 'timestamp_ms'
  events = None
  if not known_args.topic:
    events = (p
              | 'read' >> ReadFromText(known_args.input)
              | 'parse' >> beam.ParDo(ParseEventFn())
              | 'add_event_timestamps' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(x, x.timestamp)))
  else:
    # [START EXERCISE 3]:
    # Read game events from the Pub/Sub topic using custom timestamps,
    # which are in an attribute labeled 'timestamp_ms'.
    # Use beam.io.ReadFromPubSub to read from the topic.
    # https://beam.apache.org/releases/pydoc/2.8.0/apache_beam.io.gcp.pubsub.html
    events = (p
              | 'read' >> ChangeMe()
              | 'decode' >> beam.ParDo(ParseEventFn()))
  _ = (events
       | 'windowed_team_score' >> WindowedTeamScore(window_duration)
       | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
       | beam.io.WriteToBigQuery(known_args.output_tablename,
                                 known_args.output_dataset, project, SCHEMA))
  p.run().wait_until_finish()
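# A possible completion of the Exercise 3 placeholder above, mirroring the
# solved version of this pipeline earlier in this section:
#
#   events = (p
#             | 'read' >> ReadFromPubSub(topic=known_args.topic,
#                                        timestamp_attribute='timestamp_ms')
#             | 'decode' >> beam.ParDo(ParseEventFn()))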
def Run(argv=None):
  known_args, pipeline_args = ParseArgs(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  if known_args.topic:
    pipeline_options.view_as(StandardOptions).streaming = True
  project = pipeline_options.view_as(GoogleCloudOptions).project
  timestamp_attribute = 'timestamp_ms'
  events = None
  if not known_args.topic:
    events = (p
              | 'read' >> ReadFromText(known_args.input)
              | 'parse' >> beam.ParDo(ParseEventFn())
              | 'add_event_timestamps' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(x, x.timestamp)))
  else:
    events = (p
              | 'read' >> ReadFromPubSub(
                  topic=known_args.topic,
                  timestamp_attribute=timestamp_attribute)
              | 'parse' >> beam.ParDo(ParseEventFn()))
  # [START EXERCISE 6]
  _ = (events
       | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
       # Extract sessions of user data, using known_args.session_gap as the
       # gap duration.
       # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
       | 'sessionize' >> ChangeMe()
       | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
       | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
       # Re-window into fixed windows of size user_activity_window in order
       # to compute the mean session duration for that window of activity.
       | 'window_of_sessions' >> ChangeMe()
       | 'session_mean' >> ChangeMe()
       # [END EXERCISE 6]
       | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
       | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
           known_args.output_tablename, known_args.output_dataset, project,
           SESSION_SCHEMA))
  p.run().wait_until_finish()
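# A possible completion of the Exercise 6 placeholders above, mirroring the
# solved version of this pipeline that follows:
#
#   'sessionize' >> beam.WindowInto(
#       window.Sessions(float(known_args.session_gap)))
#   'window_of_sessions' >> beam.WindowInto(
#       window.FixedWindows(int(known_args.user_activity_window)))
#   'session_mean' >> beam.CombineGlobally(
#       beam.combiners.MeanCombineFn()).without_defaults()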
def Run(argv=None):
  known_args, pipeline_args = ParseArgs(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  if known_args.topic:
    pipeline_options.view_as(StandardOptions).streaming = True
  project = pipeline_options.view_as(GoogleCloudOptions).project
  timestamp_attribute = 'timestamp_ms'
  events = None
  if not known_args.topic:
    events = (p
              | 'read' >> ReadFromText(known_args.input)
              | 'parse' >> beam.ParDo(ParseEventFn())
              | 'add_event_timestamps' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(x, x.timestamp)))
  else:
    events = (p
              | 'read' >> ReadFromPubSub(
                  topic=known_args.topic,
                  timestamp_attribute=timestamp_attribute)
              | 'parse' >> beam.ParDo(ParseEventFn()))
  _ = (events
       | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
       # Window each user's scores into sessions separated by gaps of at
       # least known_args.session_gap seconds.
       | 'sessionize' >> beam.WindowInto(
           window.Sessions(float(known_args.session_gap)))
       # The scores themselves are irrelevant here; collapse each
       # user-session to a single element.
       | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
       | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
       # Re-window into fixed windows of size user_activity_window in order
       # to compute the mean session duration for that window of activity.
       | 'window_of_sessions' >> beam.WindowInto(
           window.FixedWindows(int(known_args.user_activity_window)))
       | 'session_mean' >> beam.CombineGlobally(
           beam.combiners.MeanCombineFn()).without_defaults()
       | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
       | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
           known_args.output_tablename, known_args.output_dataset, project,
           SESSION_SCHEMA))
  p.run().wait_until_finish()
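# A minimal sketch of UserSessionActivity (the real DoFn is defined
# elsewhere in this repo). It reads the enclosing session window via
# DoFn.WindowParam and emits the session length in seconds, which
# 'session_mean' above then averages per fixed window. The body is an
# assumption for illustration, not the repo's actual implementation.
class UserSessionActivity(beam.DoFn):
  def process(self, element, win=beam.DoFn.WindowParam):
    # Session duration in seconds, derived from the window bounds.
    yield (win.end.micros - win.start.micros) // 1000000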
def Run(argv=None):
  known_args, pipeline_args = ParseArgs(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)
  if known_args.topic:
    pipeline_options.view_as(StandardOptions).streaming = True
  project = pipeline_options.view_as(GoogleCloudOptions).project
  timestamp_attribute = 'timestamp_ms'
  if not known_args.topic or not known_args.play_topic:
    logging.fatal('topic and play_topic are required.')
    # logging.fatal only logs; return explicitly so we do not build the
    # pipeline without both topics.
    return
  events = (p
            | 'read_events' >> ReadFromPubSub(
                topic=known_args.topic,
                timestamp_attribute=timestamp_attribute)
            | 'parse_events' >> beam.ParDo(ParseEventFn()))
  play_events = (p
                 | 'read_play_events' >> ReadFromPubSub(
                     topic=known_args.play_topic,
                     timestamp_attribute=timestamp_attribute)
                 | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))
  sessionized_events = (
      events
      | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
      | 'sessionize_events' >> beam.WindowInto(
          window.Sessions(float(known_args.session_gap))))
  sessionized_plays = (
      play_events
      | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
      | 'sessionize_plays' >> beam.WindowInto(
          window.Sessions(float(known_args.session_gap))))
  # Join play events with their matching game events by event_id, then
  # compute the per-user latency between them.
  per_user_latency = ({'plays': sessionized_plays,
                       'events': sessionized_events}
                      | 'cbk' >> beam.CoGroupByKey()
                      | 'compute_latency' >> beam.ParDo(ComputeLatency()))
  # Maintain a running global mean latency, re-emitted after every 1000 new
  # elements and exposed as a singleton side input.
  mean_latency = (
      per_user_latency
      | 'extract_latencies' >> beam.Values()
      | 'global_window' >> beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
          accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
      | 'compute_mean' >> beam.CombineGlobally(
          beam.combiners.MeanCombineFn()).with_fanout(16)
          .as_singleton_view())
  _ = (per_user_latency
       | 'detect_bad_users' >> beam.ParDo(DetectBadUsers(),
                                          mean_latency=mean_latency)
       | 'filter_duplicates' >> beam.WindowInto(
           window.GlobalWindows(),
           trigger=trigger.AfterCount(1),
           accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
       | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
       | 'write_bad_users' >> beam.io.WriteToBigQuery(
           known_args.output_tablename, known_args.output_dataset, project,
           'user:string'))
  p.run().wait_until_finish()
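# A minimal sketch of DetectBadUsers (the real DoFn is defined elsewhere in
# this repo). It consumes the running mean latency as a singleton side
# input and flags users whose latency is suspiciously low; the 4x-below-mean
# threshold is an assumption for illustration, not the repo's actual rule.
class DetectBadUsers(beam.DoFn):
  def process(self, element, mean_latency):
    user, latency = element
    if latency < mean_latency / 4:
      yield user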