Example 1
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.pubsub import ReadFromPubSub
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  SetupOptions,
                                                  StandardOptions)

# ParseArgs, ParseEventFn, WindowedTeamScore, FormatTeamScoreSum, and SCHEMA
# are defined elsewhere in the exercise code.
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    window_duration = 1 * 60  # 1 minute windows.
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (
            p
            | 'read' >> ReadFromText(known_args.input)
            | 'parse' >> beam.FlatMap(ParseEventFn())
            | 'add_event_timestamps' >> beam.Map(
                lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (
            p
            | 'read' >> ReadFromPubSub(
                topic=known_args.topic,
                timestamp_attribute=timestamp_attribute)
            | 'decode' >> beam.ParDo(ParseEventFn()))

    _ = (events
         | 'windowed_team_score' >> WindowedTeamScore(window_duration)
         | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
         | beam.io.WriteToBigQuery(known_args.output_tablename,
                                   known_args.output_dataset, project, SCHEMA))
    p.run().wait_until_finish()
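All of these examples rely on a ParseArgs helper that is not shown. A minimal sketch of what it might look like, assuming argparse-style flags named after the attributes the pipelines read (the defaults and help strings here are illustrative):

import argparse

def ParseArgs(argv):
    # Hypothetical sketch: separate the exercise's own flags from the
    # options that are forwarded to the Beam runner.
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', help='Input file of game events.')
    parser.add_argument('--topic', help='Pub/Sub topic with game events.')
    parser.add_argument('--play_topic', help='Pub/Sub topic with play events.')
    parser.add_argument('--output_dataset', help='BigQuery output dataset.')
    parser.add_argument('--output_tablename', help='BigQuery output table.')
    parser.add_argument('--session_gap', default=300)  # seconds, illustrative
    parser.add_argument('--user_activity_window', default=3600)  # seconds, illustrative
    known_args, pipeline_args = parser.parse_known_args(argv)
    return known_args, pipeline_args

WindowedTeamScore is a composite transform defined elsewhere; a sketch consistent with how it is used above (fixed windows of window_duration seconds, then per-team sums; the field names x.team and x.score are assumptions):

from apache_beam.transforms import window

class WindowedTeamScore(beam.PTransform):
    # Sketch: apply fixed windows, then sum scores per team.
    def __init__(self, duration):
        super().__init__()
        self._duration = duration

    def expand(self, pcoll):
        return (pcoll
                | 'window' >> beam.WindowInto(
                    window.FixedWindows(self._duration))
                | 'extract_team_score' >> beam.Map(lambda x: (x.team, x.score))
                | 'sum_team_scores' >> beam.CombinePerKey(sum))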
Example 2
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    window_duration = 1 * 60  # 1 minute windows.
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (
            p
            | 'read' >> ReadFromText(known_args.input)
            | 'parse' >> beam.FlatMap(ParseEventFn())
            | 'add_event_timestamps' >> beam.Map(
                lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        # [START EXERCISE 3]
        # Read game events from the Pub/Sub topic using custom timestamps,
        # which are in an attribute labeled 'timestamp_ms'.
        # Use beam.io.ReadFromPubSub to read from the topic.
        # https://beam.apache.org/releases/pydoc/2.8.0/apache_beam.io.gcp.pubsub.html
        events = (p
                  | 'read' >> ChangeMe()
                  | 'decode' >> beam.ParDo(ParseEventFn()))
        # [END EXERCISE 3]

    _ = (events
         | 'windowed_team_score' >> WindowedTeamScore(window_duration)
         | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
         | beam.io.WriteToBigQuery(known_args.output_tablename,
                                   known_args.output_dataset, project, SCHEMA))
    p.run().wait_until_finish()
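One way to complete the EXERCISE 3 stub, mirroring the working read step in Example 1 (and using the timestamp_attribute variable as the hint suggests):

        events = (p
                  | 'read' >> ReadFromPubSub(
                      topic=known_args.topic,
                      timestamp_attribute=timestamp_attribute)
                  | 'decode' >> beam.ParDo(ParseEventFn()))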
Example 3
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (p
                | 'read' >> ReadFromText(known_args.input)
                | 'parse' >> beam.FlatMap(ParseEventFn())
                | 'add_event_timestamps' >> beam.Map(
                    lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                | 'read' >> ReadFromPubSub(
                    topic=known_args.topic,
                    timestamp_attribute=timestamp_attribute)
                | 'parse' >> beam.ParDo(ParseEventFn()))

    # [START EXERCISE 6]
    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         # Extract sessions of user data, using known_args.session_gap as the
         # gap duration.
         # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
         | 'sessionize' >> ChangeMe()
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         # Re-window into fixed windows of size user_activity_window in order
         # to compute the mean session duration for that window of activity.
         | 'window_of_sessions' >> ChangeMe()
         | 'session_mean' >> ChangeMe()
         # [END EXERCISE 6]
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA)
         )

    p.run().wait_until_finish()
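One possible completion of the three EXERCISE 6 stubs, matching the full solution shown in Example 4 below: sessionize with a Sessions window, re-window the per-session activity into fixed windows, then take a global mean per window.

         | 'sessionize' >> beam.WindowInto(
             window.Sessions(float(known_args.session_gap)))
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         | 'window_of_sessions' >> beam.WindowInto(
             window.FixedWindows(int(known_args.user_activity_window)))
         | 'session_mean' >> beam.CombineGlobally(
             beam.combiners.MeanCombineFn()).without_defaults()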
Example 4
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (p
                | 'read' >> ReadFromText(known_args.input)
                | 'parse' >> beam.FlatMap(ParseEventFn())
                | 'add_event_timestamps' >> beam.Map(
                    lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                | 'read' >> ReadFromPubSub(
                    topic=known_args.topic,
                    timestamp_attribute=timestamp_attribute)
                | 'parse' >> beam.ParDo(ParseEventFn()))

    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         | 'sessionize' >> beam.WindowInto(
             window.Sessions(float(known_args.session_gap)))
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         | 'window_of_sessions' >> beam.WindowInto(
             window.FixedWindows(int(known_args.user_activity_window)))
         | 'session_mean' >> beam.CombineGlobally(
             beam.combiners.MeanCombineFn()).without_defaults()
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA)
         )

    p.run().wait_until_finish()
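UserSessionActivity is defined elsewhere; a minimal sketch of what it might do, patterned on the Apache Beam game_stats example, is to emit each session's length in seconds from the DoFn's window parameter:

class UserSessionActivity(beam.DoFn):
    # Sketch: output the duration, in seconds, of the session window
    # that each element belongs to.
    def process(self, elem, window=beam.DoFn.WindowParam):
        yield (window.end.micros - window.start.micros) // 1000000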
Example 5
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic or not known_args.play_topic:
        # logging.fatal only logs; return so the pipeline is not built
        # without its required topics.
        logging.fatal('topic and play_topic are required.')
        return

    events = (p
              | 'read_events' >> ReadFromPubSub(
                  topic=known_args.topic,
                  timestamp_attribute=timestamp_attribute)
              | 'parse_events' >> beam.ParDo(ParseEventFn()))

    play_events = (
        p
        | 'read_play_events' >> ReadFromPubSub(
            topic=known_args.play_topic,
            timestamp_attribute=timestamp_attribute)
        | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))

    sessionized_events = (
        events
        | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    sessionized_plays = (
        play_events
        | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_plays' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    per_user_latency = (
        {'plays': sessionized_plays, 'events': sessionized_events}
        | 'cbk' >> beam.CoGroupByKey()
        | 'compute_latency' >> beam.ParDo(ComputeLatency()))

    mean_latency = (
        per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).with_fanout(16).as_singleton_view())

    _ = (per_user_latency
         | 'detect_bad_users' >> beam.ParDo(DetectBadUsers(),
                                            mean_latency=mean_latency)
         | 'filter_duplicates' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.AfterCount(1),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
         | 'write_bad_users' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             'user:string'))

    p.run().wait_until_finish()
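DetectBadUsers is also defined elsewhere; a sketch of how it might consume the mean_latency singleton side input, assuming ComputeLatency yields (user, latency) pairs (the 4x threshold is purely illustrative):

class DetectBadUsers(beam.DoFn):
    # Sketch: flag users whose session latency is far above the global mean.
    # mean_latency arrives as a float via the singleton side input wired up
    # with beam.ParDo(DetectBadUsers(), mean_latency=mean_latency).
    def process(self, element, mean_latency):
        user, latency = element
        if latency > 4 * mean_latency:  # illustrative threshold
            yield user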