Example 1
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    window_duration = 1 * 60  # 1 minute windows.
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (
            p
            | 'read' >> ReadFromText(known_args.input)
            | 'parse' >> beam.FlatMap(ParseEventFn())
            | 'add_event_timestamps' >>
            beam.Map(lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                  |
                  'read' >> ReadFromPubSub(topic=known_args.topic,
                                           timestamp_attribute='timestamp_ms')
                  | 'decode' >> beam.ParDo(ParseEventFn()))

    _ = (events
         | 'windowed_team_score' >> WindowedTeamScore(window_duration)
         | 'format_team_score_sum' >> beam.ParDo(FormatTeamScoreSum())
         | beam.io.WriteToBigQuery(known_args.output_tablename,
                                   known_args.output_dataset, project, SCHEMA))
    p.run().wait_until_finish()
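WindowedTeamScore and FormatTeamScoreSum are defined elsewhere in this exercise. A hedged sketch of what the composite transform plausibly does (fixed windows plus a per-team score sum); the attribute names team and score are assumptions, not taken from the snippet above:

import apache_beam as beam
from apache_beam.transforms import window


class WindowedTeamScore(beam.PTransform):
    """Sketch: window events into fixed windows and sum scores per team."""

    def __init__(self, duration):
        super().__init__()
        self.duration = duration

    def expand(self, pcoll):
        return (pcoll
                | 'window' >> beam.WindowInto(
                    window.FixedWindows(self.duration))
                | 'extract_team_score' >> beam.Map(lambda e: (e.team, e.score))
                | 'sum_per_team' >> beam.CombinePerKey(sum))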
Example 2
def run(argv=None, save_main_session=True):
    """
    run function to process cli args and run your program
    :param argv:
    :param save_main_session:
    :return:
    """

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    job_options = pipeline_options.view_as(JobOptions)

    logging.info("-----------------------------------------------------------")
    logging.info("               Streaming with Pub/Sub emulator             ")
    logging.info("-----------------------------------------------------------")

    source = ReadFromPubSub(subscription=str(job_options.input))

    ###
    #  STREAMING BEAM: add the necessary pipeline stages along with whatever functions you require in this file
    ###
    p = beam.Pipeline(options=pipeline_options)
    lines = (p | "read" >> source | beam.Map(print))
    result = p.run()
    result.wait_until_finish()
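The STREAMING BEAM comment above marks where the transform stages should go. A minimal sketch of one possible completion, assuming the subscription delivers UTF-8 JSON payloads; the helper name add_streaming_stages is made up for illustration:

import json

import apache_beam as beam
from apache_beam.io.gcp.pubsub import ReadFromPubSub


def add_streaming_stages(p, subscription):
    # Hypothetical completion of the scaffold above: decode the raw Pub/Sub
    # payload, parse it as JSON and print each record. Swap the final stage
    # for a real sink (BigQuery, Pub/Sub, ...) as in the other examples.
    return (p
            | 'read' >> ReadFromPubSub(subscription=subscription)
            | 'decode' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'parse' >> beam.Map(json.loads)
            | 'print' >> beam.Map(print))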
Example 3
def run(pipeline_args):
    logging.basicConfig(format="%(asctime)s - %(message)s",
                        stream=sys.stdout,
                        level=logging.INFO,
                        datefmt="%Y-%m-%d %H:%M.%S")
    logging.getLogger().setLevel(logging.INFO)

    options = PipelineOptions(pipeline_args)
    user_options = options.view_as(UserOptions)
    standard_options = options.view_as(StandardOptions)
    setup_options = options.view_as(SetupOptions)

    standard_options.streaming = True
    setup_options.save_main_session = True

    logging.info("Start pipeline")

    with beam.Pipeline(options=options) as p:
        (p | 'read pub/sub topic' >> ReadFromPubSub(
            subscription=user_options.subscription.get(),
            with_attributes=False)
         | 'Parse JSON' >> beam.Map(json.loads)
         | 'Add timestamps' >>
         beam.Map(lambda x: TimestampedValue(x, x["timestamp"]))
         |
         'Keyed on key attribute' >> beam.Map(lambda x: (x["key"], x["data"]))
         | 'Setup the timer' >> beam.ParDo(TimerExample()))
Example 4
def run(argv=None, save_main_session=True):

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    job_options = pipeline_options.view_as(JobOptions)

    p = beam.Pipeline(options=pipeline_options)

    schema = parse_schema(raw_schema)

    logging.info("-----------------------------------------------------------")
    logging.info("          Dataflow AVRO Streaming with Pub/Sub             ")
    logging.info("-----------------------------------------------------------")

    avroRW = avroReadWrite(schema)
    source = ReadFromPubSub(subscription=str(job_options.input))
    sink = WriteToPubSub(str(job_options.output))
    lines = (p
             | "read" >> source
             | "deserialize" >> beam.Map(lambda x: avroRW.deserialize(x))
             | "process" >> (beam.ParDo(TransformerDoFn(_schema=schema)))
             | "serialize" >> beam.Map(lambda x: avroRW.serialize(x))
             | "write" >> sink)
    result = p.run()
    result.wait_until_finish()
Example 5
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('input_topic',
                        type=str,
                        help="Input Pub/Sub topic name.")
    parser.add_argument(
        'output_table',
        type=str,
        help="Output BigQuery table name. Example: project.db.name")
    parser.add_argument('--model_project',
                        type=str,
                        help="Google Project ID with model.")
    parser.add_argument('--model_name',
                        type=str,
                        help="Name of the Google AI Platform model name.")
    parser.add_argument('--model_region',
                        type=str,
                        help="AI Platform region name.")
    parser.add_argument('--model_version',
                        type=str,
                        help="AI Platform model version.")

    known_args, pipeline_args = parser.parse_known_args(argv)

    _topic_comp = known_args.input_topic.split('/')
    if (len(_topic_comp) != 4 or _topic_comp[0] != 'projects'
            or _topic_comp[2] != 'topics'):
        raise ValueError(
            "Topic name must be in the format projects/<PROJECT>/topics/<TOPIC>.")

    if len(known_args.output_table.split('.')) != 2:
        raise ValueError("Table name has inappropriate format.")

    inf_args = [
        known_args.model_project, known_args.model_name,
        known_args.model_region, known_args.model_version
    ]
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = Pipeline(options=options)
    _ = (p | 'read from pub/sub' >> ReadFromPubSub(
        known_args.input_topic).with_output_types(bytes)
         | 'windowing' >> WindowInto(window.FixedWindows(10, 0))
         | 'convert to dict' >> Map(json.loads)
         | 'pre processing' >> PreProcessing()
         | 'make inference' >> ParDo(MakeRemoteInferenceDoFn(*inf_args))
         | 'format message' >> Map(formatter)
         | 'write to BQ' >> WriteToBigQuery(
             table=known_args.output_table,
             schema=build_bq_schema(),
             create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=BigQueryDisposition.WRITE_APPEND))
    if os.environ.get('DEPLOY'):
        # Use p.run() rather than the `with Pipeline() as p` context manager so
        # the script can exit after submitting the job instead of blocking.
        p.run()
    else:
        p.run().wait_until_finish()
Example 6
def run(argv=None):
    """
    Main entry point, define and run the pipeline
    """
    parser = argparse.ArgumentParser(
        description='Run Apache Beam to process the logs')
    parser.add_argument('--input', dest='input', help='Input file to process')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to')
    parser.add_argument(
        '--input_subscription',
        dest='input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    parser.add_argument(
        '--output_table',
        dest='output_table',
        help=('BigQuery Table to write results to, with the form '
              '<PROJECT>:<DATASET>.<TABLE>'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    print('pipeline options:', pipeline_options)

    # Specification for table in BigQuery
    table_spec = known_args.output_table
    table_schema = 'host:STRING, utc_timestamp:TIMESTAMP, action:STRING, uri:STRING, protocol:STRING, status:STRING, size:INTEGER'

    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        if known_args.input_subscription:
            lines = (p
                     |
                     ReadFromPubSub(subscription=known_args.input_subscription
                                    ).with_output_types(bytes))
        else:
            lines = (p
                     | ReadFromText(known_args.input,
                                    coder=coders.BytesCoder()))

        output = (lines | 'parse_filter' >> beam.ParDo(ParseAndFilterDoFn()))
        # | 'parse' >> (beam.Map(parse_one_record)))

        # output | WriteToText(known_args.output)
        output | WriteToBigQuery(
            table_spec,
            schema=table_schema,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
Example 7
def run(argv=None):
    """Pipeline for reading data from a PubSub topic,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:

        if 'streaming' in p.options.display_data():
            # Read in the CSV file
            lines = (p
                     | 'ReadFromPubSub' >> ReadFromPubSub(
                         topic=known_args.input).with_output_types(bytes)
                     | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
                     | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))
        else:
            # Read in the CSV file
            lines = (p
                     | 'ReadFromGCS' >> ReadFromText(known_args.input)
                     | 'ParseFileFn' >> beam.ParDo(ParseFileFn()))

        # Redact PII from the 'text' column.
        redacted_rows = (
            lines
            | 'IdentifyAndRedactText' >> IdentifyAndRedactText(
                p.options.display_data()['project'], ['ALL_BASIC']))

        # Format rows and write to BigQuery.
        (redacted_rows
         | 'MapToTableRows' >> beam.Map(lambda row: {
             'id': row['id'],
             'text': row['text']
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, text:STRING',
             project=p.options.display_data()['project'],
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
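Branching on 'streaming' in p.options.display_data() and reading the project from display data works, but the option views used in the other examples are the more direct route. A small sketch of that alternative, assuming the same pipeline options object; the helper name is illustrative:

from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import StandardOptions


def streaming_and_project(pipeline_options):
    # Read the streaming flag and the project id from the option views rather
    # than from display_data().
    is_streaming = pipeline_options.view_as(StandardOptions).streaming
    project = pipeline_options.view_as(GoogleCloudOptions).project
    return is_streaming, project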
Example 8
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    job_options = pipeline_options.view_as(JobOptions)
    start = time.time()

    with beam.Pipeline(options=pipeline_options) as p:
        ten_second_combine_fn = utils.get_CalculateFeaturesPerFeatureCombineFn(
            '10')
        thirty_second_combine_fn = utils.get_CalculateFeaturesPerFeatureCombineFn(
            '30')

        schema = parse_schema(raw_schema)
        avroRW = avroReadWrite(schema)
        source = ReadFromPubSub(subscription=str(job_options.input))
        sink = WriteToPubSub(str(job_options.output))

        lines = (p
                 | "read" >> source
                 | "deserialize" >> beam.Map(lambda x: avroRW.deserialize(x))
                 | "process" >>
                 (beam.ParDo(TransformerDoFn(_schema=schema, root=start)))
                 | "key" >> beam.Map(lambda e: (e['gh6'], e)))

        fixed_windows = (lines
                         | "10 second window" >>
                         beam.WindowInto(beam.window.FixedWindows(10),
                                         trigger=AfterWatermark(),
                                         accumulation_mode=beam.transforms.
                                         trigger.AccumulationMode.DISCARDING)
                         | "Combine 10 second fixed windows" >>
                         beam.CombinePerKey(ten_second_combine_fn))

        windows = (lines
                   | beam.WindowInto(beam.window.SlidingWindows(30, 10),
                                     trigger=AfterWatermark(),
                                     accumulation_mode=beam.transforms.trigger.
                                     AccumulationMode.DISCARDING)
                   | "Combine 30 second windows" >>
                   beam.CombinePerKey(thirty_second_combine_fn))

        run_models = (fixed_windows
                      | beam.ParDo(process_with_side_input,
                                   side=beam.pvalue.AsDict(windows))
                      | beam.GroupByKey()
                      | beam.ParDo(RunModel()))

    # The `with` block above already runs the pipeline and waits for it to
    # finish, so no explicit p.run() is needed here.
Example 9
def run(argv=None, save_main_session=True):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--schema_registry',
        dest='schema_registry',
        default='http://127.0.0.1:8081',
        help='Schema registry endpoint. Defaults to local endpoint.')
    parser.add_argument('--failed-bq-inserts',
                        dest='failed_bq_inserts',
                        required=True,
                        help='Bucket for writing failed inserts')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--job_name=dbz-test-example',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True

    project_id = 'crafty-apex-264713'
    kafka_topic = 'dbserver1.inventory.customers'
    pubsub_topic = f'projects/{project_id}/topics/{kafka_topic}'

    with beam.Pipeline(options=pipeline_options) as p:
        bq = (
            p
            | 'Read from PubSub' >> ReadFromPubSub(topic=pubsub_topic)
            | '2 Second Window' >> beam.WindowInto(window.FixedWindows(2))
            | 'Avro to Row' >> beam.FlatMap(
                avro_to_row(known_args.schema_registry))
            # | 'Write to File' >>
            #       beam.io.WriteToText('args.output')
            | 'Write to BigQuery' >> WriteToBigQuery(
                'crafty-apex-264713:inventory.customers',
                schema='id:INT64,'
                'first_name:STRING,'
                'last_name:STRING,'
                'email:STRING,'
                '__op:STRING,'
                '__source_ts_ms:INT64,'
                '__lsn:INT64',
                create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=BigQueryDisposition.WRITE_APPEND))

        # Can't get this to run in Dataflow - it produces a job graph that is
        # not updatable. In the direct runner it doesn't emit any errors.
Example 10
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (p
                | 'read' >> ReadFromText(known_args.input)
                | 'parse' >> beam.FlatMap(ParseEventFn())
                | 'add_event_timestamps' >> beam.Map(
                    lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                | 'read' >> ReadFromPubSub(
                    topic=known_args.topic,
                    timestamp_attribute='timestamp_ms')
                | 'parse' >> beam.ParDo(ParseEventFn()))

    # [START EXERCISE 6]
    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         # Extract sessions of user data, using known_args.session_gap as the
         # gap duration.
         # https://beam.apache.org/documentation/programming-guide/#provided-windowing-functions
         | 'sessionize' >> ChangeMe()
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         # Re-window into fixed windows of size user_activity_window in order
         # to compute the mean session duration for that window of activity.
         | 'window_of_sessions' >> ChangeMe()
         | 'session_mean' >> ChangeMe()
         # [END EXERCISE 6]
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA)
         )

    p.run().wait_until_finish()
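The ChangeMe() placeholders are exercise stubs; Example 15 below contains one completed version. As a compact sketch, the three stubs correspond to session windowing, fixed re-windowing and a global mean combine (mirroring Example 15; the helper name is illustrative):

import apache_beam as beam
from apache_beam.transforms import window


def exercise_stages(session_gap, user_activity_window):
    # Transforms that could replace the three ChangeMe() stubs, mirroring
    # Example 15 below.
    sessionize = beam.WindowInto(window.Sessions(float(session_gap)))
    window_of_sessions = beam.WindowInto(
        window.FixedWindows(int(user_activity_window)))
    session_mean = beam.CombineGlobally(
        beam.combiners.MeanCombineFn()).without_defaults()
    return sessionize, window_of_sessions, session_mean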
Example 11
def main(pipeline_options, args):

    pipe = beam.Pipeline(options=pipeline_options)

    if True:
        import google.auth
        _, project_id = google.auth.default()
        subscriber = pubsub_v1.SubscriberClient()
        subscription_path = subscriber.subscription_path(
            project_id, 'pubsub-test')
        try:
            subscriber.delete_subscription(subscription_path)
        except Exception:
            # The subscription may not exist yet; ignore.
            pass

    subscription = get_subscription(INPUT_TOPIC, 'pubsub-test')

    (pipe
     | 'PubSubInflow' >> ReadFromPubSub(
         subscription=subscription,
         with_attributes=True,
         timestamp_attribute='timestamp',
     )
     | 'Inspect' >> beam.ParDo(TSInspect())
     | Log(color='cyan'))

    result = pipe.run()  # type: PipelineResult
    time.sleep(5)
    while result.state != PipelineState.RUNNING:
        time.sleep(10)

    print()
    cprint('Starting streaming graph forever. Kill with ctrl+c',
           'red',
           attrs=['bold'])
    print()

    send()

    try:
        result.wait_until_finish()
    except KeyboardInterrupt:
        print()
        cprint('Shutting down...', 'yellow')
        result.cancel()
Example 12
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument(
        '--output',
        dest='output',
        help=
        'BigQuery output dataset and table name in the format dataset.tablename'
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Read in the CSV file
        lines = (p
                 | 'ReadFromPubSub' >> ReadFromPubSub(
                     topic=known_args.input).with_output_types(bytes)
                 | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
                 | 'ParseMessage' >> beam.ParDo(ParsePubSubMessageFn()))

        windows = (lines
                   | 'WindowInto' >> beam.WindowInto(FixedWindows(30, 0))
                   | 'SumValues' >> beam.CombinePerKey(sum))

        # Format rows and write to BigQuery.
        (windows
         | 'ConvertToDictionary' >> beam.Map(lambda row: {
             'id': row[0],
             'total': row[1]
         })
         | 'WriteToBigQuery' >> WriteToBigQuery(
             known_args.output,
             schema='id:INTEGER, total:INTEGER',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))
Example 13
def run(argv=None):
    """Pipeline for reading data from a PubSub topic or a Cloud Storage bucket,
    redacting the data using Cloud DLP and writing the results to BigQuery"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='PubSub topic to read from.')
    parser.add_argument('--output',
                        dest='output',
                        help='BigQuery output dataset and table name in the format dataset.tablename')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = (p
                 # 1. Read in the file from PubSub.
                 | 'ReadFromPubSub' >> ReadFromPubSub()

                 # 2. Process the JSON message from PubSub
                 | 'ParseMessage'

                 )

        average = (lines
                   | 'ApplyWindow'
                   )
        # 3. For each Key, sum up the values
        # 4. Format the results as Python dictionaries for writing to BigQuery

        (lines
         # 4. Format the results as Python dictionaries for writing to BigQuery
         | 'ConvertToDictionary'
         # 5. Write the output to BigQuery
         | 'WriteToBigQuery' >> WriteToBigQuery(
                    known_args.output,
                    schema='id:INTEGER, total:INTEGER',
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
                ))
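As written, the named stages have no transforms attached ('ParseMessage' and 'ApplyWindow' are bare labels), so the pipeline will not construct. A sketch of one completion, following Example 12 above; the parse_fn argument stands in for a DoFn like ParsePubSubMessageFn that yields (id, value) tuples, and the helper name is illustrative:

import apache_beam as beam
from apache_beam.io.gcp.pubsub import ReadFromPubSub
from apache_beam.transforms.window import FixedWindows


def build_stages(p, input_topic, parse_fn):
    # parse_fn is assumed to be a DoFn such as ParsePubSubMessageFn from
    # Example 12, yielding (id, value) tuples.
    lines = (p
             | 'ReadFromPubSub' >> ReadFromPubSub(
                 topic=input_topic).with_output_types(bytes)
             | 'DecodeMessage' >> beam.Map(lambda x: x.decode('utf-8'))
             | 'ParseMessage' >> beam.ParDo(parse_fn))

    return (lines
            | 'ApplyWindow' >> beam.WindowInto(FixedWindows(30, 0))
            | 'SumValues' >> beam.CombinePerKey(sum)
            | 'ConvertToDictionary' >> beam.Map(
                lambda row: {'id': row[0], 'total': row[1]}))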
Example 14
def main():
    df_options = None
    with open("config.json") as f:
        df_options = load(f)

    sink = bq_sink_creation(
        df_options.get("big-query").get("project"),
        df_options.get("big-query").get("output_table"))
    print(sink)
    print("Creating Pipeline Object...")
    pipeline_options = PipelineOptions(flags=[], **df_options["data-flow"])

    # # needed for stream #
    # pipeline_options.view_as(StandardOptions).streaming = True

    pipeline = beam.Pipeline(options=pipeline_options)
    print("executing pipeline:")
    subscription_name = "projects/{0}/subscriptions/{1}".format(
        df_options.get("PROJECT_ID"), df_options.get("PS_SUBSCRIPTION_NAME"))
    print(subscription_name)
    (pipeline
     | "Reading Pub/Sub" >> ReadFromPubSub(
         subscription=subscription_name,
         id_label="id")
     | "json conv" >> beam.Map(parse_pubsub)
     | "extract data" >> beam.Map(lambda x: {
         "timeCreated": x.get("timeCreated"),
         "bucket": x.get("bucket"),
         "name": x.get("name"),
     })
     | "Writing to BigQuery" >> beam.io.Write(sink))

    # | "Writing to GCS"
    # >> beam.io.Write(beam.io.WriteToText(
    #     "gs://etl_accelerator/results/pubsub_data"
    # ))

    pipeline.run().wait_until_finish(duration=600000)  # 10 minutes
Example 15
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic:
        events = (p
                | 'read' >> ReadFromText(known_args.input)
                | 'parse' >> beam.FlatMap(ParseEventFn())
                | 'add_event_timestamps' >> beam.Map(
                    lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
                | 'read' >> ReadFromPubSub(
                    topic=known_args.topic,
                    timestamp_attribute='timestamp_ms')
                | 'parse' >> beam.ParDo(ParseEventFn()))

    _ = (events
         | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
         | 'sessionize' >> beam.WindowInto(
             window.Sessions(float(known_args.session_gap)))
         | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
         | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
         | 'window_of_sessions' >> beam.WindowInto(
             window.FixedWindows(int(known_args.user_activity_window)))
         | 'session_mean' >> beam.CombineGlobally(
             beam.combiners.MeanCombineFn()).without_defaults()
         | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
         | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             SESSION_SCHEMA)
         )

    p.run().wait_until_finish()
Example 16
def test(self):
    _ = (
        self.pipeline
        | 'Read from pubsub' >> ReadFromPubSub(
            subscription=self.read_sub_name,
            with_attributes=True,
            id_label='id',
        )
        | beam.Map(lambda x: bytes(1)).with_output_types(bytes)
        | 'Measure time' >> beam.ParDo(MeasureTime(self.metrics_namespace))
        | 'Window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(
                trigger.AfterCount(self.num_of_messages)),
            accumulation_mode=trigger.AccumulationMode.DISCARDING)
        | 'Count messages' >> beam.CombineGlobally(
            beam.combiners.CountCombineFn()).without_defaults()
        .with_output_types(int)
        | 'Convert to bytes' >> beam.Map(
            lambda count: str(count).encode('utf-8'))
        | 'Write to Pubsub' >> beam.io.WriteToPubSub(self.matcher_topic_name))
Example 17
def run(argv=None):
    """This function contains the pipeline logic."""

    parser = argparse.ArgumentParser()

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project=' + project_id,
        '--job_name=streampipeline',
        '--staging_location=gs://' + project_id + '-dataflow/staging',
        '--temp_location=gs://' + project_id + '-dataflow/temp',
        '--region=europe-west1',
        '--streaming',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    subscription = 'projects/' + project_id + '/subscriptions/process_stream_data'
    dataset = 'my_dataset'
    table = 'stream_data'

    # General format:
    # (p | 'name of input step' >> InputClass(args)
    #    | 'name of processing step' >> ProcessClass(args)
    #    | 'name of output step' >> OutputClass(args))
    #
    # Test pipeline input
    # (p | '...' >> ReadFromPubSub(subscription=subscription)
    #    | '...' >> beam.ParDo(PrintElement()))
    #
    # Write to BigQuery
    # (p | '...' >> ReadFromPubSub(subscription=subscription)
    #    | '...' >> beam.ParDo(FormatStreamData())
    #    | '...' >> WriteToBigQuery(dataset=dataset, table=table))

    with beam.Pipeline(options=pipeline_options) as p:
        (p | 'stream_data_ingestion' >> ReadFromPubSub(subscription=subscription)
           | '...')
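The last stage is intentionally left as '...'; the commented "Write to BigQuery" template above spells out the intended shape. A sketch of that completion under the assumption that FormatStreamData (mentioned in the comments) is a DoFn emitting rows that match the stream_data table; the helper name is illustrative:

import apache_beam as beam
from apache_beam.io.gcp.bigquery import WriteToBigQuery
from apache_beam.io.gcp.pubsub import ReadFromPubSub


def run_stream_to_bq(pipeline_options, subscription, dataset, table, format_dofn):
    # format_dofn stands in for the FormatStreamData() DoFn mentioned in the
    # template comments above; it should emit dicts matching the target table.
    with beam.Pipeline(options=pipeline_options) as p:
        (p
         | 'stream_data_ingestion' >> ReadFromPubSub(subscription=subscription)
         | 'format_stream_data' >> beam.ParDo(format_dofn)
         # Pass schema=... as well if the table does not already exist.
         | 'write_to_bigquery' >> WriteToBigQuery(table, dataset=dataset))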
Example 18
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True

    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'
    events = None
    if not known_args.topic or not known_args.play_topic:
        logging.fatal('topic and play_topic are required.')

    events = (p
              | 'read_events' >> ReadFromPubSub(
                  topic=known_args.topic, timestamp_attribute='timestamp_ms')
              | 'parse_events' >> beam.ParDo(ParseEventFn()))

    play_events = (
        p
        | 'read_play_events' >> ReadFromPubSub(
            topic=known_args.play_topic, timestamp_attribute='timestamp_ms')
        | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))

    sessionized_events = (
        events
        | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    sessionized_plays = (
        play_events
        | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_plays' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    per_user_latency = ({
        'plays': sessionized_plays,
        'events': sessionized_events
    }
                        | 'cbk' >> beam.CoGroupByKey()
                        | 'compute_latency' >> beam.ParDo(ComputeLatency()))

    mean_latency = (
        per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(beam.combiners.MeanCombineFn(
        )).with_fanout(16).as_singleton_view())

    _ = (per_user_latency
         | 'detect_bad_users' >> beam.ParDo(DetectBadUsers(),
                                            mean_latency=mean_latency)
         | 'filter_duplicates' >> beam.WindowInto(
             window.GlobalWindows(),
             trigger=trigger.AfterCount(1),
             accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
         | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
         | 'write_bad_users' >> beam.io.WriteToBigQuery(
             known_args.output_tablename, known_args.output_dataset, project,
             ('user:string')))

    p.run().wait_until_finish()