def run(argv=None):
    """Processes the data."""
    # Session gap: 10 minutes.
    sessao = 60 * 10
    with beam.Pipeline(options=PipelineOptions()) as p:
        carrinho = (p
            | 'Lê Arquivo' >> ReadFromText('input/page-views.json')
            | 'Transforma entrada' >> beam.ParDo(parse_json)
            | 'Adiciona Timestamp' >> beam.Map(
                lambda x: beam.window.TimestampedValue(
                    x,
                    datetime.datetime.strptime(
                        x['timestamp'], '%Y-%m-%d %H:%M:%S').timestamp()))
            | 'Adiciona chave' >> beam.Map(lambda x: (x['customer'], x))
            | 'Define Sessão' >> beam.WindowInto(
                window.Sessions(sessao),
                timestamp_combiner=window.TimestampCombiner.OUTPUT_AT_EOW)
            | 'Group By Key' >> beam.GroupByKey()
            | 'Verifica Abandono' >> beam.ParDo(Abandono())
            | 'Salva Saida' >> beam.ParDo(salva_arquivo))
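# A minimal sketch of the helpers the pipeline above assumes (hypothetical:
# parse_json and Abandono are not defined in this snippet, and the field
# names 'page'/'cart'/'checkout' are illustrative guesses).
import json
import apache_beam as beam

def parse_json(line):
    # Turn one JSON line from page-views.json into a dictionary.
    yield json.loads(line)

class Abandono(beam.DoFn):
    def process(self, element):
        # element is (customer, [page views in one session window]).
        customer, views = element
        pages = [v.get('page') for v in views]
        # Flag sessions that reached the cart but never checked out.
        if 'cart' in pages and 'checkout' not in pages:
            yield {'customer': customer, 'pages': pages}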
def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    session_gap = 5  # seconds

    user1_data = [{'user_id': 'Thanos',
                   'value': 'event_{}'.format(event),
                   'timestamp': time.time() + 2**event} for event in range(5)]
    user2_data = [{'user_id': 'Groot',
                   'value': 'event_{}'.format(event),
                   'timestamp': time.time() + 1 + 3**event} for event in range(3)]

    events = (p
        | 'Create Events' >> beam.Create(user1_data + user2_data)
        | 'Add Timestamps' >> beam.Map(
            lambda x: beam.window.TimestampedValue(x, x['timestamp']))
        | 'keyed_on_user_id' >> beam.Map(lambda x: (x['user_id'], x))
        | 'user_session_window' >> beam.WindowInto(
            window.Sessions(session_gap),
            timestamp_combiner=window.TimestampCombiner.OUTPUT_AT_EOW)
        | 'Group' >> beam.GroupByKey()
        | 'analyze_session' >> beam.ParDo(AnalyzeSession()))

    result = p.run()
    result.wait_until_finish()
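# A minimal sketch of the AnalyzeSession DoFn assumed above (hypothetical;
# only the (user_id, [events]) input shape is taken from the pipeline).
import apache_beam as beam

class AnalyzeSession(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        user_id, events = element
        events = list(events)
        # Summarize one merged session window per user: count and bounds.
        yield {
            'user_id': user_id,
            'num_events': len(events),
            'session_start': window.start.micros / 1e6,
            'session_end': window.end.micros / 1e6,
        }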
def test_setting_session_windows(self):
    with TestPipeline() as p:
        unkeyed_items = p | beam.Create([2, 11, 16, 27])
        items = (unkeyed_items
                 | 'key' >> beam.Map(
                     lambda x: beam.window.TimestampedValue(('k', x), x * 60)))
        # [START setting_session_windows]
        from apache_beam import window
        session_windowed_items = (
            items | 'window' >> beam.WindowInto(window.Sessions(10 * 60)))
        # [END setting_session_windows]
        summed = (session_windowed_items
                  | 'group' >> beam.GroupByKey()
                  | 'combine' >> beam.CombineValues(sum))
        unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
        assert_that(unkeyed, equal_to([29, 27]))
def load(events, metadata=None, pipeline_options=None):
    return (events
        # Filter to get only bids, then extract the bidder id.
        | nexmark_query_util.JustBids()
        | 'query11_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
        # Window bids into sessions per bidder.
        | 'query11_session_window' >> beam.WindowInto(
            window.Sessions(metadata.get('window_size_sec')),
            trigger=trigger.AfterWatermark(
                early=trigger.AfterCount(metadata.get('max_log_events'))),
            accumulation_mode=trigger.AccumulationMode.DISCARDING,
            allowed_lateness=metadata.get('occasional_delay_sec') // 2)
        # Count bids per bidder.
        | beam.combiners.Count.PerElement()
        | beam.Map(
            lambda bidder_count: {
                ResultNames.BIDDER_ID: bidder_count[0],
                ResultNames.BID_COUNT: bidder_count[1]
            }))
def startSentinelToEVIdataflow(PROJECT_ID, TEST_NAME, BUCKET):
    # Arguments used by Dataflow.
    argv = [
        '--project={}'.format(PROJECT_ID),
        '--job_name={}'.format(TEST_NAME),
        '--save_main_session',
        '--staging_location=gs://{}/staging/'.format(BUCKET),
        '--temp_location=gs://{}/tmp/'.format(BUCKET),
        '--runner=DataflowRunner',
        '--streaming',
    ]

    # Reading from and writing to a Pub/Sub topic in Dataflow.
    p = beam.Pipeline(argv=argv)
    (p
     | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
         topic=None,
         subscription=full_subscription_name,
         id_label=None,
         timestamp_attribute=None).with_output_types(bytes)
     | 'Transformations' >> beam.Map(lambda line: Transform(line))
     | 'WriteStringsToPubSub-EVIReceiver' >> beam.io.WriteToPubSub(
         topic=full_receiver_topic_name))
    p.run()

    # Windowing in Dataflow. Note: this rebinds p, so the pipeline above must
    # be launched before a new one is constructed.
    p = beam.Pipeline(argv=argv)
    K = beam.typehints.TypeVariable('K')
    V = beam.typehints.TypeVariable('V')
    (p
     | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(
         topic=None,
         subscription=full_subscription_name,
         id_label=None,
         timestamp_attribute=None).with_output_types(bytes)
     # Pub/Sub delivers bytes, so decode rather than encode here.
     | 'Decode' >> beam.Map(lambda line: line.decode('utf-8'))
     # | 'Window' >> beam.WindowInto(window.FixedWindows(5.0))
     | 'Window' >> beam.WindowInto(window.Sessions(60.0))
     # GroupByKey requires (key, value) pairs; key each message first.
     | 'KeyByMessage' >> beam.Map(lambda line: (line, 1))
     | 'GroupBy' >> beam.GroupByKey().with_output_types(
         beam.typehints.Tuple[K, V]))
    p.run()
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    if not known_args.topic:
        events = (p
            | 'read' >> ReadFromText(known_args.input)
            # ParseEventFn is a DoFn, so apply it with ParDo.
            | 'parse' >> beam.ParDo(ParseEventFn())
            | 'add_event_timestamps' >> beam.Map(
                lambda x: beam.window.TimestampedValue(x, x.timestamp)))
    else:
        events = (p
            | 'read' >> ReadFromPubSub(
                topic=known_args.topic,
                timestamp_attribute=timestamp_attribute)
            | 'parse' >> beam.ParDo(ParseEventFn()))

    _ = (events
        | 'extract_user_score' >> beam.Map(lambda x: (x.user, x.score))
        | 'sessionize' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap)))
        | 'drop_scores' >> beam.CombinePerKey(lambda x: 0)
        | 'convert_to_activity' >> beam.ParDo(UserSessionActivity())
        | 'window_of_sessions' >> beam.WindowInto(
            window.FixedWindows(int(known_args.user_activity_window)))
        | 'session_mean' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).without_defaults()
        | 'format_sessions' >> beam.ParDo(FormatSessionMeans())
        | 'write_to_bigquery' >> beam.io.WriteToBigQuery(
            known_args.output_tablename,
            known_args.output_dataset,
            project,
            SESSION_SCHEMA))

    p.run().wait_until_finish()
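# A hypothetical sketch of the UserSessionActivity DoFn used in
# 'convert_to_activity' above: after 'drop_scores' the values no longer
# matter, so it reads the merged session bounds from the enclosing window and
# emits the session length in seconds.
import apache_beam as beam

class UserSessionActivity(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        # element is (user, 0); only the session window itself is needed.
        yield (window.end.micros - window.start.micros) // 1000000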
def test_sessions_combine(self):
    with TestPipeline() as p:
        input = (p
            | beam.Create([('c', 1), ('c', 9), ('c', 12), ('d', 2), ('d', 4)])
            | beam.MapTuple(lambda k, v: window.TimestampedValue((k, v), v))
            | beam.WindowInto(window.Sessions(4)))

        global_sum = (input
            | beam.Values()
            | beam.CombineGlobally(sum).without_defaults())
        sum_per_key = input | beam.CombinePerKey(sum)

        # The first window has 3 elements: ('c', 1), ('d', 2), ('d', 4).
        # The second window has 2 elements: ('c', 9), ('c', 12).
        assert_that(global_sum, equal_to([7, 21]), label='global sum')
        assert_that(sum_per_key,
                    equal_to([('c', 1), ('c', 21), ('d', 6)]),
                    label='sum per key')
def run(argv=None, save_main_session=True):
    # Configure the options accepted on the command line.
    parser = argparse.ArgumentParser()
    # Option for the input Pub/Sub subscription.
    parser.add_argument(
        '--input_subscription',
        required=True,
        help=('Input PubSub subscription '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    # Option for the output BigQuery dataset.
    parser.add_argument(
        '--output_dataset',
        required=True,
        help=('Output BigQuery dataset "<PROJECT>.<DATASET>"'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Build the options instance passed to the pipeline.
    # streaming=True explicitly enables the streaming job.
    pipeline_options = PipelineOptions(pipeline_args, streaming=True)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # Create the pipeline.
    with beam.Pipeline(options=pipeline_options) as p:
        subscription = known_args.input_subscription
        (bigquery_project, dataset) = known_args.output_dataset.split('.')

        rides = (p
            # Fetch messages from the Pub/Sub subscription given to
            # ReadFromPubSub().
            | 'Read From PubSub' >> ReadFromPubSub(
                subscription=subscription).with_output_types(bytes)
            # Convert each message string into a Python dictionary.
            | 'ToDict' >> beam.Map(json.loads))

        # Returns True only for pickup and dropoff elements.
        def is_pickup_or_dropoff(element):
            return element['ride_status'] in ('pickup', 'dropoff')

        rides_onoff = (rides
            # Keep only pickup/dropoff data; exclude in-progress 'enroute'
            # records.
            | 'Filter pickup/dropoff' >> beam.Filter(is_pickup_or_dropoff))

        rides_onoff_1m = (rides_onoff
            # Assign tumbling (fixed) windows.
            | 'Into 1m FixedWindow' >> beam.WindowInto(window.FixedWindows(60))
            # Count elements per ride status.
            | 'Group status by rides' >> beam.Map(
                lambda x: (x['ride_status'], 1))
            | 'Count unique elements' >> beam.combiners.Count.PerKey()
            # Attach the window's start event time to each record.
            | 'Attach window start timestamp' >> beam.ParDo(
                AttachWindowTimestamp()))

        # Use WriteToBigQuery to stream the results into BigQuery.
        rides_onoff_1m | 'Write 1m rides to BigQuery' >> WriteToBigQuery(
            'rides_1m',
            dataset=dataset,
            project=bigquery_project,
            create_disposition=BigQueryDisposition.CREATE_NEVER)

        trips_od = (
            # The PCollection restricted to pickup/dropoff data.
            rides_onoff
            # Key by ride_id, which serves as the session id for the session
            # windows.
            | 'Key-value pair with Ride_id' >> beam.Map(
                lambda x: (x['ride_id'], x))
            # Configure session windows with a 5-minute gap. If location data
            # for the same ride arrives more than 5 minutes apart, it is
            # aggregated as a separate session.
            | 'Into SessionWindows' >> beam.WindowInto(window.Sessions(5 * 60))
            | 'Group by ride_id' >> beam.GroupByKey()
            # Combining the pickup and dropoff data gathered in a session into
            # a single element is implemented in the CompileTripOD class.
            | 'Compile trip OD' >> beam.ParDo(CompileTripOD()))

        trips_od | 'Write od trips to BigQuery' >> WriteToBigQuery(
            'trips_od',
            dataset=dataset,
            project=bigquery_project,
            create_disposition=BigQueryDisposition.CREATE_NEVER)
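# A hypothetical sketch of the AttachWindowTimestamp DoFn assumed above: it
# annotates each (ride_status, count) pair with the start time of its fixed
# window. The output field names are assumptions.
import apache_beam as beam

class AttachWindowTimestamp(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        ride_status, count = element
        yield {
            'ride_status': ride_status,
            'count': count,
            # Timestamp.to_utc_datetime() converts event time to a datetime.
            'window_start': window.start.to_utc_datetime().isoformat(),
        }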
| "Decode and format json" >> beam.Map(lambda x: json.loads(x))) order_product = messages | "Extract id" >> beam.Map(lambda x: (x.get("order_id"), x)) def group_products(order_products): order_id, products = order_products output = {"order_id": str(order_id), "product": []} logging.info("order_id: {}".format(str(order_id))) for product in products: output["device_id"] = product.pop("device_id") product.pop("order_id") output["product"] = output["product"] + [product] return output orders = (order_product | beam.WindowInto(window.Sessions(500)) | "Group by order" >> beam.GroupByKey() | "Join orders" >> beam.Map(group_products)) # output = (orders # | "Format orders" >> beam.Map(format_orders)) orders | WriteToBigQuery( args.table, args.dataset, write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND) p.run()
def main(unused_argv):
    log_analysis_pipeline = beam.Pipeline(options=PipelineOptions())
    pipeline_input_logs = get_log_file()

    # Pipeline for analyzing logs.
    sessionization = (log_analysis_pipeline
        | 'Read Logs' >> beam.io.ReadFromText(pipeline_input_logs)
        | 'Parse Logs' >> beam.ParDo(ParseLBLogs())
        | 'Add Windowing by Event Timestamp' >> beam.Map(
            lambda x: beam.window.TimestampedValue(x, x['timestamp']))
        | 'Client IP representing User as Key' >> beam.Map(
            lambda x: (x['user'], x))
        | 'Compute User Session Window' >> beam.WindowInto(
            window.Sessions(FLAGS.session_gap),
            timestamp_combiner=window.TimestampCombiner.OUTPUT_AT_EOW)
        | 'Group by Client IP' >> beam.GroupByKey()
        | 'Sessionize by IP' >> beam.FlatMap(sessionize))

    sessions_to_file = (sessionization
        | 'Create DataFrames' >> beam.Map(convert_to_dataframe)
        | 'Global Window' >> beam.WindowInto(beam.window.GlobalWindows())
        | 'Combine To List' >> beam.combiners.ToList()
        # This step would be replaced by a write IO to an external sink in a
        # production pipeline.
        | 'Merge DataFrames' >> beam.ParDo(MergeDataframes())
        | 'Write results' >> beam.io.WriteToText(
            'session_object', num_shards=1))

    average_session_time = (sessionization
        | 'ExtractAndSumSessions' >> ExtractAndSumSessions('user')
        | 'format_user_engagement' >> beam.Map(format_user_engagement)
        | 'Combine global session durations' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).without_defaults()
        | 'Format Avg Session Length' >> beam.Map(
            lambda elem: {'average_session_duration': float(elem)})
        | 'Print average session duration as output' >> beam.Map(print))

    # Pipeline for computing the top n users.
    top_n_pipeline = beam.Pipeline(options=PipelineOptions())
    order_by_session = (top_n_pipeline
        | 'Read Parquet File' >> beam.io.ReadFromParquet(FLAGS.input_path)
        | 'User and Session and KV pair' >> beam.FlatMap(
            lambda row: [(str(row['user']), row['session_duration'])])
        | 'Apply Fixed Window' >> beam.WindowInto(
            beam.window.FixedWindows(size=10 * 60))
        | 'Top 10 scores' >> beam.CombineGlobally(
            beam.combiners.TopCombineFn(
                n=10, compare=lambda a, b: a[1] < b[1])).without_defaults()
        | 'Print results' >> beam.ParDo(PrintTopNFn()))

    # Select which pipeline to run based on user-specified input.
    # Start the top n users pipeline.
    if FLAGS.input_path:
        top_n_pipeline.run().wait_until_finish()
    # Start the log analysis pipeline.
    if FLAGS.output_path:
        log_analysis_pipeline.run().wait_until_finish()
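# A hypothetical sketch of the sessionize function used in 'Sessionize by IP'
# above: it collapses a (user, [log records]) group into one session record.
# The 'timestamp' field name follows what the pipeline already reads, and
# 'session_duration' matches the field the top-n pipeline consumes; the rest
# is an assumption.
def sessionize(element):
    user, records = element
    timestamps = sorted(r['timestamp'] for r in records)
    yield {
        'user': user,
        'session_duration': timestamps[-1] - timestamps[0],
        'num_requests': len(timestamps),
    }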
def Run(argv=None):
    known_args, pipeline_args = ParseArgs(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)
    if known_args.topic:
        pipeline_options.view_as(StandardOptions).streaming = True
    project = pipeline_options.view_as(GoogleCloudOptions).project
    timestamp_attribute = 'timestamp_ms'

    if not known_args.topic or not known_args.play_topic:
        logging.fatal('topic and play_topic are required.')

    events = (p
        | 'read_events' >> ReadFromPubSub(
            topic=known_args.topic, timestamp_attribute=timestamp_attribute)
        | 'parse_events' >> beam.ParDo(ParseEventFn()))
    play_events = (p
        | 'read_play_events' >> ReadFromPubSub(
            topic=known_args.play_topic,
            timestamp_attribute=timestamp_attribute)
        | 'parse_play_events' >> beam.ParDo(ParsePlayEventFn()))

    sessionized_events = (events
        | 'key_events_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_events' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))
    sessionized_plays = (play_events
        | 'key_plays_by_id' >> beam.Map(lambda x: (x.event_id, x))
        | 'sessionize_plays' >> beam.WindowInto(
            window.Sessions(float(known_args.session_gap))))

    per_user_latency = ({'plays': sessionized_plays,
                         'events': sessionized_events}
        | 'cbk' >> beam.CoGroupByKey()
        | 'compute_latency' >> beam.ParDo(ComputeLatency()))

    mean_latency = (per_user_latency
        | 'extract_latencies' >> beam.Values()
        | 'global_window' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.Repeatedly(trigger.AfterCount(1000)),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'compute_mean' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()).with_fanout(16)
            .as_singleton_view())

    _ = (per_user_latency
        | 'detect_bad_users' >> beam.ParDo(
            DetectBadUsers(), mean_latency=mean_latency)
        | 'filter_duplicates' >> beam.WindowInto(
            window.GlobalWindows(),
            trigger=trigger.AfterCount(1),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'to_bq_schema' >> beam.Map(lambda x: {'user': x})
        | 'write_bad_users' >> beam.io.WriteToBigQuery(
            known_args.output_tablename,
            known_args.output_dataset,
            project,
            'user:string'))

    p.run().wait_until_finish()
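# A hypothetical sketch of the ComputeLatency DoFn assumed above: after
# CoGroupByKey each element is (event_id, {'plays': [...], 'events': [...]}),
# and the sketch pairs them up to emit (user, latency_seconds). The attribute
# names mirror the ones the pipeline already reads (event_id, user, and an
# assumed event-time timestamp).
import apache_beam as beam

class ComputeLatency(beam.DoFn):
    def process(self, element):
        event_id, grouped = element
        for event in grouped['events']:
            for play in grouped['plays']:
                # Latency is the gap between the play and its game event.
                yield (event.user, event.timestamp - play.timestamp)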
def run(argv=None):
    # Add command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output',
        required=True,
        help='Output BigQuery table for results specified as: '
             'PROJECT:DATASET.TABLE or DATASET.TABLE.')
    parser.add_argument(
        '--input_subscription',
        required=True,
        help='Input PubSub subscription of the form '
             '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".')
    parser.add_argument(
        '--output_subscription',
        required=True,
        help='Output PubSub subscription of the form '
             '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Set pipeline options. We use the save_main_session option because one
    # or more DoFns in this workflow rely on global context (e.g., a module
    # imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Main pipeline: read in logs, write them to BigQuery.
    message_table = 'logs'
    messages = (p
        | 'Read from PubSub' >> beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription
        ).with_output_types(bytes)
        | 'Decode messages' >> beam.Map(lambda x: x.decode('utf-8'))
        | 'Parse messages to Logs' >> beam.ParDo(MessageToLog())
        | 'Detect language' >> beam.ParDo(TranslateMessage()))

    (messages
     | 'Convert Log to BigQuery records' >> beam.Map(
         json_to_bqrecords.json_to_bqrecord)
     | 'Write Logs to BigQuery' >> beam.io.WriteToBigQuery(
         known_args.output + message_table,
         schema=json_schema.log_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    # Calculate aggregates per language, write to BigQuery.
    language_aggregate_table = 'languages'
    languages = (messages
        | 'Extract language tuple' >> beam.Map(
            lambda x: (x.translate_language, x))
        | 'Assign Fixed Windows' >> beam.WindowInto(
            window.FixedWindows(60, 0),
            trigger=trigger.AfterWatermark(),
            accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
        | 'GroupByKey Languages' >> beam.GroupByKey()
        | 'Count languages' >> beam.ParDo(LanguageAggregate()))

    (languages
     | 'Convert language aggregate to BigQuery records' >> beam.Map(
         json_to_bqrecords.language_aggregate_to_bqrecords)
     | 'Write LanguageAggregate to BigQuery' >> beam.io.WriteToBigQuery(
         known_args.output + language_aggregate_table,
         schema=json_schema.language_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    (languages
     | 'Convert language aggregate to PubSub message' >> beam.Map(
         json_to_bqrecords.language_aggregate_to_pubsubmessage)
     | 'Encode' >> beam.Map(
         lambda x: json.dumps(x, ensure_ascii=False).encode('utf-8')
     ).with_output_types(bytes)
     | 'Write LanguageAggregate to PubSub' >> beam.io.WriteToPubSub(
         known_args.output_subscription))

    # Calculate aggregates per user, write to BigQuery.
    user_aggregate_table = 'users'
    (messages
     | 'Extract user tuple' >> beam.Map(lambda x: (x.user_id, x))
     | 'Assign Sessions' >> beam.WindowInto(
         window.Sessions(30),
         trigger=trigger.AfterWatermark(),
         accumulation_mode=trigger.AccumulationMode.ACCUMULATING)
     | 'GroupByKey Users' >> beam.GroupByKey()
     | 'Count user' >> beam.ParDo(UserAggregate())
     | 'Convert user aggregate to BigQuery records' >> beam.Map(
         json_to_bqrecords.user_aggregate_to_bqrecords)
     | 'Write UserAggregate to BigQuery' >> beam.io.WriteToBigQuery(
         known_args.output + user_aggregate_table,
         schema=json_schema.user_table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    result = p.run()
    result.wait_until_finish()
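# A hypothetical sketch of the UserAggregate DoFn from the session branch
# above: it reduces each (user_id, [logs in session]) group to a per-session
# count. The output field names are illustrative.
import apache_beam as beam

class UserAggregate(beam.DoFn):
    def process(self, element, window=beam.DoFn.WindowParam):
        user_id, logs = element
        yield {
            'user_id': user_id,
            'message_count': len(list(logs)),
            'session_start': window.start.to_utc_datetime().isoformat(),
        }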
def run(argv=None): """Main entry point; defines and runs the visitor analysis pipeline.""" parser = argparse.ArgumentParser() parser.add_argument('--input', dest='input', default='data/sample.tsv', # default='gs://visit-analysis/raw-data/encoded_feeds/*', help='Input file to process.') parser.add_argument('--output', dest='output', # CHANGE 1/5: The Google Cloud Storage path is required # for outputting the results. default='data/output/', # default='gs://visit-analysis/new-visits/', help='Output path to write results to.') parser.add_argument('--runner', dest='runner', default='DataflowRunner', required='True', help='DirectRunner or DataflowRunner') known_args, pipeline_args = parser.parse_known_args(argv) pipeline_args.extend([ # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to # run your pipeline on the Google Cloud Dataflow Service. '--runner=' + known_args.runner, # CHANGE 3/5: Your project ID is required in order to run your pipeline on # the Google Cloud Dataflow Service. '--project=test-r-big-query', # CHANGE 4/5: Your Google Cloud Storage path is required for staging local # files. '--staging_location=gs://feeddata-test-konos-1/visitor/staging/', # CHANGE 5/5: Your Google Cloud Storage path is required for temporary # files. '--temp_location=gs://feeddata-test-konos-1/visitor/tmp/', '--job_name=visitor_analysis', ]) pipeline_options = PipelineOptions(pipeline_args) pipeline_options.view_as(SetupOptions).save_main_session = True session_timeout_seconds = int(60 * 30) with beam.Pipeline(options=pipeline_options) as p: # Initial pipeline - Common processing for all exports data = (p | 'Read data' >> ReadFromText(known_args.input) | 'Filter & Extract data' >> beam.ParDo(extract_data())) visitor_data = data | 'Extract Visitor information' >> beam.ParDo(extract_visitor_data()) data = (data | 'Add timestamp' >> beam.ParDo(AddTimestampDoFn()) | 'Re-assess Sessions: ' + str(session_timeout_seconds) + ' seconds timeout' >> beam.WindowInto(window.Sessions(session_timeout_seconds)) | 'Group data' >> beam.GroupByKey() | 'Calculate visit timestamps' >> beam.ParDo(calc_timestamps_group_hits_by_visit())) # Duplicate formated data into two streams for separate additional processing hit_data = data visit_data = data # Start processing for hits/visits visit_data = visit_data | 'Extract Visit information' >> beam.ParDo(extract_visit_data()) hit_data = hit_data | 'Extract Hit information' >> beam.ParDo(extract_hit_data()) hit_data = hit_data | 'Split hits in multiple lines' >> beam.ParDo(split_hits_into_lines()) hit_data = hit_data | 'Final format - Hits' >> beam.ParDo(reformat_into_csv_hits()) hit_data | 'Output - Hits' >> WriteToText(known_args.output + 'hits/hits.csv') visit_data = visit_data | 'Final format - Visits' >> beam.ParDo(reformat_into_csv_visits()) visit_data | 'Output - Visits' >> WriteToText(known_args.output + 'visits/visits.csv') visitor_data | 'Output - Visitors' >> WriteToText(known_args.output + 'visitors/visitors.csv')