Example #1
        class GenerateRecords(beam.DoFn):

            EMIT_TIMER = TimerSpec('emit_timer', TimeDomain.REAL_TIME)
            COUNT_STATE = CombiningValueStateSpec('count_state', VarIntCoder(),
                                                  CountCombineFn())

            def __init__(self, frequency, total_records):
                self.total_records = total_records
                self.frequency = frequency

            def process(self,
                        element,
                        emit_timer=beam.DoFn.TimerParam(EMIT_TIMER)):
                # Processing time timers should be set on ABSOLUTE TIME.
                emit_timer.set(self.frequency)
                yield element[1]

            @on_timer(EMIT_TIMER)
            def emit_values(self,
                            emit_timer=beam.DoFn.TimerParam(EMIT_TIMER),
                            count_state=beam.DoFn.StateParam(COUNT_STATE)):
                count = count_state.read() or 0
                if self.total_records == count:
                    return

                count_state.add(1)
                # Processing time timers should be set on ABSOLUTE TIME.
                emit_timer.set(count + 1 + self.frequency)
                yield 'value'
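A minimal sketch of how this DoFn might be wired into a pipeline (assumed wiring, not part of the original example). Stateful DoFns need a keyed input, and processing-time timers only fire on runners that support them:

# Hypothetical wiring; 'Seed' and the (key, value) element are illustrative only.
import apache_beam as beam

with beam.Pipeline() as p:
    _ = (p
         | 'Seed' >> beam.Create([('key', 0)])
         | 'Generate' >> beam.ParDo(
             GenerateRecords(frequency=10, total_records=100)))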
Example #2
def _pardo_group_into_batches(batch_size, input_coder):
  ELEMENT_STATE = BagStateSpec('values', input_coder)
  COUNT_STATE = CombiningValueStateSpec('count', input_coder, CountCombineFn())
  EXPIRY_TIMER = TimerSpec('expiry', TimeDomain.WATERMARK)

  class _GroupIntoBatchesDoFn(DoFn):

    def process(self, element,
                window=DoFn.WindowParam,
                element_state=DoFn.StateParam(ELEMENT_STATE),
                count_state=DoFn.StateParam(COUNT_STATE),
                expiry_timer=DoFn.TimerParam(EXPIRY_TIMER)):
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      expiry_timer.set(window.end)
      element_state.add(element)
      count_state.add(1)
      count = count_state.read()
      if count >= batch_size:
        batch = [element for element in element_state.read()]
        yield batch
        element_state.clear()
        count_state.clear()

    @on_timer(EXPIRY_TIMER)
    def expiry(self, element_state=DoFn.StateParam(ELEMENT_STATE),
               count_state=DoFn.StateParam(COUNT_STATE)):
      batch = [element for element in element_state.read()]
      if batch:
        yield batch
        element_state.clear()
        count_state.clear()

  return _GroupIntoBatchesDoFn()
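A minimal usage sketch, assuming a keyed PCollection named keyed_pcoll and its element coder element_coder (both hypothetical names, not from the original snippet):

# Apply the returned stateful DoFn to a keyed PCollection of (key, value) pairs.
batched = (
    keyed_pcoll
    | 'GroupIntoBatches' >> beam.ParDo(
        _pardo_group_into_batches(batch_size=100, input_coder=element_coder)))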
Example #3
def _pardo_group_into_batches(
    input_coder, batch_size, max_buffering_duration_secs, clock=time.time):
  ELEMENT_STATE = BagStateSpec('values', input_coder)
  COUNT_STATE = CombiningValueStateSpec('count', input_coder, CountCombineFn())
  WINDOW_TIMER = TimerSpec('window_end', TimeDomain.WATERMARK)
  BUFFERING_TIMER = TimerSpec('buffering_end', TimeDomain.REAL_TIME)

  class _GroupIntoBatchesDoFn(DoFn):
    def process(
        self,
        element,
        window=DoFn.WindowParam,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        window_timer=DoFn.TimerParam(WINDOW_TIMER),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      window_timer.set(window.end)
      element_state.add(element)
      count_state.add(1)
      count = count_state.read()
      if count == 1 and max_buffering_duration_secs > 0:
        # This is the first element in batch. Start counting buffering time if a
        # limit was set.
        # pylint: disable=deprecated-method
        buffering_timer.set(clock() + max_buffering_duration_secs)
      if count >= batch_size:
        return self.flush_batch(element_state, count_state, buffering_timer)

    @on_timer(WINDOW_TIMER)
    def on_window_timer(
        self,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      return self.flush_batch(element_state, count_state, buffering_timer)

    @on_timer(BUFFERING_TIMER)
    def on_buffering_timer(
        self,
        element_state=DoFn.StateParam(ELEMENT_STATE),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      return self.flush_batch(element_state, count_state, buffering_timer)

    def flush_batch(self, element_state, count_state, buffering_timer):
      batch = [element for element in element_state.read()]
      if not batch:
        return
      key, _ = batch[0]
      batch_values = [v for (k, v) in batch]
      element_state.clear()
      count_state.clear()
      buffering_timer.clear()
      yield key, batch_values

  return _GroupIntoBatchesDoFn()
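The same wiring works for this variant; max_buffering_duration_secs bounds how long a partial batch is held before the processing-time timer flushes it, and the clock argument can be overridden in tests. Names below are again hypothetical:

# Sketch only: keyed_pcoll and element_coder are assumed to exist.
batched = (
    keyed_pcoll
    | 'GroupIntoBatches' >> beam.ParDo(
        _pardo_group_into_batches(
            input_coder=element_coder,
            batch_size=100,
            max_buffering_duration_secs=5)))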
Example #4
    def expand(self, pcoll):

        output = (pcoll
                  | "ParseJson" >> beam.ParDo(JsonToTaxiRide())
                  | "FilterForPickups" >>
                  beam.Filter(lambda x: x.ride_status == 'pickup')
                  | "WindowByMinute" >> beam.WindowInto(
                      beam.window.FixedWindows(60),
                      trigger=AfterWatermark(late=AfterCount(1)),
                      allowed_lateness=60,
                      accumulation_mode=AccumulationMode.ACCUMULATING)
                  | "CountPerMinute" >> beam.CombineGlobally(
                      CountCombineFn()).without_defaults())

        return output
Example #5
    class StatefulBufferingFn(DoFn):
        BUFFER_STATE = BagStateSpec('buffer', StrUtf8Coder())
        COUNT_STATE = userstate.CombiningValueStateSpec(
            'count', VarIntCoder(), CountCombineFn())

        def process(self,
                    element,
                    buffer_state=beam.DoFn.StateParam(BUFFER_STATE),
                    count_state=beam.DoFn.StateParam(COUNT_STATE)):

            key, value = element
            try:
                index_value = list(buffer_state.read()).index(value)
            except ValueError:
                index_value = -1
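            # A negative index means the value has not been seen for this key
            # yet: remember it and assign the next running index from the
            # counter state.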
            if index_value < 0:
                buffer_state.add(value)
                index_value = count_state.read()
                count_state.add(1)

            # print(value, list(buffer_state.read()).index(value), list(buffer_state.read()))
            yield ('{}_{}'.format(value, index_value), 1)
Example #6
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path',
                        required=True,
                        help='Path to events.json')
    parser.add_argument('--table_name',
                        required=True,
                        help='BigQuery table name')

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-user-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {
                "name": "user_id",
                "type": "STRING"
            },
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "total_bytes",
                "type": "INTEGER"
            },
            {
                "name": "max_bytes",
                "type": "INTEGER"
            },
            {
                "name": "min_bytes",
                "type": "INTEGER"
            },
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
     | 'PerUserAggregations' >> beam.GroupBy('user_id').aggregate_field(
         'user_id', CountCombineFn(), 'page_views').aggregate_field(
             'num_bytes', sum, 'total_bytes').aggregate_field(
                 'num_bytes', max, 'max_bytes').aggregate_field(
                     'num_bytes', min,
                     'min_bytes').with_output_types(PerUserAggregation)
     | 'ToDict' >> beam.Map(to_dict)
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
Example #7
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from Json into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner',
                        required=True,
                        help='Specify Apache Beam Runner')
    parser.add_argument('--input_path',
                        required=True,
                        help='Path to events.json')
    parser.add_argument('--table_name',
                        required=True,
                        help='BigQuery table name')

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format(
        'batch-minute-traffic-pipeline-', time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_path = opts.input_path
    table_name = opts.table_name

    # Table schema for BigQuery
    table_schema = {
        "fields": [
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)

    (p | 'ReadFromGCS' >> beam.io.ReadFromText(input_path)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog)
     | 'AddEventTimestamp' >> beam.Map(add_timestamp)
     | "WindowByMinute" >> beam.WindowInto(beam.window.FixedWindows(60))
     | "CountPerMinute" >> beam.CombineGlobally(
         CountCombineFn()).without_defaults()
     | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
     | 'WriteToBQ' >> beam.io.WriteToBigQuery(
         table_name,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
Example #8
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from PubSub into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--accum_mode',
                        required=True,
                        help='Accumulation mode for pipeline')

    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)

    options.view_as(
        GoogleCloudOptions).job_name = f"{opts.accum_mode}-{time.time_ns()}"
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {
                "name": "taxi_events",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
        ]
    }

    input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
    output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

    if opts.accum_mode == 'accumulating':
        accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
    elif opts.accum_mode == 'discarding':
        accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING
    else:
        raise ValueError(
            "Invalid accumulation mode value. Use 'accumulating' or 'discarding'.")

    p = beam.Pipeline(options=options)

    (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
     | 'WindowByMinute' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=AfterWatermark(early=AfterProcessingTime(10)),
         accumulation_mode=accum_mode)
     | "CountPerMinute" >> beam.CombineGlobally(
         CountCombineFn()).without_defaults()
     | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         output_table,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
Example #9
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(description='Load from Json from Pub/Sub into BigQuery')
    parser.add_argument('--project',required=True, help='Specify Google Cloud project')
    parser.add_argument('--region', required=True, help='Specify Google Cloud region')
    parser.add_argument('--staging_location', required=True, help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location', required=True, help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--runner', required=True, help='Specify Apache Beam Runner')
    parser.add_argument('--input_topic', required=True, help='Input Pub/Sub Topic')
    parser.add_argument('--agg_table_name', required=True, help='BigQuery table name for aggregate results')
    parser.add_argument('--raw_table_name', required=True, help='BigQuery table name for raw inputs')
    parser.add_argument('--window_duration', required=True, help='Window duration')

    opts = parser.parse_args()

    # Setting up the Beam pipeline options
    options = PipelineOptions(save_main_session=True, streaming=True)
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(GoogleCloudOptions).job_name = '{0}{1}'.format('streaming-minute-traffic-pipeline-',time.time_ns())
    options.view_as(StandardOptions).runner = opts.runner

    input_topic = opts.input_topic
    raw_table_name = opts.raw_table_name
    agg_table_name = opts.agg_table_name
    window_duration = opts.window_duration

    # Table schema for BigQuery
    agg_table_schema = {
        "fields": [
            {
                "name": "page_views",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },

        ]
    }

    raw_table_schema = {
        "fields": [
            {
                "name": "ip",
                "type": "STRING"
            },
            {
                "name": "user_id",
                "type": "STRING"
            },
            {
                "name": "user_agent",
                "type": "STRING"
            },
            {
                "name": "lat",
                "type": "FLOAT",
                "mode": "NULLABLE"
            },
            {
                "name": "lng",
                "type": "FLOAT",
                "mode": "NULLABLE"
            },
            {
                "name": "event_timestamp",
                "type": "STRING"
            },
            {
                "name": "processing_timestamp",
                "type": "STRING"
            },
            {
                "name": "http_request",
                "type": "STRING"
            },
            {
                "name": "http_response",
                "type": "INTEGER"
            },
            {
                "name": "num_bytes",
                "type": "INTEGER"
            }
        ]
    }

    # Create the pipeline
    p = beam.Pipeline(options=options)



    parsed_msgs = (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
                     | 'ParseJson' >> beam.Map(parse_json).with_output_types(CommonLog))
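    # The parsed messages feed two branches below: raw events are appended to
    # one BigQuery table, while per-minute counts go to a second table.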

    (parsed_msgs
        | "AddProcessingTimestamp" >> beam.Map(add_processing_timestamp)
        | 'WriteRawToBQ' >> beam.io.WriteToBigQuery(
            raw_table_name,
            schema=raw_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
        )

    (parsed_msgs
        | "WindowByMinute" >> beam.WindowInto(beam.window.FixedWindows(60))
        | "CountPerMinute" >> beam.CombineGlobally(CountCombineFn()).without_defaults()
        | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
        | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
            agg_table_name,
            schema=agg_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
            )
    )

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run().wait_until_finish()
Example #10
def _pardo_group_into_batches_with_multi_bags(
  input_coder, batch_size, max_buffering_duration_secs, clock=time.time):
  ELEMENT_STATE_0 = BagStateSpec('values0', input_coder)
  ELEMENT_STATE_1 = BagStateSpec('values1', input_coder)
  ELEMENT_STATE_2 = BagStateSpec('values2', input_coder)
  ELEMENT_STATE_3 = BagStateSpec('values3', input_coder)
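  # Elements are spread round-robin across these four bag states (count % 4 in
  # process below) and recombined into a single batch in flush_batch.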
  COUNT_STATE = CombiningValueStateSpec('count', input_coder, CountCombineFn())
  WINDOW_TIMER = TimerSpec('window_end', TimeDomain.WATERMARK)
  BUFFERING_TIMER = TimerSpec('buffering_end', TimeDomain.REAL_TIME)

  class _GroupIntoBatchesDoFnWithMultiBags(DoFn):
    def process(
        self,
        element,
        window=DoFn.WindowParam,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        window_timer=DoFn.TimerParam(WINDOW_TIMER),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):
      # Allowed lateness not supported in Python SDK
      # https://beam.apache.org/documentation/programming-guide/#watermarks-and-late-data
      window_timer.set(window.end)

      count_state.add(1)
      count = count_state.read()

      element_states = [element_state_0, element_state_1, element_state_2, element_state_3]
      element_states[count % 4].add(element)

      if count == 1 and max_buffering_duration_secs > 0:
        # This is the first element in batch. Start counting buffering time if a
        # limit was set.
        buffering_timer.set(clock() + max_buffering_duration_secs)
      if count >= batch_size:
        return self.flush_batch(element_states, count_state, buffering_timer)

    @on_timer(WINDOW_TIMER)
    def on_window_timer(
        self,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):

      element_states = [element_state_0, element_state_1, element_state_2, element_state_3]
      return self.flush_batch(element_states, count_state, buffering_timer)

    @on_timer(BUFFERING_TIMER)
    def on_buffering_timer(
        self,
        element_state_0=DoFn.StateParam(ELEMENT_STATE_0),
        element_state_1=DoFn.StateParam(ELEMENT_STATE_1),
        element_state_2=DoFn.StateParam(ELEMENT_STATE_2),
        element_state_3=DoFn.StateParam(ELEMENT_STATE_3),
        count_state=DoFn.StateParam(COUNT_STATE),
        buffering_timer=DoFn.TimerParam(BUFFERING_TIMER)):

      element_states = [element_state_0, element_state_1, element_state_2, element_state_3]
      return self.flush_batch(element_states, count_state, buffering_timer)

    def flush_batch(self, element_states, count_state, buffering_timer):
      batch_values = []
      for element_state in element_states:
        for k, v in element_state.read():
          key = k
          batch_values.append(v)
        element_state.clear()

      count_state.clear()
      buffering_timer.clear()

      if not batch_values:
        return

      yield key, batch_values

  return _GroupIntoBatchesDoFnWithMultiBags()