Example #1
 def test_equal_to_per_window_fail_unmatched_window(self):
     with self.assertRaises(BeamAssertException):
         expected = {
             window.IntervalWindow(50, 100): [('k', [1])],
         }
         with TestPipeline(options=StandardOptions(streaming=True)) as p:
             assert_that(
                 (p
                  | Create([1])
                  | beam.WindowInto(
                      FixedWindows(20),
                      trigger=trigger.AfterWatermark(),
                      accumulation_mode=trigger.AccumulationMode.DISCARDING)
                  | beam.Map(lambda x: ('k', x))
                  | beam.GroupByKey()),
                 equal_to_per_window(expected),
                 reify_windows=True)
Example #2
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--stream',
                        type=str,
                        help='Pub/Sub topic to read from')
    parser.add_argument(
        '--sink',
        help=('Output BigQuery table for windowed averages specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))

    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=options)

    records = (p |
               'Read from PubSub' >> beam.io.ReadFromPubSub(topic=args.stream)
               | 'Parse JSON to Dict' >> beam.Map(json.loads))
    """
            # Write to the warehouse table
    records | 'Write to BigQuery' >> beam.io.WriteToBigQuery(
        args.sink,
        schema=Schema.get_warehouse_schema(),
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    """

    # Compute average in a sliding window and write to BQ average table
    (records | 'Add timestamp' >> beam.ParDo(AddTimestampToDict())
     | 'Window' >> beam.WindowInto(beam.window.SlidingWindows(10, 1, offset=0))
     | 'Dict to KeyValue' >> beam.ParDo(AddKeyToDict())
     | 'Group by Key' >> beam.GroupByKey()
     | 'Average' >> beam.ParDo(CountAverages())
     | 'Write Avg to BigQuery' >> beam.io.WriteToBigQuery(
         args.sink,
         schema=Schema.get_warehouse_schema(),
         create_disposition=BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=BigQueryDisposition.WRITE_APPEND))

    result = p.run()
    result.wait_until_finish()
Example #3
 def test_setting_session_windows(self):
   with TestPipeline() as p:
     unkeyed_items = p | beam.Create([2, 11, 16, 27])
     items = (unkeyed_items
              | 'key' >> beam.Map(
                  lambda x: beam.window.TimestampedValue(('k', x), x)))
     # [START setting_session_windows]
     from apache_beam import window
     session_windowed_items = (
         items | 'window' >> beam.WindowInto(window.Sessions(10)))
     # [END setting_session_windows]
     summed = (session_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     assert_that(unkeyed,
                 equal_to([29, 27]))
Example #4
    def _write_files(self, destination_data_kv_pc, file_prefix_pcv):
        outputs = (destination_data_kv_pc
                   | beam.ParDo(WriteRecordsToFile(
                       max_files_per_bundle=self.max_files_per_bundle,
                       max_file_size=self.max_file_size,
                       coder=self.coder),
                                file_prefix=file_prefix_pcv).with_outputs(
                                    WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                                    WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination-shard is written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        if self.is_streaming_pipeline:
            # Apply the user's trigger back before we start triggering load jobs
            all_destination_file_pairs_pc = (
                all_destination_file_pairs_pc
                | "ApplyUserTrigger" >> beam.WindowInto(
                    beam.window.GlobalWindows(),
                    trigger=trigger.Repeatedly(
                        trigger.AfterAll(
                            trigger.AfterProcessingTime(
                                self.triggering_frequency),
                            trigger.AfterCount(1))),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING))
        return all_destination_file_pairs_pc
Example #5
 def test_reshuffle_global_window(self):
     pipeline = TestPipeline()
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
     before_reshuffle = (pipeline
                         | beam.Create(data)
                         | beam.WindowInto(GlobalWindows())
                         | beam.GroupByKey()
                         | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(before_reshuffle,
                 equal_to(expected_data),
                 label='before_reshuffle')
     after_reshuffle = before_reshuffle | beam.Reshuffle()
     assert_that(after_reshuffle,
                 equal_to(expected_data),
                 label='after reshuffle')
     pipeline.run()
Example #6
 def test_after_count(self):
   p = Pipeline('DirectRunner')
   result = (p
             | beam.Create([1, 2, 3, 4, 5, 10, 11])
             | beam.FlatMap(lambda t: [('A', t), ('B', t + 5)])
             | beam.MapTuple(lambda k, t: TimestampedValue((k, t), t))
             | beam.WindowInto(FixedWindows(10), trigger=AfterCount(3),
                               accumulation_mode=AccumulationMode.DISCARDING)
             | beam.GroupByKey()
             | beam.MapTuple(lambda k, v: ('%s-%s' % (k, len(v)), set(v))))
   assert_that(result, equal_to(
       {
           'A-5': {1, 2, 3, 4, 5},
           # A-10, A-11 never emitted due to AfterCount(3) never firing.
           'B-4': {6, 7, 8, 9},
           'B-3': {10, 15, 16},
       }.items()))
   p.run()
Example #7
 def test_reshuffle_streaming_global_window(self):
   options = PipelineOptions()
   options.view_as(StandardOptions).streaming = True
   with TestPipeline(options=options) as pipeline:
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
     before_reshuffle = (
         pipeline
         | beam.Create(data)
         | beam.WindowInto(GlobalWindows())
         | beam.GroupByKey()
         | beam.MapTuple(lambda k, vs: (k, sorted(vs))))
     assert_that(
         before_reshuffle, equal_to(expected_data), label='before_reshuffle')
     after_reshuffle = before_reshuffle | beam.Reshuffle()
     assert_that(
         after_reshuffle, equal_to(expected_data), label='after reshuffle')
Example #8
 def test_setting_fixed_windows(self):
     with TestPipeline() as p:
         unkeyed_items = p | beam.Create([22, 33, 55, 100, 115, 120])
         items = (unkeyed_items
                  |
                  'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                      ('k', x), x)))
         # [START setting_fixed_windows]
         from apache_beam import window
         fixed_windowed_items = (
             items | 'window' >> beam.WindowInto(window.FixedWindows(60)))
         # [END setting_fixed_windows]
         summed = (fixed_windowed_items
                   | 'group' >> beam.GroupByKey()
                   | 'combine' >> beam.CombineValues(sum))
         unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
         assert_that(unkeyed, equal_to([110, 215, 120]))
Example #9
def load(events, metadata=None):
  return (
      events
      | nexmark_query_util.JustBids()
      | 'query12_extract_bidder' >> beam.Map(lambda bid: bid.bidder)
      # windowing with processing time trigger, currently not supported in batch
      | beam.WindowInto(
          window.GlobalWindows(),
          trigger=trigger.Repeatedly(
              trigger.AfterProcessingTime(metadata.get('window_size_sec'))),
          accumulation_mode=trigger.AccumulationMode.DISCARDING,
          allowed_lateness=0)
      | 'query12_bid_count' >> beam.combiners.Count.PerElement()
      | 'query12_output' >> beam.Map(
          lambda t: {
              ResultNames.BIDDER_ID: t[0], ResultNames.BID_COUNT: t[1]
          }))
Example #10
 def test_setting_global_window(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([2, 11, 16, 27])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_global_window]
     from apache_beam import window
     session_windowed_items = (
         items | 'window' >> beam.WindowInto(window.GlobalWindows()))
     # [END setting_global_window]
     summed = (session_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     assert_that(unkeyed, equal_to([56]))
     p.run()
Example #11
 def test_setting_sliding_windows(self):
     p = TestPipeline()
     unkeyed_items = p | beam.Create([2, 16, 23])
     items = (unkeyed_items
              | 'key' >> beam.Map(lambda x: beam.window.TimestampedValue(
                  ('k', x), x)))
     # [START setting_sliding_windows]
     from apache_beam import window
     sliding_windowed_items = (
         items | 'window' >> beam.WindowInto(window.SlidingWindows(30, 5)))
     # [END setting_sliding_windows]
     summed = (sliding_windowed_items
               | 'group' >> beam.GroupByKey()
               | 'combine' >> beam.CombineValues(sum))
     unkeyed = summed | 'unkey' >> beam.Map(lambda x: x[1])
     assert_that(unkeyed, equal_to([2, 2, 2, 18, 23, 39, 39, 39, 41, 41]))
     p.run()
Example #12
def side_input_slow_update(
    src_file_pattern,
    first_timestamp,
    last_timestamp,
    interval,
    sample_main_input_elements,
    main_input_windowing_interval):
  # [START SideInputSlowUpdateSnip1]
  from apache_beam.transforms.periodicsequence import PeriodicImpulse
  from apache_beam.transforms.window import TimestampedValue
  from apache_beam.transforms import window

  # from apache_beam.utils.timestamp import MAX_TIMESTAMP
  # last_timestamp = MAX_TIMESTAMP to go on indefinitely

  # Any user-defined function.
  # cross join is used as an example.
  def cross_join(left, rights):
    for x in rights:
      yield (left, x)

  # Create pipeline.
  pipeline_options = PipelineOptions()
  p = beam.Pipeline(options=pipeline_options)
  side_input = (
      p
      | 'PeriodicImpulse' >> PeriodicImpulse(
          first_timestamp, last_timestamp, interval, True)
      | 'MapToFileName' >> beam.Map(lambda x: src_file_pattern + str(x))
      | 'ReadFromFile' >> beam.io.ReadAllFromText())

  main_input = (
      p
      | 'MpImpulse' >> beam.Create(sample_main_input_elements)
      |
      'MapMpToTimestamped' >> beam.Map(lambda src: TimestampedValue(src, src))
      | 'WindowMpInto' >> beam.WindowInto(
          window.FixedWindows(main_input_windowing_interval)))

  result = (
      main_input
      | 'ApplyCrossJoin' >> beam.FlatMap(
          cross_join, rights=beam.pvalue.AsIter(side_input)))
  # [END SideInputSlowUpdateSnip1]

  return p, result
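
A hypothetical way to call the function above, illustrating the MAX_TIMESTAMP comment: with last_timestamp = MAX_TIMESTAMP the PeriodicImpulse keeps firing, so the side input is refreshed indefinitely. The file pattern, interval, and sample elements below are placeholder values, not from the original snippet.

import time

from apache_beam.utils.timestamp import MAX_TIMESTAMP

# Placeholder arguments for illustration only.
p, result = side_input_slow_update(
    src_file_pattern='gs://my-bucket/side_input_',  # hypothetical pattern
    first_timestamp=time.time(),
    last_timestamp=MAX_TIMESTAMP,  # keep emitting impulses indefinitely
    interval=60,                   # refresh the side input every 60 seconds
    sample_main_input_elements=list(range(100)),
    main_input_windowing_interval=10)
p.run().wait_until_finish()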
Example #13
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)])
                   .advance_watermark_to_infinity())

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey())

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.

    # assert per window
    expected_window_to_elements = {
        window.IntervalWindow(0, 15): [
            ('k', ['a', 'b', 'c']),
            ('k', ['late']),
        ],
        window.IntervalWindow(15, 30): [
            ('k', ['d', 'e']),
        ],
        window.IntervalWindow(300, 315): [
            ('k', ['last']),
        ],
    }
    assert_that(
        records,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='assert per window')

    p.run()
Example #14
 def expand(self, pcoll):
     return (
         pcoll
         # Assigns window info to each Pub/Sub message based on its
         # publish timestamp.
         | "Window into Fixed Intervals"
         >> beam.WindowInto(window.FixedWindows(self.window_size))
         | "Add timestamps to messages" >> beam.ParDo(AddTimestamps())
         # Use a dummy key to group the elements in the same window.
         # Note that all the elements in one window must fit into memory
         # for this. If the windowed elements do not fit into memory,
         # please consider using `beam.util.BatchElements`.
         # https://beam.apache.org/releases/pydoc/current/apache_beam.transforms.util.html#apache_beam.transforms.util.BatchElements
         | "Add Dummy Key" >> beam.Map(lambda elem: (None, elem))
         | "Groupby" >> beam.GroupByKey()
         | "Abandon Dummy Key" >> beam.MapTuple(lambda _, val: val)
     )
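
The comment above points to beam.util.BatchElements for cases where a window's elements may not fit in memory. Below is a minimal sketch of that variant; the class name, window size default, and batch bounds are assumptions for illustration, not part of the original example.

import apache_beam as beam
from apache_beam.transforms import window


class WindowAndBatch(beam.PTransform):
    """Sketch: assigns fixed windows, then batches elements per window with
    BatchElements instead of collecting the whole window via a dummy-key
    GroupByKey, so no single window has to fit in memory at once."""

    def __init__(self, window_size, min_batch=10, max_batch=500):
        self.window_size = window_size
        self.min_batch = min_batch
        self.max_batch = max_batch

    def expand(self, pcoll):
        return (
            pcoll
            | "Window into Fixed Intervals"
            >> beam.WindowInto(window.FixedWindows(self.window_size))
            # Batch bounds are assumed values; tune them for the workload.
            | "Batch Elements" >> beam.BatchElements(
                min_batch_size=self.min_batch,
                max_batch_size=self.max_batch))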
Example #15
def run(argv=None):
    # Use Python argparse module to parse custom arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://rim-bucket/market.txt',
                        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://rim-bucket/output/',
        help='Output file to write results to.')

    known_args, pipeline_args = parser.parse_known_args(argv)
    p_options = PipelineOptions(pipeline_args)
    google_cloud_options = p_options.view_as(GoogleCloudOptions)
    google_cloud_options.region = 'europe-west1'
    google_cloud_options.project = 'smartlive'
    # google_cloud_options.job_name = 'dataflow-job-{}'.format(
    #     datetime.datetime.now().strftime("%Y-%m-%d%H%M%S"))
    google_cloud_options.staging_location = 'gs://rim-bucket/binaries'
    google_cloud_options.temp_location = 'gs://rim-bucket/temp'

    p_options.view_as(StandardOptions).runner = 'DirectRunner'
    p_options.view_as(SetupOptions).save_main_session = True
    p_options.view_as(StandardOptions).streaming = True
    p_options.view_as(WorkerOptions).subnetwork = (
        'regions/europe-west1/subnetworks/test')
    p = beam.Pipeline(options=p_options)

    lines = (p
             | 'receive_data' >> beam.io.ReadFromText(known_args.input)
             | 'window' >> beam.WindowInto(window.GlobalWindows())
             | 'jsonload' >> beam.Map(lambda x: json.loads(x))
             | 'count' >> beam.Map(lambda x: len(x))
             | 'printnbrarticles' >> beam.ParDo(PrintFn()))

    # ----- fixed window + AfterWatermark trigger + accumulating mode ------ #
    (lines | 'CountGlobally' >> beam.CombineGlobally(
        beam.combiners.CountCombineFn()).without_defaults())

    p.run().wait_until_finish()
Example #16
    def Do(self, input_dict: Dict[Text, List[types.Artifact]],
           output_dict: Dict[Text, List[types.Artifact]],
           exec_properties: Dict[Text, Any]) -> None:
        self._log_startup(input_dict, output_dict, exec_properties)

        example_uris = {}
        for example in input_dict['examples']:
            for split in artifact_utils.decode_split_names(
                    example.split_names):
                example_uris[split] = os.path.join(example.uri, split)

        model = artifact_utils.get_single_instance(input_dict['model'])
        model_path = path_utils.serving_model_path(model.uri)
        absl.logging.info('Using {} as current model.'.format(model_path))

        output_uri = os.path.join(
            artifact_utils.get_single_uri(output_dict['output_data']),
            'pred.csv')
        with self._make_beam_pipeline() as pipeline:
            test_data = []
            for split, example_uri in example_uris.items():
                test_data.append(pipeline | 'ReadFromTFRecord_{}'.format(
                    split) >> beam.io.ReadFromTFRecord(
                        file_pattern=io_utils.all_files_pattern(example_uri)))

            (test_data | 'Flattern' >> beam.Flatten()
             | 'ParseToExample' >> beam.Map(tf.train.Example.FromString)
             | 'Prediction' >> beam.ParDo(
                 RunModel(model_path, 'serving_default', 'PassengerId'))
             | 'ParseToKVPair' >> beam.Map(lambda x: ParseResultToKV(x))
             | 'AddSameKey' >> beam.Map(lambda x: (1, x))
             | 'Window' >> beam.WindowInto(beam.window.GlobalWindows())
             | 'GroupByKey' >> beam.GroupByKey()
             | 'Sort' >> beam.Map(
                 lambda group_data: sorted(group_data[1], key=lambda x: x[0]))
             | 'Flatten' >> beam.FlatMap(lambda x: x)
             | 'ToStr' >> beam.Map(
                 lambda x: '{},{}'.format(x[0], '0' if x[1] < 0.5 else '1'))
             | 'WriteToFile' >> beam.io.WriteToText(
                 output_uri,
                 num_shards=1,
                 shard_name_template='',
                 header='PassengerId,Survived'))
        absl.logging.info('TestPredComponent result written to %s.',
                          output_uri)
Example #17
 def test_reshuffle_window_fn_preserved(self):
     pipeline = TestPipeline()
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     expected_windows = [
         TestWindowedValue(v, t, [w]) for (v, t, w) in [
             ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
             ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
             ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
             ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
             ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
             ((1, 4), 4.0, IntervalWindow(4.0, 6.0)),
         ]
     ]
     expected_merged_windows = [
         TestWindowedValue(v, t - .001, [w]) for (v, t, w) in [
             ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
             ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
             ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
             ((1, [4]), 6.0, IntervalWindow(4.0, 6.0)),
         ]
     ]
     before_reshuffle = (
         pipeline
         | 'start' >> beam.Create(data)
         | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1]))
         | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
     assert_that(before_reshuffle,
                 equal_to(expected_windows),
                 label='before_reshuffle',
                 reify_windows=True)
     after_reshuffle = before_reshuffle | beam.Reshuffle()
     assert_that(after_reshuffle,
                 equal_to(expected_windows),
                 label='after_reshuffle',
                 reify_windows=True)
     after_group = after_reshuffle | beam.GroupByKey()
     assert_that(after_group,
                 equal_to(expected_merged_windows),
                 label='after_group',
                 reify_windows=True)
     pipeline.run()
Example #18
 def test_equal_to_per_window_fail_unexpected_element(self):
   with self.assertRaises(BeamAssertException):
     start = int(MIN_TIMESTAMP.micros // 1e6) - 5
     end = start + 20
     expected = {
         window.IntervalWindow(start, end): [('k', [1])],
     }
     with TestPipeline(options=StandardOptions(streaming=True)) as p:
       assert_that((p
                    | Create([1, 2])
                    | beam.WindowInto(
                        FixedWindows(20),
                        trigger=trigger.AfterWatermark(),
                        accumulation_mode=trigger.AccumulationMode.DISCARDING)
                    | beam.Map(lambda x: ('k', x))
                    | beam.GroupByKey()),
                   equal_to_per_window(expected),
                   reify_windows=True)
Example #19
 def test_reshuffle_sliding_window(self):
   pipeline = TestPipeline()
   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
   window_size = 2
   expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])] * window_size
   before_reshuffle = (pipeline
                       | beam.Create(data)
                       | beam.WindowInto(SlidingWindows(
                           size=window_size, period=1))
                       | beam.GroupByKey())
   assert_that(before_reshuffle, equal_to(expected_data),
               label='before_reshuffle')
   after_reshuffle = before_reshuffle | beam.Reshuffle()
   # If Reshuffle applies the sliding window function a second time there
   # should be extra values for each key.
   assert_that(after_reshuffle, equal_to(expected_data),
               label='after reshuffle')
   pipeline.run()
Example #20
 def build_read_pipeline(self, pipeline):
     _ = (pipeline
          | 'ReadFromKafka' >> ReadFromKafka(
              consumer_config={
                  'bootstrap.servers': self.bootstrap_servers,
                  'auto.offset.reset': 'earliest'
              },
              topics=[self.topic],
              expansion_service=self.expansion_service)
          | 'Windowing' >> beam.WindowInto(
              beam.window.FixedWindows(300),
              trigger=beam.transforms.trigger.AfterProcessingTime(60),
              accumulation_mode=beam.transforms.trigger.AccumulationMode.
              DISCARDING)
          | 'DecodingValue' >> beam.Map(lambda elem: int(elem[1].decode()))
          |
          'CombineGlobally' >> beam.CombineGlobally(sum).without_defaults()
          | 'SetSumCounter' >> beam.Map(self.sum_counter.inc))
Example #21
 def test_reshuffle_streaming_global_window(self):
     options = PipelineOptions()
     options.view_as(StandardOptions).streaming = True
     pipeline = TestPipeline(options=options)
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
     expected_data = [(1, [1, 2, 4]), (2, [1, 2]), (3, [1])]
     before_reshuffle = (pipeline
                         | 'start' >> beam.Create(data)
                         | 'window' >> beam.WindowInto(GlobalWindows())
                         | 'group_by_key' >> beam.GroupByKey())
     assert_that(before_reshuffle,
                 equal_to(expected_data),
                 label='before_reshuffle')
     after_reshuffle = (before_reshuffle | 'reshuffle' >> beam.Reshuffle())
     assert_that(after_reshuffle,
                 equal_to(expected_data),
                 label='after reshuffle')
     pipeline.run()
Example #22
  def expand(self, pcoll):
    events = pcoll | beam.WindowInto(self.auction_or_bid_windowFn)

    auction_by_id = (
        events
        | nexmark_query_util.JustAuctions()
        | 'auction_by_id' >> beam.ParDo(nexmark_query_util.AuctionByIdFn()))
    bids_by_auction_id = (
        events
        | nexmark_query_util.JustBids()
        | 'bid_by_auction' >> beam.ParDo(nexmark_query_util.BidByAuctionIdFn()))

    return ({
        nexmark_query_util.AUCTION_TAG: auction_by_id,
        nexmark_query_util.BID_TAG: bids_by_auction_id
    }
            | beam.CoGroupByKey()
            | beam.ParDo(JoinAuctionBidFn()))
Example #23
def pardo_dofn_params(test=None):
    # [START pardo_dofn_params]
    import apache_beam as beam

    # pylint: disable=line-too-long
    class AnalyzeElement(beam.DoFn):
        def process(self,
                    elem,
                    timestamp=beam.DoFn.TimestampParam,
                    window=beam.DoFn.WindowParam):
            yield '\n'.join([
                '# timestamp',
                'type(timestamp) -> ' + repr(type(timestamp)),
                'timestamp.micros -> ' + repr(timestamp.micros),
                'timestamp.to_rfc3339() -> ' + repr(timestamp.to_rfc3339()),
                'timestamp.to_utc_datetime() -> ' +
                repr(timestamp.to_utc_datetime()),
                '',
                '# window',
                'type(window) -> ' + repr(type(window)),
                'window.start -> {} ({})'.format(
                    window.start, window.start.to_utc_datetime()),
                'window.end -> {} ({})'.format(window.end,
                                               window.end.to_utc_datetime()),
                'window.max_timestamp() -> {} ({})'.format(
                    window.max_timestamp(),
                    window.max_timestamp().to_utc_datetime()),
            ])

    # pylint: enable=line-too-long

    with beam.Pipeline() as pipeline:
        dofn_params = (
            pipeline
            | 'Create a single test element' >> beam.Create([':)'])
            | 'Add timestamp (Spring equinox 2020)' >> beam.Map(
                lambda elem: beam.window.TimestampedValue(elem, 1584675660))
            | 'Fixed 30sec windows' >> beam.WindowInto(
                beam.window.FixedWindows(30))
            | 'Analyze element' >> beam.ParDo(AnalyzeElement())
            | beam.Map(print))
        # [END pardo_dofn_params]
        if test:
            test(dofn_params)
Example #24
  def test_gbk_execution_no_triggers(self):
    test_stream = (TestStream()
                   .advance_watermark_to(10)
                   .add_elements(['a', 'b', 'c'])
                   .advance_watermark_to(20)
                   .add_elements(['d'])
                   .add_elements(['e'])
                   .advance_processing_time(10)
                   .advance_watermark_to(300)
                   .add_elements([TimestampedValue('late', 12)])
                   .add_elements([TimestampedValue('last', 310)]))

    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    global result     # pylint: disable=global-variable-undefined
    result = []

    def fired_elements(elem):
      result.append(elem)
      return elem

    options = PipelineOptions()
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    records = (p
               | test_stream
               | beam.WindowInto(FixedWindows(15))
               | beam.Map(lambda x: ('k', x))
               | beam.GroupByKey()
               | beam.Map(fired_elements))
    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    assert_that(records, equal_to([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])]))
    p.run()
    # TODO(BEAM-3377): Remove after assert_that in streaming is fixed.
    self.assertEqual([
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last'])], result)
Example #25
def run(argv=None):
    from apache_beam.transforms.window import TimestampedValue, FixedWindows

    pubsub_input_topic = 'projects/professionaldataengineercourse/topics/faces_on_images'

    with beam.Pipeline(options=get_pipeline_options()) as pipeline:
        logging.info("pubsub_input_topic = {}".format(pubsub_input_topic))

        json_messages = \
            (pipeline
             | 'ReadFromPubSubTopic' >> beam.io.ReadFromPubSub(topic=pubsub_input_topic).with_output_types(bytes)
             | 'DecodeMessagesFromPubSub' >> beam.Map(decode_message)
             )

        window_size_s = 30
        allowed_lateness_s = 60
        high_confidence_faces_grouped_by_emotion_count_per_window = (
                json_messages
                | 'ParseJsonMessage' >> beam.Map(parse_jsons)
                | 'FilterHighFaceConfidence' >> beam.ParDo(FilterHighConfidenceFacesDoFn())
                | 'FlatMapFacesWithHighEmotionLikelihood' >> beam.FlatMap(get_faces_with_high_emotion_likelihood)
                | 'UseCustomTimestamp' >> beam.Map(lambda face_info:
                                                   TimestampedValue(face_info, face_info['ts_seconds']))
                | 'WindowFaceInfo' >> beam.WindowInto(
                        FixedWindows(window_size_s, 0),
                        trigger=AfterWatermark(
                            early=AfterAny(AfterCount(5), AfterProcessingTime(10)),
                            late=AfterAll(AfterCount(2), AfterProcessingTime(20))),
                        allowed_lateness=allowed_lateness_s,
                        accumulation_mode=AccumulationMode.DISCARDING)
                | 'PairEmotionWithFace' >> beam.Map(lambda face_info: (face_info['emotion'], face_info))
                | 'GroupByEmotion' >> beam.GroupByKey()
                | 'FormatOutputForBigQuery' >> beam.ParDo(FormatFaceInfoPerWindow())
        )

        log_p_collection(high_confidence_faces_grouped_by_emotion_count_per_window, "OutputToBigQuery")

        high_confidence_faces_grouped_by_emotion_count_per_window | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            bq_faces_windowed_table_name,
            schema={"fields": bq_faces_windowed_table_schema},
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

        pipeline_result = pipeline.run()
        pipeline_result.wait_until_finish()
Example #26
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--output_topic',
        default='projects/complete-rush-206308/topics/salesstream',
        help=('Output PubSub topic of the form '
              '"projects/<PROJECT>/topic/<TOPIC>".'))
    parser.add_argument(
        '--input_topic',
        default='projects/complete-rush-206308/topics/salesstream',
        help=('Input PubSub topic of the form '
              '"projects/<PROJECT>/topics/<TOPIC>".'))
    parser.add_argument(
        '--input_subscription',
        default='projects/complete-rush-206308/subscriptions/salesReceiver',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    parser.add_argument('--output',
                        dest='output',
                        default='gs://sales_bkt/output/',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=complete-rush-206308',
        '--staging_location=gs://sales_bkt/stg',
        '--temp_location=gs://sales_bkt/tmp',
        '--job_name=myslaesprostream',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=pipeline_options) as p:
        output = (
            p  #| beam.io.ReadStringsFromPubSub(subscription=known_args.input_subscription)
            | beam.io.ReadStringsFromPubSub(topic=known_args.input_topic)
            #| beam.FlatMap(parse_record, filtered)
            | beam.ParDo(ParseRecordDoFn())
            | beam.WindowInto(window.FixedWindows(15, 0))
            | beam.CombinePerKey(sum))

        output | beam.io.WriteStringsToPubSub(known_args.output_topic)
        #output | WriteToText(known_args.output)
    print(start_time)
Example #27
def run(bootstrap_servers, topic, pipeline_args):
    # bootstrap_servers = '123.45.67.89:123:9092'
    # topic = 'kafka_taxirides_realtime'
    # pipeline_args = ['--project', 'my-project',
    #                  '--runner', 'DataflowRunner',
    #                  '--temp_location', 'my-temp-location',
    #                  '--region', 'my-region',
    #                  '--num_workers', 'my-num-workers',
    #                  '--experiments', 'use_runner_v2']

    pipeline_options = PipelineOptions(pipeline_args,
                                       save_main_session=True,
                                       streaming=True)
    window_size = 15  # size of the Window in seconds.

    def log_ride(ride_bytes):
        # Converting bytes record from Kafka to a dictionary.
        import ast
        ride = ast.literal_eval(ride_bytes.decode("UTF-8"))
        logging.info(
            'Found ride at latitude %r and longitude %r with %r '
            'passengers', ride['latitude'], ride['longitude'],
            ride['passenger_count'])

    with beam.Pipeline(options=pipeline_options) as pipeline:
        _ = (
            pipeline
            | beam.io.ReadFromPubSub(
                topic='projects/pubsub-public-data/topics/taxirides-realtime').
            with_output_types(bytes)
            | beam.Map(lambda x: (b'', x)).with_output_types(
                typing.Tuple[bytes,
                             bytes])  # Kafka write transforms expects KVs.
            | beam.WindowInto(beam.window.FixedWindows(window_size))
            | WriteToKafka(
                producer_config={'bootstrap.servers': bootstrap_servers},
                topic=topic))

        _ = (pipeline
             | ReadFromKafka(
                 consumer_config={'bootstrap.servers': bootstrap_servers},
                 topics=[topic])
             | beam.FlatMap(lambda kv: log_ride(kv[1])))
Example #28
    def expand(self, pcoll):
        ret = (
            pcoll
            | beam.WindowInto(beam.window.GlobalWindows())

            # First get the initial timing information. This will be used to start
            # the periodic timers which will generate processing time and watermark
            # advancements every `sample_resolution_sec`.
            | 'initial timing' >> PairWithTiming()

            # Next, map every element to the same key so that only a single timer is
            # started for this given ReverseTestStream.
            | 'first key' >> beam.Map(lambda x: (0, x))

            # Next, pass-through each element which will be paired with its timing
            # info in the next step. Also, start the periodic timers. We use timers
            # in this situation to capture watermark advancements that occur when
            # there are no elements being produced upstream.
            | beam.ParDo(
                _TimingEventGenerator(
                    output_tag=self._output_tag,
                    sample_resolution_sec=self._sample_resolution_sec))

            # Next, retrieve the timing information for watermark events that were
            # generated in the previous step. This is because elements generated
            # through the timers don't have their timing information yet.
            | 'timing info for watermarks' >> PairWithTiming()

            # Re-key to the same key to keep global state.
            | 'second key' >> beam.Map(lambda x: (0, x))

            # Format the events properly.
            | beam.ParDo(_TestStreamFormatter(self._coder,
                                              self._output_format)))

        if self._output_format == OutputFormat.SERIALIZED_TEST_STREAM_FILE_RECORDS:

            def serializer(e):
                return e.SerializeToString()

            ret = ret | 'serializer' >> beam.Map(serializer)

        return ret
Example #29
def run():
    pipeline_options = PipelineOptions(streaming=True)
    resolution = pipeline_options.view_as(MyOptions).resolution.get()
    with beam.Pipeline(options=pipeline_options) as p:
        subscription_id = 'projects/iex-stream/subscriptions/iex-aggregate-' + str(
            resolution)
        lines = (p | beam.io.ReadFromPubSub(
            subscription=subscription_id).with_output_types(bytes)
                 | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))
                 | beam.Map(json.loads))

        schema = 'symbol:STRING,latest_price:FLOAT,window_end:TIMESTAMP,event_time:TIMESTAMP,resolution_minutes:INTEGER'
        (lines
         | 'CreateWindow' >> beam.WindowInto(
             SlidingWindows(60 * resolution, 10, 5))
         | 'AddWindowEndTimestamp' >> beam.ParDo(
             AddTimestamp(resolution=resolution))
         | 'WriteToBigQuery' >> beam.io.WriteToBigQuery('iex.quote',
                                                        schema=schema))
Example #30
def load(events, metadata=None):
    return (
        events
        | nexmark_query_util.JustBids()
        | 'query5_sliding_window' >> beam.WindowInto(
            window.SlidingWindows(metadata.get('window_size_sec'),
                                  metadata.get('window_period_sec')))
        # project out only the auction id for each bid
        | 'extract_bid_auction' >> beam.Map(lambda bid: bid.auction)
        | 'bid_count_per_auction' >> beam.combiners.Count.PerElement()
        | 'bid_max_count' >> beam.CombineGlobally(
            MostBidCombineFn()).without_defaults()
        # TODO(leiyiz): fanout with sliding window produces duplicated results,
        #   uncomment after it is fixed [BEAM-10617]
        # .with_fanout(metadata.get('fanout'))
        | beam.FlatMap(lambda auc_count: [{
            ResultNames.AUCTION_ID: auction,
            ResultNames.NUM: auc_count[1]
        } for auction in auc_count[0]]))