Example #1
0
  def test_gbk_execution(self):
    """Checks GroupByKey output for a keyed, fixed-windowed TestStream.

    Elements are windowed into FixedWindows(15), keyed with the constant 'k'
    and grouped.  The test expects four separate groups: ['a', 'b', 'c'],
    ['d', 'e'], ['late'] and ['last'].
    """
    events = TestStream()
    events = events.advance_watermark_to(10).add_elements(['a', 'b', 'c'])
    events = events.advance_watermark_to(20)
    events = events.add_elements(['d']).add_elements(['e'])
    events = events.advance_processing_time(10).advance_watermark_to(300)
    # 'late' carries timestamp 12 although the watermark is already at 300.
    events = events.add_elements([TimestampedValue('late', 12)])
    events = events.add_elements([TimestampedValue('last', 310)])

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    pipeline = TestPipeline(options=streaming_options)

    windowed = pipeline | events | beam.WindowInto(FixedWindows(15))
    keyed = windowed | beam.Map(lambda value: ('k', value))
    grouped = keyed | beam.GroupByKey()

    # TODO(BEAM-2519): timestamp assignment for elements from a GBK should
    # respect the TimestampCombiner.  The test below should also verify the
    # timestamps of the outputted elements once this is implemented.
    expected_groups = [
        ('k', ['a', 'b', 'c']),
        ('k', ['d', 'e']),
        ('k', ['late']),
        ('k', ['last']),
    ]
    assert_that(grouped, equal_to(expected_groups))
    pipeline.run()
Example #2
0
  def test_setting_timestamp(self):
    """Assigns timestamps in a DoFn, then sums values per 60-unit window.

    Each ('k', value) pair is re-emitted with its value as the timestamp, so
    the expected per-window sums are 12 + 30 = 42 and 60 + 61 + 66 = 187.
    """

    def extract_timestamp_from_log_entry(entry):
      # The second field of the ('k', value) pair doubles as the timestamp.
      return entry[1]

    p = TestPipeline()
    items = (p
             | beam.Create([12, 30, 60, 61, 66])
             | 'key' >> beam.Map(lambda value: ('k', value)))

    # [START setting_timestamp]
    class AddTimestampDoFn(beam.DoFn):

      def process(self, element):
        # Extract the numeric Unix seconds-since-epoch timestamp to be
        # associated with the current log entry.
        unix_timestamp = extract_timestamp_from_log_entry(element)
        # Wrap and emit the current entry and new timestamp in a
        # TimestampedValue.
        yield beam.window.TimestampedValue(element, unix_timestamp)

    timestamped_items = items | 'timestamp' >> beam.ParDo(AddTimestampDoFn())
    # [END setting_timestamp]
    window_sums = (
        timestamped_items
        | 'window' >> beam.WindowInto(beam.window.FixedWindows(60))
        | 'group' >> beam.GroupByKey()
        | 'combine' >> beam.CombineValues(sum))
    # Drop the key so only the per-window totals are compared.
    totals = window_sums | 'unkey' >> beam.Map(lambda kv: kv[1])
    assert_that(totals, equal_to([42, 187]))
    p.run()
Example #3
0
 def test_timestamped_with_combiners(self):
   """Checks per-key, per-window sum and mean over timestamped values.

   The values 0..9 are timestamped with themselves under FixedWindows(5) and
   keyed by `value // 5`, so key 0 holds 0..4 (sum 10, mean 2.0) and key 1
   holds 5..9 (sum 35, mean 7.0).
   """
   p = TestPipeline()
   result = (p
             # Create some initial test values.
             | 'start' >> Create([(k, k) for k in range(10)])
             # The purpose of the WindowInto transform is to establish a
             # FixedWindows windowing function for the PCollection.
             # It does not bucket elements into windows since the timestamps
             # from Create are not spaced 5 ms apart and very likely they all
             # fall into the same window.
             | 'w' >> WindowInto(FixedWindows(5))
             # Generate timestamped values using the values as timestamps.
             # Now there are values 5 ms apart and since Map propagates the
             # windowing function from input to output the output PCollection
             # will have elements falling into different 5ms windows.
             # The (value, timestamp) pair is indexed rather than unpacked in
             # the lambda signature, which Python 3 no longer allows.
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             # We add a 'key' to each value representing the index of the
             # window. This is important since there is no guarantee of
             # order for the elements of a PCollection.
             # Floor division keeps the key an integer on Python 2 and 3.
             | Map(lambda v: (v // 5, v)))
   # Sum all elements associated with a key and window. Although it
   # is called CombinePerKey it is really CombinePerKeyAndWindow the
   # same way GroupByKey is really GroupByKeyAndWindow.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
Example #4
0
def model_multiple_pcollections_flatten(contents, output_path):
  """Merging a PCollection with Flatten.

  Splits `contents` into three partitions based on the code point of each
  element's first character, flattens the partitions back into a single
  PCollection and writes it to `output_path`.
  """
  import apache_beam as beam

  def some_hash_fn(s):
    return ord(s[0])

  def partition_fn(element, partitions):
    return some_hash_fn(element) % partitions

  p = TestPipeline()  # Use TestPipeline for testing.

  # Split the input into three partitions.
  partitioned = p | beam.Create(contents) | beam.Partition(partition_fn, 3)
  pcoll1, pcoll2, pcoll3 = (partitioned[index] for index in range(3))

  # Flatten them back into 1

  # A collection of PCollection objects can be represented simply
  # as a tuple (or list) of PCollections.
  # (The SDK for Python has no separate type to store multiple
  # PCollection objects, whether containing the same or different
  # types.)
  # [START model_multiple_pcollections_flatten]
  merged = (
      (pcoll1, pcoll2, pcoll3)
      # A tuple of PCollections can be "piped" directly into a Flatten
      # transform.
      | beam.Flatten())
  # [END model_multiple_pcollections_flatten]
  merged | beam.io.WriteToText(output_path)

  p.run()
Example #5
0
def model_composite_transform_example(contents, output_path):
  """Example of a composite transform.

  To declare a composite transform, define a subclass of PTransform.

  To implement it, override the "expand" method, which takes a PCollection
  as its only parameter and returns a PCollection.

  Args:
    contents: lines of text used to create the input PCollection.
    output_path: path passed to WriteToText for the 'word: count' lines.
  """
  import re

  import apache_beam as beam

  # [START composite_transform_example]
  # [START composite_ptransform_apply_method]
  # [START composite_ptransform_declare]
  class CountWords(beam.PTransform):
    # [END composite_ptransform_declare]

    def expand(self, pcoll):
      return (pcoll
              | beam.FlatMap(lambda x: re.findall(r'\w+', x))
              | beam.combiners.Count.PerElement()
              # Each element is a (word, count) pair; index it instead of
              # unpacking in the lambda signature, which is a syntax error
              # on Python 3.
              | beam.Map(lambda word_count: '%s: %s' % (word_count[0],
                                                        word_count[1])))
  # [END composite_ptransform_apply_method]
  # [END composite_transform_example]

  p = TestPipeline()  # Use TestPipeline for testing.
  (p
   | beam.Create(contents)
   | CountWords()
   | beam.io.WriteToText(output_path))
  p.run()
Example #6
0
def pipeline_logging(lines, output):
  """Logging Pipeline Messages.

  Runs a pipeline that splits `lines` into words, logs each occurrence of
  'love' (case-insensitive) at info level and writes the words to `output`.
  """
  import re
  import apache_beam as beam

  # [START pipeline_logging]
  # import Python logging module.
  import logging

  class ExtractWordsFn(beam.DoFn):

    def process(self, element):
      for word in re.findall(r'[A-Za-z\']+', element):
        yield word

        if word.lower() == 'love':
          # Log using the root logger at info or higher levels
          logging.info('Found : %s', word.lower())

  # Remaining WordCount example code ...
  # [END pipeline_logging]

  p = TestPipeline()  # Use TestPipeline for testing.
  words = p | beam.Create(lines) | beam.ParDo(ExtractWordsFn())
  words | beam.io.WriteToText(output)

  p.run()
class StreamingWordCountIT(unittest.TestCase):
  """Integration test that runs streaming_wordcount against Pub/Sub."""

  def setUp(self):
    """Creates uniquely named input/output topics and subscriptions."""
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    self.uuid = str(uuid.uuid4())

    # Set up PubSub environment.
    from google.cloud import pubsub
    client = pubsub.Client(project=self.project)
    self.pubsub_client = client
    # The uuid suffix keeps resource names unique for this test instance.
    self.input_topic = client.topic(INPUT_TOPIC + self.uuid)
    self.output_topic = client.topic(OUTPUT_TOPIC + self.uuid)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + self.uuid)

    topics = [self.input_topic, self.output_topic]
    for topic in topics:
      topic.create()
    # Subscriptions are created only after the topics are confirmed.
    test_utils.wait_for_topics_created(topics)
    for subscription in (self.input_sub, self.output_sub):
      subscription.create()

  def _inject_numbers(self, topic, num_messages):
    """Inject numbers as test data to PubSub."""
    logging.debug('Injecting %d numbers to topic %s',
                  num_messages, topic.full_name)
    for number in range(num_messages):
      topic.publish(str(number))

  def _cleanup_pubsub(self):
    """Removes the subscriptions first, then the topics."""
    test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
    test_utils.cleanup_topics([self.input_topic, self.output_topic])

  def tearDown(self):
    self._cleanup_pubsub()

  @attr('IT')
  def test_streaming_wordcount_it(self):
    """Publishes numbers and verifies the '<n>: 1' output messages."""
    # Build expected dataset.
    expected_msg = ['%d: 1' % num for num in range(DEFAULT_INPUT_NUMBERS)]

    # Set extra options to the pipeline for test purpose
    on_success_matcher = all_of(
        PipelineStateMatcher(PipelineState.RUNNING),
        PubSubMessageMatcher(self.project,
                             OUTPUT_SUB + self.uuid,
                             expected_msg,
                             timeout=400))
    extra_opts = {
        'input_subscription': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'wait_until_finish_duration': WAIT_UNTIL_FINISH_DURATION,
        'on_success_matcher': on_success_matcher,
    }

    # Generate input data and inject to PubSub.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    self._inject_numbers(self.input_topic, DEFAULT_INPUT_NUMBERS)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pipeline_args = self.test_pipeline.get_full_options_as_args(**extra_opts)
    streaming_wordcount.run(pipeline_args)
Example #8
0
 def test_read_from_text_file_pattern(self):
   """Reads every file matching a pattern and compares all returned lines."""
   # The six requested sizes add up to the 40 expected entries.
   file_pattern, expected_lines = write_pattern([5, 3, 12, 8, 8, 4])
   assert len(expected_lines) == 40
   p = TestPipeline()
   lines_read = p | 'Read' >> ReadFromText(file_pattern)
   assert_that(lines_read, equal_to(expected_lines))
   p.run()
Example #9
0
def model_multiple_pcollections_partition(contents, output_path):
  """Splitting a PCollection with Partition.

  Partitions `contents` into ten PCollections by percentile, then flattens
  all ten back together and writes the result to `output_path`.

  Args:
    contents: values used to create the input PCollection; each one is
      passed to get_percentile, which assumes a number in [0, 100).
    output_path: path passed to WriteToText.
  """
  some_hash_fn = lambda s: ord(s[0])

  def get_percentile(i):
    """Assume i in [0,100)."""
    return i
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.

  students = p | beam.Create(contents)

  # [START model_multiple_pcollections_partition]
  def partition_fn(student, num_partitions):
    return int(get_percentile(student) * num_partitions / 100)

  by_decile = students | beam.Partition(partition_fn, 10)
  # [END model_multiple_pcollections_partition]
  # [START model_multiple_pcollections_partition_40th]
  fortieth_percentile = by_decile[4]
  # [END model_multiple_pcollections_partition_40th]

  # range (not xrange) so the snippet also runs on Python 3.
  ([by_decile[d] for d in range(10) if d != 4] + [fortieth_percentile]
   | beam.Flatten()
   | beam.io.WriteToText(output_path))

  p.run()
  def test_run_direct(self):
    """Reads a four-line temp file through LineSource and checks the lines."""
    expected_lines = ['aaaa', 'bbbb', 'cccc', 'dddd']
    temp_path = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    p = TestPipeline()
    lines_read = p | beam.io.Read(LineSource(temp_path))
    assert_that(lines_read, equal_to(expected_lines))

    p.run()
Example #11
0
  def test_basic_execution_sideinputs(self):
    """Passes a TestStream as a list side input to a ParDo over a TestStream.

    The single main element 'e' is expected to be emitted with timestamp 10
    and the side input list [2, 1, 7, 4].
    """
    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)

    main_events = TestStream().advance_watermark_to(10).add_elements(['e'])
    main_stream = p | 'main TestStream' >> main_events

    # Each side value is timestamped with itself, added one event at a time.
    side_events = TestStream()
    for value in (2, 1, 7, 4):
      side_events = side_events.add_elements(
          [window.TimestampedValue(value, value)])
    side_stream = p | 'side TestStream' >> side_events

    class RecordFn(beam.DoFn):
      def process(self,
                  elm=beam.DoFn.ElementParam,
                  ts=beam.DoFn.TimestampParam,
                  side=beam.DoFn.SideInputParam):
        yield (elm, ts, side)

    side_as_list = beam.pvalue.AsList(side_stream)
    records = main_stream | beam.ParDo(RecordFn(), side_as_list)

    expected_records = [('e', Timestamp(10), [2, 1, 7, 4])]
    assert_that(records, equal_to(expected_records))

    p.run()
  def test_bigquery_tornadoes_it(self):
    """Runs bigquery_tornadoes and verifies the output table by checksum."""
    test_pipeline = TestPipeline(is_integration_test=True)

    # Set extra options to the pipeline for test purpose
    project = test_pipeline.get_option('project')

    dataset = 'BigQueryTornadoesIT'
    # Millisecond timestamp suffix for the table name.
    table = 'monthly_tornadoes_%s' % int(round(time.time() * 1000))
    output_table = '%s.%s' % (dataset, table)

    state_matcher = PipelineStateMatcher()
    table_matcher = BigqueryMatcher(
        project=project,
        query='SELECT month, tornado_count FROM `%s`' % output_table,
        checksum=self.DEFAULT_CHECKSUM)
    extra_opts = {
        'output': output_table,
        'on_success_matcher': all_of(state_matcher, table_matcher),
    }

    # Register cleanup before pipeline execution.
    # Note that actual execution happens in reverse order.
    self.addCleanup(utils.delete_bq_table, project, dataset, table)

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pipeline_args = test_pipeline.get_full_options_as_args(**extra_opts)
    bigquery_tornadoes.run(pipeline_args)
Example #13
0
  def test_read_messages_timestamp_attribute_milli_success(self, mock_pubsub):
    """Checks that a numeric 'time' attribute becomes the element timestamp.

    The attribute value '1337' is expected to yield a timestamp of
    1337 * 1000 microseconds, and the message's ack id is expected to be
    acknowledged.
    """
    payload = b'data'
    message_attributes = {'time': '1337'}
    ack_id = 'ack_id'
    publish_time_secs, publish_time_nanos = 1520861821, 234567000

    pulled_message = test_utils.PullResponseMessage(
        payload, message_attributes, publish_time_secs, publish_time_nanos,
        ack_id)
    mock_pubsub.return_value.pull.return_value = (
        test_utils.create_pull_response([pulled_message]))

    expected_timestamp = timestamp.Timestamp(
        micros=int(message_attributes['time']) * 1000)
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(payload, message_attributes),
            expected_timestamp,
            [window.GlobalWindow()]),
    ]

    streaming_options = PipelineOptions([])
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)
    messages = p | ReadFromPubSub(
        'projects/fakeprj/topics/a_topic', None, None,
        with_attributes=True, timestamp_attribute='time')
    assert_that(messages, equal_to(expected_elements), reify_windows=True)
    p.run()

    expected_ack_calls = [mock.call(mock.ANY, [ack_id])]
    mock_pubsub.return_value.acknowledge.assert_has_calls(expected_ack_calls)
Example #14
0
def model_co_group_by_key_tuple(email_list, phone_list, output_path):
  """Applying a CoGroupByKey Transform to a tuple.

  Args:
    email_list: (name, email) pairs used to create the 'emails' PCollection.
    phone_list: (name, phone) pairs used to create the 'phones' PCollection.
    output_path: path passed to WriteToText for the joined contact lines.
  """
  import apache_beam as beam
  p = TestPipeline()  # Use TestPipeline for testing.
  # [START model_group_by_key_cogroupbykey_tuple]
  # Each data set is represented by key-value pairs in separate PCollections.
  # Both data sets share a common key type (in this example str).
  # The email_list contains values such as: ('joe', '*****@*****.**') with
  # multiple possible values for each key.
  # The phone_list contains values such as: ('mary', '111-222-3333') with
  # multiple possible values for each key.
  emails = p | 'email' >> beam.Create(email_list)
  phones = p | 'phone' >> beam.Create(phone_list)
  # The result PCollection contains one key-value element for each key in the
  # input PCollections. The key of the pair will be the key from the input and
  # the value will be a dictionary with two entries: 'emails' - an iterable of
  # all values for the current key in the emails PCollection and 'phones': an
  # iterable of all values for the current key in the phones PCollection.
  # For instance, if 'emails' contained ('joe', '*****@*****.**') and
  # ('joe', '*****@*****.**'), then 'result' will contain the element
  # ('joe', {'emails': ['*****@*****.**', '*****@*****.**'], 'phones': ...})
  result = {'emails': emails, 'phones': phones} | beam.CoGroupByKey()

  def join_info(name_info):
    # Unpack inside the body: tuple parameters in a function signature are
    # a syntax error on Python 3.
    (name, info) = name_info
    return '; '.join(['%s' % name,
                      '%s' % ','.join(info['emails']),
                      '%s' % ','.join(info['phones'])])

  contact_lines = result | beam.Map(join_info)
  # [END model_group_by_key_cogroupbykey_tuple]
  contact_lines | beam.io.WriteToText(output_path)
  p.run()
  def test_read_messages_timestamp_attribute_rfc3339_success(self, mock_pubsub):
    """Checks that an RFC 3339 'time' attribute becomes the element timestamp."""
    rfc3339_time = '2018-03-12T13:37:01.234567Z'
    data = 'data'
    attributes = {'time': rfc3339_time}
    # The publish time is set to the same instant as the 'time' attribute.
    client_message = create_client_message(
        data, 'message_id', attributes, rfc3339_time)

    expected_timestamp = timestamp.Timestamp.from_rfc3339(attributes['time'])
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(data, attributes),
            expected_timestamp,
            [window.GlobalWindow()]),
    ]

    # Replace the Pub/Sub client and auto-ack helper with fakes.
    mock_pubsub.Client = functools.partial(FakePubsubClient, [client_message])
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    messages = p | ReadFromPubSub(
        'projects/fakeprj/topics/a_topic', None, 'a_label',
        with_attributes=True, timestamp_attribute='time')
    assert_that(messages, equal_to(expected_elements), reify_windows=True)
    p.run()
Example #16
0
  def test_on_direct_runner(self):
    """Writes three elements via _NativeWrite and checks what the sink got."""

    class FakeSinkWriter(NativeSinkWriter):
      """A fake sink writer for testing."""

      def __init__(self, written_values):
        # Shares the list owned by the sink so writes are visible there.
        self.written_values = written_values

      def __enter__(self):
        return self

      def __exit__(self, *unused_args):
        pass

      def Write(self, value):
        self.written_values.append(value)

    class FakeSink(NativeSink):
      """A fake sink outputing a number of elements."""

      def __init__(self):
        self.written_values = []
        self.writer_instance = FakeSinkWriter(self.written_values)

      def writer(self):
        return self.writer_instance

    expected_values = ['a', 'b', 'c']
    sink = FakeSink()
    p = TestPipeline()
    p | Create(['a', 'b', 'c']) | _NativeWrite(sink)  # pylint: disable=expression-not-assigned
    p.run()

    self.assertEqual(expected_values, sink.written_values)
  def test_basic_execution(self):
    """Checks the timestamps a ParDo observes for TestStream elements.

    Elements added without an explicit timestamp are expected to carry the
    watermark value set just before them (10 or 20); TimestampedValue
    elements are expected to keep their own timestamps (12 and 310).
    """
    events = TestStream()
    events = events.advance_watermark_to(10).add_elements(['a', 'b', 'c'])
    events = events.advance_watermark_to(20)
    events = events.add_elements(['d']).add_elements(['e'])
    events = events.advance_processing_time(10).advance_watermark_to(300)
    events = events.add_elements([TimestampedValue('late', 12)])
    events = events.add_elements([TimestampedValue('last', 310)])

    class RecordFn(beam.DoFn):
      def process(self, element=beam.DoFn.ElementParam,
                  timestamp=beam.DoFn.TimestampParam):
        yield (element, timestamp)

    streaming_options = PipelineOptions()
    streaming_options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=streaming_options)
    record_fn = RecordFn()
    records = p | events | beam.ParDo(record_fn)

    expected_pairs = [
        ('a', 10), ('b', 10), ('c', 10),
        ('d', 20), ('e', 20),
        ('late', 12), ('last', 310),
    ]
    expected_records = [
        (value, timestamp.Timestamp(seconds))
        for value, seconds in expected_pairs]
    assert_that(records, equal_to(expected_records))
    p.run()
Example #18
0
  def test_no_window_context_fails(self):
    """Expects the run to fail with a ValueError naming 'add_timestamps2'.

    The error message must match r'window.*None.*add_timestamps2'.
    """
    expected_timestamp = timestamp.Timestamp(5)
    # Assuming the default window function is window.GlobalWindows.
    expected_window = window.GlobalWindow()
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    expected_windows = [
        TestWindowedValue(kv, expected_timestamp, [expected_window])
        for kv in data]

    class AddTimestampDoFn(beam.DoFn):
      def process(self, element):
        yield window.TimestampedValue(element, expected_timestamp)

    pipeline = TestPipeline()
    created = pipeline | 'start' >> beam.Create(data)
    before_identity = (
        created | 'add_timestamps' >> beam.ParDo(AddTimestampDoFn()))
    assert_that(before_identity, equal_to(expected_windows),
                label='before_identity', reify_windows=True)

    identity_window_fn = beam.transforms.util._IdentityWindowFn(
        coders.GlobalWindowCoder())
    after_identity = (
        before_identity
        | 'window' >> beam.WindowInto(identity_window_fn)
        # This DoFn will return TimestampedValues, making
        # WindowFn.AssignContext passed to IdentityWindowFn
        # contain a window of None. IdentityWindowFn should
        # raise an exception.
        | 'add_timestamps2' >> beam.ParDo(AddTimestampDoFn()))
    assert_that(after_identity, equal_to(expected_windows),
                label='after_identity', reify_windows=True)

    with self.assertRaisesRegexp(ValueError, r'window.*None.*add_timestamps2'):
      pipeline.run()
Example #19
0
  def test_window_preserved(self):
    """Checks that _IdentityWindowFn leaves timestamps and windows unchanged.

    Every element is emitted in IntervalWindow(1.0, 2.0) with timestamp 5 and
    is expected to look the same before and after the WindowInto step.
    """
    expected_timestamp = timestamp.Timestamp(5)
    expected_window = window.IntervalWindow(1.0, 2.0)
    data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
    expected_windows = [
        TestWindowedValue(kv, expected_timestamp, [expected_window])
        for kv in data]

    class AddWindowDoFn(beam.DoFn):
      def process(self, element):
        yield WindowedValue(
            element, expected_timestamp, [expected_window])

    pipeline = TestPipeline()
    created = pipeline | 'start' >> beam.Create(data)
    before_identity = created | 'add_windows' >> beam.ParDo(AddWindowDoFn())
    assert_that(before_identity, equal_to(expected_windows),
                label='before_identity', reify_windows=True)

    identity_window_fn = beam.transforms.util._IdentityWindowFn(
        coders.IntervalWindowCoder())
    after_identity = (
        before_identity | 'window' >> beam.WindowInto(identity_window_fn))
    assert_that(after_identity, equal_to(expected_windows),
                label='after_identity', reify_windows=True)
    pipeline.run()
Example #20
0
 def test_reshuffle_window_fn_preserved(self):
   """Checks that Reshuffle keeps session windows and their window fn.

   Elements are timestamped with their second field and placed in
   Sessions(gap_size=2).  They are expected to be identical before and after
   Reshuffle, and a following GroupByKey is expected to produce the merged
   windows listed below.
   """

   def as_windowed_values(rows):
     # Each row is (value, timestamp, window).
     return [TestWindowedValue(v, t, [w]) for (v, t, w) in rows]

   data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 4)]
   expected_windows = as_windowed_values([
       ((1, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((2, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((3, 1), 1.0, IntervalWindow(1.0, 3.0)),
       ((1, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((2, 2), 2.0, IntervalWindow(2.0, 4.0)),
       ((1, 4), 4.0, IntervalWindow(4.0, 6.0))])
   expected_merged_windows = as_windowed_values([
       ((1, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((2, contains_in_any_order([2, 1])), 4.0, IntervalWindow(1.0, 4.0)),
       ((3, [1]), 3.0, IntervalWindow(1.0, 3.0)),
       ((1, [4]), 6.0, IntervalWindow(4.0, 6.0))])

   pipeline = TestPipeline()
   timestamped = (
       pipeline
       | 'start' >> beam.Create(data)
       | 'add_timestamp' >> beam.Map(lambda v: TimestampedValue(v, v[1])))
   before_reshuffle = (
       timestamped | 'window' >> beam.WindowInto(Sessions(gap_size=2)))
   assert_that(before_reshuffle, equal_to(expected_windows),
               label='before_reshuffle', reify_windows=True)

   after_reshuffle = before_reshuffle | 'reshuffle' >> beam.Reshuffle()
   assert_that(after_reshuffle, equal_to(expected_windows),
               label='after_reshuffle', reify_windows=True)

   after_group = after_reshuffle | 'group_by_key' >> beam.GroupByKey()
   assert_that(after_group, equal_to(expected_merged_windows),
               label='after_group', reify_windows=True)
   pipeline.run()
def run_bq_pipeline(argv=None):
  """Run the sample BigQuery pipeline.

  Reads the rows returned by --query from BigQuery and writes them to the
  --output table using the JSON schema given in --output_schema.  The output
  table is created if needed and truncated before writing.  Blocks until the
  pipeline finishes.

  Args:
    argv: Arguments to the run function.  Arguments not recognized here are
      forwarded to the pipeline as PipelineOptions.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--query', required=True,
                      help='Query to process for the table.')
  parser.add_argument('--output', required=True,
                      help='Output BQ table to write results to.')
  parser.add_argument('--output_schema', dest='output_schema', required=True,
                      help='Schema for output BQ table.')
  # The help text previously duplicated the --output description.
  parser.add_argument('--use_standard_sql', action='store_true',
                      dest='use_standard_sql',
                      help='Run the query using BigQuery standard SQL.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  table_schema = parse_table_schema_from_json(known_args.output_schema)

  p = TestPipeline(options=PipelineOptions(pipeline_args))

  # pylint: disable=expression-not-assigned
  # pylint: disable=bad-continuation
  (p | 'read' >> beam.io.Read(beam.io.BigQuerySource(
      query=known_args.query, use_standard_sql=known_args.use_standard_sql))
   | 'write' >> beam.io.Write(beam.io.BigQuerySink(
           known_args.output,
           schema=table_schema,
           create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
           write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

  result = p.run()
  result.wait_until_finish()
Example #22
0
 def test_runtime_checks_on(self):
   """Expects TypeCheckError when a Map declared to output str emits 3."""
   # pylint: disable=expression-not-assigned
   type_checked_options = PipelineOptions(runtime_type_check=True)
   p = TestPipeline(options=type_checked_options)
   with self.assertRaises(typehints.TypeCheckError):
     # [START type_hints_runtime_on]
     p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
     p.run()
 def run_pipeline(self, count_implementation, factor=1):
   """Applies a counting transform to fixed words and checks the counts.

   Args:
     count_implementation: transform applied to the PCollection of words.
     factor: multiplier applied to the expected counts (3 for 'CAT', 2 for
       'DOG').
   """
   expected_counts = [('CAT', (3 * factor)), ('DOG', (2 * factor))]
   pipeline = TestPipeline()
   counted = (pipeline
              | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
              | count_implementation)
   assert_that(counted, equal_to(expected_counts))
   pipeline.run()
class UserScoreIT(unittest.TestCase):
  """Integration test that launches the user_score example pipeline."""

  DEFAULT_INPUT_FILE = 'gs://dataflow-samples/game/gaming_data*'
  DEFAULT_EXPECTED_CHECKSUM = '9f3bd81669607f0d98ec80ddd477f3277cfba0a2'

  def setUp(self):
    """Creates the test pipeline and a unique output prefix for this run."""
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.uuid = str(uuid.uuid4())

    # Output prefix layout: <'output' option>/<uuid>/results
    output_root = self.test_pipeline.get_option('output')
    self.output = '/'.join([output_root, self.uuid, 'results'])

  @attr('IT')
  def test_user_score_it(self):
    """Runs user_score with matchers for the final state and file checksum."""
    # The run succeeds only if the pipeline reaches DONE and the files found
    # under the output prefix match the expected checksum.
    success_matcher = all_of(
        PipelineStateMatcher(PipelineState.DONE),
        FileChecksumMatcher(self.output + '*-of-*',
                            self.DEFAULT_EXPECTED_CHECKSUM))

    extra_opts = {
        'input': self.DEFAULT_INPUT_FILE,
        'output': self.output + '/user-score',
        'on_success_matcher': success_matcher,
    }

    # Register clean up before pipeline execution
    self.addCleanup(delete_files, [self.output + '*'])

    # Get pipeline options from command argument: --test-pipeline-options,
    # and start pipeline job by calling pipeline main function.
    pipeline_args = self.test_pipeline.get_full_options_as_args(**extra_opts)
    user_score.run(pipeline_args)
  def test_compute_top_sessions(self):
    """Checks ComputeTopSessions(1.0) on self.EDITS against self.EXPECTED."""
    p = TestPipeline()
    top_sessions = (
        p
        | beam.Create(self.EDITS)
        | top_wikipedia_sessions.ComputeTopSessions(1.0))

    assert_that(top_sessions, equal_to(self.EXPECTED))
    p.run()
  def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
    """Reads a message that lacks the configured timestamp attribute.

    The expected element is stamped with the message's publish time, and the
    pulled message is expected to be acknowledged.

    Args:
      mock_pubsub: mock whose return value provides `pull` and `acknowledge`
        (NOTE(review): presumably injected by a mock.patch decorator that is
        not visible here).
    """
    payload = 'data'
    attrs = {}
    ack_id = 'ack_id'
    # The same publish instant, as (seconds, nanos) and as an RFC 3339 string.
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    publish_time = '2018-03-12T13:37:01.234567Z'

    pulled_message = test_utils.PullResponseMessage(
        payload, attrs, publish_time_secs, publish_time_nanos, ack_id)
    pull_response = test_utils.create_pull_response([pulled_message])

    expected_element = TestWindowedValue(
        PubsubMessage(payload, attrs),
        timestamp.Timestamp.from_rfc3339(publish_time),
        [window.GlobalWindow()])
    expected_elements = [expected_element]
    mock_pubsub.return_value.pull.return_value = pull_response

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    read_transform = ReadFromPubSub(
        'projects/fakeprj/topics/a_topic', None, None,
        with_attributes=True, timestamp_attribute='nonexistent')
    pcoll = p | read_transform
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()

    expected_ack_calls = [mock.call(mock.ANY, [ack_id])]
    mock_pubsub.return_value.acknowledge.assert_has_calls(expected_ack_calls)
Example #27
0
  def test_ptransform_override_type_hints(self):
    """Checks the element type produced by a replacement transform.

    DoubleParDo is replaced by ToStringParDo twice: once without type hints
    (expected element type: typehints.Any) and once with int -> str hints
    (expected element type: str). The type is read from the input of the
    downstream 'NoOp' step after `replace_all` has run.
    """

    class NoTypeHintOverride(PTransformOverride):
      """Swaps DoubleParDo for a ToStringParDo without type hints."""

      def matches(self, applied_ptransform):
        return isinstance(applied_ptransform.transform, DoubleParDo)

      def get_replacement_transform(self, ptransform):
        return ToStringParDo()

    class WithTypeHintOverride(PTransformOverride):
      """Swaps DoubleParDo for a ToStringParDo hinted as int -> str."""

      def matches(self, applied_ptransform):
        return isinstance(applied_ptransform.transform, DoubleParDo)

      def get_replacement_transform(self, ptransform):
        return (ToStringParDo()
                .with_input_types(int)
                .with_output_types(str))

    for override, expected_type in [(NoTypeHintOverride(), typehints.Any),
                                    (WithTypeHintOverride(), str)]:
      p = TestPipeline()
      pcoll = (p
               | beam.Create([1, 2, 3])
               | 'Operate' >> DoubleParDo()
               | 'NoOp' >> beam.Map(lambda x: x))

      p.replace_all([override])
      # assertEqual, not assertEquals: the alias is deprecated and no longer
      # exists in Python 3.12.
      self.assertEqual(pcoll.producer.inputs[0].element_type, expected_type)
Example #28
0
 def test_read_from_text_single_file(self):
   """Reads the file made by write_data(5) and compares with its data."""
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   pipeline = TestPipeline()
   lines_read = pipeline | 'Read' >> ReadFromText(file_name)
   assert_that(lines_read, equal_to(expected_data))
   pipeline.run()
Example #29
0
  def test_model_use_and_query_metrics(self):
    """Custom metrics example snippets: define, update and query metrics.

    A DoFn records a distribution of word lengths and a counter of words that
    do not match a regex; after the run both metrics are queried from the
    pipeline result and checked.
    """

    import re

    p = TestPipeline()  # Use TestPipeline for testing.
    words = p | beam.Create(['albert', 'sam', 'mark', 'sarah',
                             'swati', 'daniel', 'andrea'])

    # Of the seven words above, 'sam', 'sarah' and 'swati' match 's.*'; the
    # remaining four do not.
    # pylint: disable=unused-variable
    # [START metrics_usage_example]
    class FilterTextFn(beam.DoFn):
      """A DoFn that filters for a specific key based on a regex."""

      def __init__(self, pattern):
        self.pattern = pattern
        # A custom metric can track values in your pipeline as it runs. Create
        # custom metrics to count unmatched words, and know the distribution of
        # word lengths in the input PCollection.
        self.word_len_dist = Metrics.distribution(self.__class__,
                                                  'word_len_dist')
        self.unmatched_words = Metrics.counter(self.__class__,
                                               'unmatched_words')

      def process(self, element):
        word = element
        self.word_len_dist.update(len(word))
        if re.match(self.pattern, word):
          yield element
        else:
          self.unmatched_words.inc()

    filtered_words = (
        words | 'FilterText' >> beam.ParDo(FilterTextFn('s.*')))
    # [END metrics_usage_example]
    # pylint: enable=unused-variable

    # [START metrics_check_values_example]
    result = p.run()
    result.wait_until_finish()

    custom_distribution = result.metrics().query(
        MetricsFilter().with_name('word_len_dist'))['distributions']
    custom_counter = result.metrics().query(
        MetricsFilter().with_name('unmatched_words'))['counters']

    if custom_distribution:
      logging.info('The average word length was %d',
                   custom_distribution[0].committed.mean)
    if custom_counter:
      logging.info('There were %d words that did not match the filter.',
                   custom_counter[0].committed)
    # [END metrics_check_values_example]

    # The checks below index the query results directly, so an empty result
    # fails with an IndexError rather than an assertion error.
    # There should be 4 words that did not match
    self.assertEqual(custom_counter[0].committed, 4)
    # The shortest word is 3 characters, the longest is 6
    self.assertEqual(custom_distribution[0].committed.min, 3)
    self.assertEqual(custom_distribution[0].committed.max, 6)
Example #30
0
 def test_read_all_single_file(self):
   """Feeds one file name to ReadAllFromText and compares with its data."""
   file_name, expected_data = write_data(5)
   assert len(expected_data) == 5
   pipeline = TestPipeline()
   file_names = pipeline | 'Create' >> Create([file_name])
   lines_read = file_names | 'ReadAll' >> ReadAllFromText()
   assert_that(lines_read, equal_to(expected_data))
   pipeline.run()
Example #31
0
 @classmethod
 def setUpClass(cls):
     """Caches the integration TestPipeline and values derived from it.

     unittest invokes this hook on the class with no arguments, so it has to
     be a classmethod.
     """
     cls.test_pipeline = TestPipeline(is_integration_test=True)
     cls.args = cls.test_pipeline.get_full_options_as_args()
     cls.runner_name = type(cls.test_pipeline.runner).__name__
     cls.project = cls.test_pipeline.get_option('project')
Example #32
0
 def test_create_singleton_pcollection(self):
     """Create with a single list element yields that list as one element."""
     only_element = [1, 2, 3]
     pipeline = TestPipeline()
     singleton = pipeline | 'label' >> Create([only_element])
     assert_that(singleton, equal_to([[1, 2, 3]]))
     pipeline.run()
Example #33
0
 def test_match_group_name_pattern(self):
     """Regex.matches with a compiled pattern extracts the named group."""
     compiled = re.compile("x (?P<namedgroup>[xyz]*)")
     with TestPipeline() as p:
         inputs = p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
         result = inputs | util.Regex.matches(compiled, 'namedgroup')
         assert_that(result, equal_to(("xxx", "yyy", "zzz")))
Example #34
0
 def test_match_group_kv_none(self):
     """Regex.matches_kv emits nothing when the input does not match."""
     with TestPipeline() as p:
         inputs = p | beam.Create(["x y z"])
         result = inputs | util.Regex.matches_kv("a (b) (c)", 1, 2)
         assert_that(result, equal_to([]))
Example #35
0
 def test_match_kv_group_name_none(self):
     """Regex.matches_kv with named groups emits nothing on a non-match."""
     pattern = "a (?P<keyname>b) (?P<valuename>c)"
     with TestPipeline() as p:
         inputs = p | beam.Create(["x y z"])
         result = inputs | util.Regex.matches_kv(
             pattern, 'keyname', 'valuename')
         assert_that(result, equal_to([]))
Example #36
0
 def test_replace_first_mixed(self):
     """Regex.replace_first rewrites only the first match of each element."""
     expected = ["abc", "newjx", "newjy", "newjz", "def"]
     with TestPipeline() as p:
         inputs = p | beam.Create(["abc", "xjx", "yjy", "zjz", "def"])
         result = inputs | util.Regex.replace_first("[xyz]", 'new')
         assert_that(result, equal_to(expected))
Example #37
0
    def setUp(self):
        """Parses the Kinesis test options and builds the KinesisHelper.

        Options are taken from the TestPipeline's full option list; anything
        the parser does not recognise is kept in `self.pipeline_args`. Unless
        --use_real_aws is passed, `set_localstack()` is called and the stream
        is created through the helper.
        """
        # (flag, add_argument keyword arguments) in registration order.
        argument_specs = (
            ('--aws_kinesis_stream', {
                'default': 'beam_kinesis_xlang',
                'help': 'Kinesis stream name',
            }),
            ('--aws_access_key', {
                'default': 'accesskey',
                'help': 'Aws access key',
            }),
            ('--aws_secret_key', {
                'default': 'secretkey',
                'help': 'Aws secret key',
            }),
            ('--aws_region', {
                'default': 'us-east-1',
                'help': 'Aws region',
            }),
            ('--aws_service_endpoint', {
                'default': None,
                'help': 'Url to external aws endpoint',
            }),
            ('--use_real_aws', {
                'default': False,
                'dest': 'use_real_aws',
                'action': 'store_true',
                'help': 'Flag whether to use real aws for the tests purpose',
            }),
            ('--expansion_service', {
                'help': 'Url to externally launched expansion service.',
            }),
        )
        parser = argparse.ArgumentParser()
        for flag, spec in argument_specs:
            parser.add_argument(flag, **spec)

        argv = TestPipeline().get_full_options_as_args()
        known_args, self.pipeline_args = parser.parse_known_args(argv)

        self.aws_kinesis_stream = known_args.aws_kinesis_stream
        self.aws_access_key = known_args.aws_access_key
        self.aws_secret_key = known_args.aws_secret_key
        self.aws_region = known_args.aws_region
        self.aws_service_endpoint = known_args.aws_service_endpoint
        self.use_localstack = not known_args.use_real_aws
        self.expansion_service = known_args.expansion_service
        self.producer_properties = {
            'CollectionMaxCount': str(NUM_RECORDS),
            'ConnectTimeout': str(MAX_READ_TIME),
        }

        if self.use_localstack:
            self.set_localstack()

        # The helper is given the endpoint with 'https' replaced by 'http',
        # or None when no endpoint is configured.
        helper_endpoint = None
        if self.aws_service_endpoint:
            helper_endpoint = self.aws_service_endpoint.replace(
                'https', 'http')
        self.kinesis_helper = KinesisHelper(
            self.aws_access_key,
            self.aws_secret_key,
            self.aws_region,
            helper_endpoint,
        )

        if self.use_localstack:
            self.kinesis_helper.create_stream(self.aws_kinesis_stream)
Example #38
0
 def test_tostring_iterables(self):
     """ToString.Iterables joins each tuple's items with commas."""
     rows = [("one", "two", "three"), ("four", "five", "six")]
     with TestPipeline() as p:
         result = p | beam.Create(rows) | util.ToString.Iterables()
         assert_that(result, equal_to(["one,two,three", "four,five,six"]))
Example #39
0
    def test_tostring_elements(self):
        """ToString.Element turns each integer into its string form."""
        with TestPipeline() as p:
            numbers = p | beam.Create([1, 1, 2, 3])
            result = numbers | util.ToString.Element()
            assert_that(result, equal_to(["1", "1", "2", "3"]))
Example #40
0
 def test_match_group(self):
     """Regex.matches with group index 1 extracts the first capture group."""
     with TestPipeline() as p:
         inputs = p | beam.Create(["a", "x xxx", "x yyy", "x zzz"])
         result = inputs | util.Regex.matches("x ([xyz]*)", 1)
         assert_that(result, equal_to(("xxx", "yyy", "zzz")))
Example #41
0
 def test_pipeline_read_file_pattern_large(self):
     """Reads all files matching 'valid-*.vcf' and checks the result.

     The result is checked with `_count_equals_to(9900)`.
     """
     pipeline = TestPipeline()
     vcf_pattern = os.path.join(get_full_dir(), 'valid-*.vcf')
     variants = pipeline | 'Read' >> ReadFromVcf(vcf_pattern)
     assert_that(variants, _count_equals_to(9900))
     pipeline.run()
Example #42
0
 def test_timestamp_param_map(self):
     """Map can take DoFn.TimestampParam; expects MIN_TIMESTAMP per element."""
     with TestPipeline() as p:
         timestamps = (
             p
             | Create([1, 2])
             | beam.Map(lambda _, t=DoFn.TimestampParam: t))
         assert_that(timestamps, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
Example #43
0
    def test_multiple_outputs_with_watermark_advancement(self):
        """Tests that the TestStream can independently control output watermarks."""

        # The 'numbers' watermark is deliberately moved to 20 before the
        # 'letters' watermark is moved to 5, to show that watermark
        # advancement is tracked per PCollection.
        #
        # Two PCollections are produced, (a, b, c) and (1, 2, 3), emitted at
        # different times so that they land in different windows; the windows
        # are what the assertions inspect. If the watermark does not advance,
        # then the windows will be [-inf, -inf). If the windows do not advance
        # separately, then the PCollections will both windowed in [15, 30).
        letters_elements = [
            TimestampedValue(value, event_time)
            for value, event_time in (('a', 6), ('b', 7), ('c', 8))
        ]
        numbers_elements = [
            TimestampedValue(value, event_time)
            for value, event_time in (('1', 21), ('2', 22), ('3', 23))
        ]
        test_stream = (
            TestStream()
            .advance_watermark_to(0, tag='letters')
            .advance_watermark_to(0, tag='numbers')
            .advance_watermark_to(20, tag='numbers')
            .advance_watermark_to(5, tag='letters')
            .add_elements(letters_elements, tag='letters')
            .advance_watermark_to(10, tag='letters')
            .add_elements(numbers_elements, tag='numbers')
            .advance_watermark_to(30, tag='numbers'))

        options = StandardOptions(streaming=True)
        p = TestPipeline(is_integration_test=True, options=options)

        main = p | test_stream

        def window_key_and_group(pcoll, prefix):
            """Windows, keys by 'k' and groups `pcoll`, labelling by prefix."""
            # An AfterWatermark trigger with an early firing shows that the
            # watermark is advancing properly and that each element is
            # emitted in the correct window.
            return (
                pcoll
                | (prefix + ' windows') >> beam.WindowInto(
                    FixedWindows(15),
                    trigger=trigger.AfterWatermark(
                        early=trigger.AfterCount(1)),
                    accumulation_mode=trigger.AccumulationMode.DISCARDING)
                | (prefix + ' with key') >> beam.Map(lambda x: ('k', x))
                | (prefix + ' gbk') >> beam.GroupByKey())

        letters = window_key_and_group(main['letters'], 'letter')
        numbers = window_key_and_group(main['numbers'], 'number')

        # The letters were emitted when the watermark was at 5, so they are
        # expected in the [0, 15) window. The early trigger also makes the
        # ON_TIME pane show up; it is empty because the early firing already
        # emitted the elements and DISCARDING mode drops data once a trigger
        # has fired.
        letters_window = window.IntervalWindow(0, 15)
        expected_letters = {
            letters_window: [('k', ['a', 'b', 'c']), ('k', [])],
        }

        # Same here, except the numbers were emitted at watermark = 20, thus
        # they are in the [15, 30) window.
        numbers_window = window.IntervalWindow(15, 30)
        expected_numbers = {
            numbers_window: [('k', ['1', '2', '3']), ('k', [])],
        }

        assert_that(
            letters,
            equal_to_per_window(expected_letters),
            label='letters assert per window')
        assert_that(
            numbers,
            equal_to_per_window(expected_numbers),
            label='numbers assert per window')

        p.run()
Example #44
0
 def test_tostring_iterables_with_delimeter(self):
     """ToString.Iterables joins each tuple's items with a tab delimiter."""
     data = [("one", "two", "three"), ("four", "five", "six")]
     expected = ["one\ttwo\tthree", "four\tfive\tsix"]
     with TestPipeline() as p:
         rows = p | beam.Create(data)
         result = rows | util.ToString.Iterables("\t")
         assert_that(result, equal_to(expected))
Example #45
0
class CrossLanguageKinesisIOTest(unittest.TestCase):
    """Cross-language Kinesis IO tests, run against real AWS or localstack.

    Unless --use_real_aws is passed, setUp starts a localstack container and
    creates the stream in it; tearDown deletes the stream and stops the
    container.
    """

    @unittest.skipUnless(
        TestPipeline().get_option('aws_kinesis_stream'),
        'Cannot test on real aws without pipeline options provided')
    def test_kinesis_io_roundtrip(self):
        """Writes records to Kinesis and reads them back."""
        # TODO: enable this test for localstack once
        # https://github.com/apache/beam/issues/20416 is resolved
        self.run_kinesis_write()
        self.run_kinesis_read()

    @unittest.skipIf(
        TestPipeline().get_option('aws_kinesis_stream'),
        'Do not test on localstack when pipeline options were provided')
    def test_kinesis_write(self):
        """Writes records and verifies them through the Kinesis helper."""
        # TODO: remove this test once
        # https://github.com/apache/beam/issues/20416 is resolved
        self.run_kinesis_write()
        records = self.kinesis_helper.read_from_stream(self.aws_kinesis_stream)
        self.assertEqual(
            sorted(records),
            sorted([RECORD + str(i).encode() for i in range(NUM_RECORDS)]))

    def run_kinesis_write(self):
        """Runs a pipeline that writes NUM_RECORDS records to the stream."""
        with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
            p.not_use_test_runner_api = True
            _ = (
                p
                | 'Impulse' >> beam.Impulse()
                | 'Generate' >> beam.FlatMap(lambda x: range(NUM_RECORDS))  # pylint: disable=bad-option-value
                | 'Map to bytes' >> beam.Map(lambda x: RECORD + str(x).encode(
                )).with_output_types(bytes)
                | 'WriteToKinesis' >> WriteToKinesis(
                    stream_name=self.aws_kinesis_stream,
                    aws_access_key=self.aws_access_key,
                    aws_secret_key=self.aws_secret_key,
                    region=self.aws_region,
                    service_endpoint=self.aws_service_endpoint,
                    verify_certificate=(not self.use_localstack),
                    partition_key='1',
                    producer_properties=self.producer_properties,
                ))

    def run_kinesis_read(self):
        """Runs a pipeline that reads the records and asserts on them."""
        records = [RECORD + str(i).encode() for i in range(NUM_RECORDS)]

        with TestPipeline(options=PipelineOptions(self.pipeline_args)) as p:
            result = (p
                      | 'ReadFromKinesis' >> ReadDataFromKinesis(
                          stream_name=self.aws_kinesis_stream,
                          aws_access_key=self.aws_access_key,
                          aws_secret_key=self.aws_secret_key,
                          region=self.aws_region,
                          service_endpoint=self.aws_service_endpoint,
                          verify_certificate=not self.use_localstack,
                          max_num_records=NUM_RECORDS,
                          max_read_time=MAX_READ_TIME,
                          request_records_limit=REQUEST_RECORDS_LIMIT,
                          watermark_policy=WatermarkPolicy.ARRIVAL_TIME,
                          watermark_idle_duration_threshold=MAX_READ_TIME,
                          initial_position_in_stream=InitialPositionInStream.
                          AT_TIMESTAMP,
                          initial_timestamp_in_stream=NOW_MILLIS,
                      ).with_output_types(bytes))
            assert_that(result, equal_to(records))

    def set_localstack(self):
        """Starts a localstack container and points the endpoint at it.

        Starting is attempted up to four times; the error from the last
        attempt is logged and re-raised.
        """
        self.localstack = DockerContainer('localstack/localstack:{}'
                                          .format(LOCALSTACK_VERSION))\
          .with_env('SERVICES', 'kinesis')\
          .with_env('KINESIS_PORT', '4568')\
          .with_env('USE_SSL', 'true')\
          .with_exposed_ports(4568)\
          .with_volume_mapping('/var/run/docker.sock', '/var/run/docker.sock', 'rw')

        # Repeat if ReadTimeout is raised.
        max_attempts = 4
        for attempt in range(1, max_attempts + 1):
            try:
                self.localstack.start()
                break
            except Exception:  # pylint: disable=broad-except
                if attempt == max_attempts:
                    logging.error('Could not initialize localstack container')
                    # A bare raise keeps the original traceback intact.
                    raise

        self.aws_service_endpoint = 'https://{}:{}'.format(
            self.localstack.get_container_host_ip(),
            self.localstack.get_exposed_port('4568'),
        )

    def setUp(self):
        """Parses the test options and prepares the stream and helper.

        Options come from the TestPipeline's full option list; unrecognised
        ones are kept in `self.pipeline_args` for the pipelines under test.
        """
        parser = argparse.ArgumentParser()

        parser.add_argument(
            '--aws_kinesis_stream',
            default='beam_kinesis_xlang',
            help='Kinesis stream name',
        )
        parser.add_argument(
            '--aws_access_key',
            default='accesskey',
            help=('Aws access key'),
        )
        parser.add_argument(
            '--aws_secret_key',
            default='secretkey',
            help='Aws secret key',
        )
        parser.add_argument(
            '--aws_region',
            default='us-east-1',
            help='Aws region',
        )
        parser.add_argument(
            '--aws_service_endpoint',
            default=None,
            help='Url to external aws endpoint',
        )
        parser.add_argument(
            '--use_real_aws',
            default=False,
            dest='use_real_aws',
            action='store_true',
            help='Flag whether to use real aws for the tests purpose',
        )
        parser.add_argument(
            '--expansion_service',
            help='Url to externally launched expansion service.',
        )

        pipeline = TestPipeline()
        argv = pipeline.get_full_options_as_args()

        known_args, self.pipeline_args = parser.parse_known_args(argv)

        self.aws_kinesis_stream = known_args.aws_kinesis_stream
        self.aws_access_key = known_args.aws_access_key
        self.aws_secret_key = known_args.aws_secret_key
        self.aws_region = known_args.aws_region
        self.aws_service_endpoint = known_args.aws_service_endpoint
        self.use_localstack = not known_args.use_real_aws
        self.expansion_service = known_args.expansion_service
        self.producer_properties = {
            'CollectionMaxCount': str(NUM_RECORDS),
            'ConnectTimeout': str(MAX_READ_TIME),
        }

        if self.use_localstack:
            self.set_localstack()

        try:
            self.kinesis_helper = KinesisHelper(
                self.aws_access_key,
                self.aws_secret_key,
                self.aws_region,
                self.aws_service_endpoint.replace('https', 'http')
                if self.aws_service_endpoint else None,
            )

            if self.use_localstack:
                self.kinesis_helper.create_stream(self.aws_kinesis_stream)
        except Exception:
            # unittest does not call tearDown when setUp fails, so the
            # container started above has to be stopped here.
            if self.use_localstack:
                self._stop_localstack()
            raise

    def tearDown(self):
        """Deletes the stream and stops localstack when it was used."""
        if not self.use_localstack:
            return

        try:
            self.kinesis_helper.delete_stream(self.aws_kinesis_stream)
        finally:
            # Stop the container even if deleting the stream failed.
            self._stop_localstack()

    def _stop_localstack(self):
        """Stops the localstack container; failures are logged, not raised."""
        try:
            self.localstack.stop()
        except Exception:  # pylint: disable=broad-except
            logging.exception('Could not stop the localstack container')
Example #46
0
 def test_tostring_kvs(self):
     """ToString.Kvs joins each key and value with a comma."""
     pairs = [("one", 1), ("two", 2)]
     with TestPipeline() as p:
         result = p | beam.Create(pairs) | util.ToString.Kvs()
         assert_that(result, equal_to(["one,1", "two,2"]))
Example #47
0
    DockerContainer = None
# pylint: enable=wrong-import-order, wrong-import-position, ungrouped-imports

# Version string for localstack (presumably the Docker image tag; confirm
# against where the container is created).
LOCALSTACK_VERSION = '0.11.3'
# Number of records used by the tests.
NUM_RECORDS = 10
MAX_READ_TIME = 5 * 60 * 1000  # 5min, expressed in milliseconds
# Wall-clock time captured once, when this module is imported, as float
# seconds since the epoch and as the same instant in milliseconds.
NOW_SECONDS = time.time()
NOW_MILLIS = NOW_SECONDS * 1000
REQUEST_RECORDS_LIMIT = 1000
# Bytes prefix containing a random UUID, generated once per import.
RECORD = b'record' + str(uuid.uuid4()).encode()


@unittest.skipUnless(DockerContainer, 'testcontainers is not installed.')
@unittest.skipUnless(boto3, 'boto3 is not installed.')
@unittest.skipUnless(
    TestPipeline().get_pipeline_options().view_as(StandardOptions).runner,
    'Do not run this test on precommit suites.')
class CrossLanguageKinesisIOTest(unittest.TestCase):
    @unittest.skipUnless(
        TestPipeline().get_option('aws_kinesis_stream'),
        'Cannot test on real aws without pipeline options provided')
    def test_kinesis_io_roundtrip(self):
        # TODO: enable this test for localstack once
        # https://github.com/apache/beam/issues/20416 is resolved
        self.run_kinesis_write()
        self.run_kinesis_read()

    @unittest.skipIf(
        TestPipeline().get_option('aws_kinesis_stream'),
        'Do not test on localstack when pipeline options were provided')
    def test_kinesis_write(self):
Example #48
0
 def test_tostring_kvs_delimeter(self):
     """ToString.Kvs joins each key and value with a tab delimiter."""
     pairs = [("one", 1), ("two", 2)]
     with TestPipeline() as p:
         result = p | beam.Create(pairs) | util.ToString.Kvs("\t")
         assert_that(result, equal_to(["one\t1", "two\t2"]))
Example #49
0
 def test_replace_all(self):
     """Regex.replace_all substitutes every match of the pattern."""
     with TestPipeline() as p:
         inputs = p | beam.Create(["xj", "yj", "zj"])
         result = inputs | util.Regex.replace_all("[xyz]", "new")
         assert_that(result, equal_to(["newj", "newj", "newj"]))
Example #50
0
 def test_pipeline_read_single_file_large(self):
     """Reads 'valid-4.0.vcf' and checks the result.

     The result is checked with `_count_equals_to(5)`.
     """
     pipeline = TestPipeline()
     vcf_path = get_full_file_path('valid-4.0.vcf')
     variants = pipeline | 'Read' >> ReadFromVcf(vcf_path)
     assert_that(variants, _count_equals_to(5))
     pipeline.run()
Example #51
0
 def test_match_kv_group_names_pattern(self):
     """Regex.matches_kv with a compiled pattern pairs two named groups."""
     compiled = re.compile("a (?P<keyname>b) (?P<valuename>c)")
     with TestPipeline() as p:
         inputs = p | beam.Create(["a b c"])
         result = inputs | util.Regex.matches_kv(
             compiled, 'keyname', 'valuename')
         assert_that(result, equal_to([("b", "c")]))
Example #52
0
class PubSubIntegrationTest(unittest.TestCase):
    """Streaming integration test that round-trips messages through PubSub.

    Input and expected output messages are keyed by the runner's class name.
    """

    ID_LABEL = 'id'
    TIMESTAMP_ATTRIBUTE = 'timestamp'
    INPUT_MESSAGES = {
        # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
        # label_ids, nor writing timestamp attributes. Once these features exist,
        # TestDirectRunner and TestDataflowRunner should behave identically.
        'TestDirectRunner': [
            PubsubMessage('data001', {}),
            # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
            # IT pipeline writes back the timestamp of each element (as reported
            # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
            PubsubMessage('data002', {
                TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
            }),
        ],
        'TestDataflowRunner': [
            # Use ID_LABEL attribute to deduplicate messages with the same ID.
            PubsubMessage('data001', {ID_LABEL: 'foo'}),
            PubsubMessage('data001', {ID_LABEL: 'foo'}),
            PubsubMessage('data001', {ID_LABEL: 'foo'}),
            # For those elements that have the TIMESTAMP_ATTRIBUTE attribute, the
            # IT pipeline writes back the timestamp of each element (as reported
            # by Beam), as a TIMESTAMP_ATTRIBUTE + '_out' attribute.
            PubsubMessage('data002', {
                TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
            })
        ],
    }
    EXPECTED_OUTPUT_MESSAGES = {
        'TestDirectRunner': [
            PubsubMessage('data001-seen', {'processed': 'IT'}),
            PubsubMessage(
                'data002-seen', {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                    TIMESTAMP_ATTRIBUTE + '_out':
                    '2018-07-11T02:02:50.149000Z',
                    'processed': 'IT',
                }),
        ],
        'TestDataflowRunner': [
            PubsubMessage('data001-seen', {'processed': 'IT'}),
            PubsubMessage(
                'data002-seen', {
                    TIMESTAMP_ATTRIBUTE + '_out':
                    '2018-07-11T02:02:50.149000Z',
                    'processed': 'IT',
                }),
        ],
    }

    def setUp(self):
        """Creates uniquely named topics and subscriptions for this test."""
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')
        self.uuid = str(uuid.uuid4())

        # Set up PubSub environment.
        from google.cloud import pubsub
        client = pubsub.Client(project=self.project)
        self.pubsub_client = client
        self.input_topic = client.topic(INPUT_TOPIC + self.uuid)
        self.output_topic = client.topic(OUTPUT_TOPIC + self.uuid)
        self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
        self.output_sub = self.output_topic.subscription(
            OUTPUT_SUB + self.uuid)

        # Topics are created and waited for before their subscriptions.
        topics = [self.input_topic, self.output_topic]
        for topic in topics:
            topic.create()
        test_utils.wait_for_topics_created(topics)
        for subscription in (self.input_sub, self.output_sub):
            subscription.create()

    def tearDown(self):
        """Cleans up the subscriptions first, then the topics."""
        test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
        test_utils.cleanup_topics([self.input_topic, self.output_topic])

    def _test_streaming(self, with_attributes):
        """Runs IT pipeline with message verifier.

        Args:
          with_attributes: False - Reads and writes message data only.
            True - Reads and writes message data and attributes. Also
            verifies id_label and timestamp_attribute features.
        """
        # Set on_success_matcher to verify pipeline state and pubsub output.
        # These verifications run on a (remote) worker.

        # Expect the state to be RUNNING since a streaming pipeline is usually
        # never DONE. The test runner will cancel the pipeline after
        # verification.
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

        expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
        if not with_attributes:
            # Data-only mode compares payloads, not whole messages.
            expected_messages = [
                message.data for message in expected_messages
            ]

        strip_attributes = (
            None if self.runner_name == 'TestDirectRunner' else
            [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])

        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            OUTPUT_SUB + self.uuid,
            expected_messages,
            timeout=MESSAGE_MATCHER_TIMEOUT_S,
            with_attributes=with_attributes,
            strip_attributes=strip_attributes)
        success_matcher = all_of(state_verifier, pubsub_msg_verifier)
        extra_opts = {
            'input_subscription': self.input_sub.full_name,
            'output_topic': self.output_topic.full_name,
            'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
            'on_success_matcher': success_matcher,
        }

        # Generate input data and inject to PubSub.
        test_utils.wait_for_subscriptions_created([self.input_sub])
        for message in self.INPUT_MESSAGES[self.runner_name]:
            self.input_topic.publish(message.data, **message.attributes)

        # Get pipeline options from command argument: --test-pipeline-options,
        # and start pipeline job by calling pipeline main function.
        pipeline_argv = self.test_pipeline.get_full_options_as_args(
            **extra_opts)
        pubsub_it_pipeline.run_pipeline(
            argv=pipeline_argv,
            with_attributes=with_attributes,
            id_label=self.ID_LABEL,
            timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

    @attr('IT')
    def test_streaming_data_only(self):
        """Streams message data only."""
        self._test_streaming(with_attributes=False)

    @attr('IT')
    def test_streaming_with_attributes(self):
        """Streams message data together with attributes."""
        self._test_streaming(with_attributes=True)
Example #53
0
 def test_match_group_empty(self):
     """Regex.matches emits nothing when no element matches the pattern."""
     pattern = "x (?P<namedgroup>[xyz]*)"
     with TestPipeline() as p:
         inputs = p | beam.Create(["a", "b", "c", "d"])
         result = inputs | util.Regex.matches(pattern, 'namedgroup')
         assert_that(result, equal_to([]))
Example #54
0
class PubSubIntegrationTest(unittest.TestCase):
  """Integration test for the Pub/Sub streaming IT pipeline.

  Each test publishes INPUT_MESSAGES to a freshly created input topic, runs
  ``pubsub_it_pipeline.run_pipeline`` against it, and relies on a
  PubSubMessageMatcher to compare the output subscription's contents with
  EXPECTED_OUTPUT_MESSAGES.
  """

  ID_LABEL = 'id'
  TIMESTAMP_ATTRIBUTE = 'timestamp'
  INPUT_MESSAGES = [
      # Three messages share one ID_LABEL value; the ID_LABEL attribute is
      # used to deduplicate messages with the same ID.
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      PubsubMessage('data001', {ID_LABEL: 'foo'}),
      # For elements carrying TIMESTAMP_ATTRIBUTE, the IT pipeline writes the
      # element timestamp (as reported by Beam) back out under the
      # TIMESTAMP_ATTRIBUTE + '_out' attribute.
      PubsubMessage('data002', {
          TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
      }),
  ]
  EXPECTED_OUTPUT_MESSAGES = [
      PubsubMessage('data001-seen', {'processed': 'IT'}),
      PubsubMessage('data002-seen', {
          TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
          'processed': 'IT',
      }),
  ]

  def setUp(self):
    """Creates uniquely named Pub/Sub topics and subscriptions for one test."""
    self.test_pipeline = TestPipeline(is_integration_test=True)
    self.project = self.test_pipeline.get_option('project')
    # A per-test UUID suffix is appended to every resource name below.
    self.uuid = str(uuid.uuid4())
    suffix = self.uuid

    # The Pub/Sub client library is imported here rather than at module level.
    from google.cloud import pubsub
    client = pubsub.Client(project=self.project)
    self.pubsub_client = client
    self.input_topic = client.topic(INPUT_TOPIC + suffix)
    self.output_topic = client.topic(OUTPUT_TOPIC + suffix)
    self.input_sub = self.input_topic.subscription(INPUT_SUB + suffix)
    self.output_sub = self.output_topic.subscription(OUTPUT_SUB + suffix)

    # Topics are created (and waited for) before their subscriptions.
    topics = [self.input_topic, self.output_topic]
    for topic in topics:
      topic.create()
    test_utils.wait_for_topics_created(topics)
    for subscription in (self.input_sub, self.output_sub):
      subscription.create()

  def tearDown(self):
    """Cleans up the subscriptions first, then the topics, made in setUp."""
    subscriptions = [self.input_sub, self.output_sub]
    test_utils.cleanup_subscriptions(subscriptions)
    topics = [self.input_topic, self.output_topic]
    test_utils.cleanup_topics(topics)

  def _test_streaming(self, with_attributes):
    """Runs the IT pipeline with a state matcher and a message matcher.

    Args:
      with_attributes: False - Reads and writes message data only.
        True - Reads and writes message data and attributes. Also verifies
        id_label and timestamp_attribute features.
    """
    # Matcher for the pipeline state.
    state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

    # Expected output: full messages, or just their payloads in data-only mode.
    if with_attributes:
      expected_messages = self.EXPECTED_OUTPUT_MESSAGES
    else:
      expected_messages = [
          message.data for message in self.EXPECTED_OUTPUT_MESSAGES]

    ignored_attributes = [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE]
    pubsub_msg_verifier = PubSubMessageMatcher(
        self.project,
        OUTPUT_SUB + self.uuid,
        expected_messages,
        timeout=MESSAGE_MATCHER_TIMEOUT_S,
        with_attributes=with_attributes,
        strip_attributes=ignored_attributes)

    # Extra pipeline options used for the test run.
    extra_opts = {
        'input_subscription': self.input_sub.full_name,
        'output_topic': self.output_topic.full_name,
        'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
        'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier),
    }

    # Inject the input data into Pub/Sub once the subscription exists.
    test_utils.wait_for_subscriptions_created([self.input_sub])
    for message in self.INPUT_MESSAGES:
      self.input_topic.publish(message.data, **message.attributes)

    # Pipeline options come from the --test-pipeline-options command argument
    # merged with extra_opts; the job is started via the pipeline main
    # function.
    pipeline_args = self.test_pipeline.get_full_options_as_args(**extra_opts)
    pubsub_it_pipeline.run_pipeline(
        argv=pipeline_args,
        with_attributes=with_attributes,
        id_label=self.ID_LABEL,
        timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

  @attr('IT')
  def test_streaming_data_only(self):
    """Streaming check in data-only mode."""
    self._test_streaming(with_attributes=False)

  @attr('IT')
  def test_streaming_with_attributes(self):
    """Streaming check with message attributes enabled."""
    self._test_streaming(with_attributes=True)
Example #55
0
 def test_reshuffle_contents_unchanged(self):
     """Assert that Reshuffle outputs exactly the elements it was given."""
     pipeline = TestPipeline()
     data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
     created = pipeline | beam.Create(data)
     reshuffled = created | beam.Reshuffle()
     assert_that(reshuffled, equal_to(data))
     pipeline.run()
Example #56
0
 def test_fake_read(self):
     """Assert that reading a FakeSource over [1, 2, 3] yields those values."""
     pipeline = TestPipeline()
     source = FakeSource([1, 2, 3])
     pcoll = pipeline | 'read' >> Read(source)
     expected = [1, 2, 3]
     assert_that(pcoll, equal_to(expected))
     pipeline.run()
Example #57
0
 def test_constant_k(self):
     """Check that WithKeys('k') pairs every element with the constant key 'k'.

     The assertion is attached inside the ``with`` block on purpose: the
     pipeline is run when the block exits, so an ``assert_that`` added after
     the block would never be executed and the test would pass vacuously.
     """
     with TestPipeline() as p:
         pc = p | beam.Create(self.l)
         with_keys = pc | util.WithKeys('k')
         # Expected values presume self.l holds 1, 2, 3 (set up outside this
         # method) -- TODO confirm against the fixture.
         assert_that(with_keys, equal_to([('k', 1), ('k', 2), ('k', 3)], ))
Example #58
0
 def test_callable_k(self):
     """Check that WithKeys with a callable keys each element by ``x * x``.

     The assertion is attached inside the ``with`` block on purpose: the
     pipeline is run when the block exits, so an ``assert_that`` added after
     the block would never be executed and the test would pass vacuously.
     """
     with TestPipeline() as p:
         pc = p | beam.Create(self.l)
         with_keys = pc | util.WithKeys(lambda x: x * x)
         # Expected values presume self.l holds 1, 2, 3 (set up outside this
         # method) -- TODO confirm against the fixture.
         assert_that(with_keys, equal_to([(1, 1), (4, 2), (9, 3)]))
Example #59
0
 def test_apply_custom_transform(self):
     """Apply PipelineTest.CustomTransform to [1, 2, 3] and expect [2, 3, 4]."""
     pipeline = TestPipeline()
     source = pipeline | 'pcoll' >> Create([1, 2, 3])
     transformed = source | PipelineTest.CustomTransform()
     expected = [2, 3, 4]
     assert_that(transformed, equal_to(expected))
     pipeline.run()
Example #60
0
 def test_match_none(self):
     """Expect no output from Regex.matches("[xyz]") on the letters "a".."d"."""
     with TestPipeline() as pipeline:
         letters = pipeline | beam.Create(["a", "b", "c", "d"])
         matches = letters | util.Regex.matches("[xyz]")
         assert_that(matches, equal_to([]))