Ejemplo n.º 1
0
 def test_timestamped_with_combiners(self):
   """Check per-key, per-window sum and mean over re-timestamped values.

   The values 0..9 are stamped with their own value as timestamp and keyed
   by ``v // 5``; the expected results are sums (10, 35) and means
   (2.0, 7.0) for keys 0 and 1.
   """
   p = TestPipeline()
   result = (p
             # Create some initial (value, timestamp) pairs.
             | 'start' >> Create([(k, k) for k in range(10)])
             # WindowInto only establishes the FixedWindows windowing
             # function for the PCollection here; the elements are given
             # their intended timestamps by the next step.
             | 'w' >> WindowInto(FixedWindows(5))
             # Use the second tuple member as the element timestamp. Indexing
             # replaces the tuple-parameter lambda, which is a SyntaxError on
             # Python 3 (PEP 3113). The original comment states that Map
             # propagates the windowing function from input to output.
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             # Key each value by the index of its size-5 window; element
             # order in a PCollection is not guaranteed. Floor division keeps
             # the key an int on both Python 2 and Python 3.
             | Map(lambda v: (v // 5, v)))
   # CombinePerKey effectively combines per key *and* window, the same way
   # GroupByKey groups per key and window.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
Ejemplo n.º 2
0
 def test_dataflow_single_file(self):
   """Read the file produced by write_data(5) and compare with its data."""
   path, lines = write_data(5)
   assert 5 == len(lines)
   p = TestPipeline()
   read_back = p | 'Read' >> ReadFromText(path)
   assert_that(read_back, equal_to(lines))
   p.run()
 def run_pipeline(self, count_implementation, factor=1):
   """Apply a counting transform to a fixed word list and verify the counts.

   Args:
     count_implementation: transform applied to the created words.
     factor: multiplier applied to the expected counts (3 CAT, 2 DOG).
   """
   expected_counts = [('CAT', (3 * factor)), ('DOG', (2 * factor))]
   pipeline = TestPipeline()
   counted = (pipeline
              | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
              | count_implementation)
   assert_that(counted, equal_to(expected_counts))
   pipeline.run()
Ejemplo n.º 4
0
 def test_timestamped_with_combiners(self):
   """Check per-key, per-window sum and mean over re-timestamped values.

   The values 0..9 are stamped with their own value as timestamp and keyed
   by ``v // 5``; the expected results are sums (10, 35) and means
   (2.0, 7.0) for keys 0 and 1.
   """
   p = TestPipeline()
   result = (p
             # Create some initial (value, timestamp) pairs.
             | 'start' >> Create([(k, k) for k in range(10)])
             # WindowInto only establishes the FixedWindows windowing
             # function for the PCollection here; the elements are given
             # their intended timestamps by the next step.
             | 'w' >> WindowInto(FixedWindows(5))
             # Use the second tuple member as the element timestamp. Indexing
             # replaces the tuple-parameter lambda, which is a SyntaxError on
             # Python 3 (PEP 3113). The original comment states that Map
             # propagates the windowing function from input to output.
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             # Key each value by the index of its size-5 window; element
             # order in a PCollection is not guaranteed. Floor division keeps
             # the key an int on both Python 2 and Python 3.
             | Map(lambda v: (v // 5, v)))
   # CombinePerKey effectively combines per key *and* window, the same way
   # GroupByKey groups per key and window.
   sum_per_window = result | CombinePerKey(sum)
   # Compute mean per key and window.
   mean_per_window = result | combiners.Mean.PerKey()
   assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
               label='assert:sum')
   assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
               label='assert:mean')
   p.run()
Ejemplo n.º 5
0
 def run_pipeline(self, count_implementation, factor=1):
   """Run count_implementation over five fixed words and check the totals.

   Args:
     count_implementation: transform applied to the created words.
     factor: multiplier applied to the expected counts (3 CAT, 2 DOG).
   """
   pipeline = TestPipeline()
   source_words = pipeline | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
   counts = source_words | count_implementation
   expected = [('CAT', (3 * factor)), ('DOG', (2 * factor))]
   assert_that(counts, equal_to(expected))
   pipeline.run()
Ejemplo n.º 6
0
    def test_to_list_and_to_dict(self):
        """Verify combine.ToList and combine.ToDict, each in its own pipeline."""
        pipeline = TestPipeline()
        the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
        pcoll = pipeline | 'start' >> Create(the_list)
        result = pcoll | 'to list' >> combine.ToList()

        def matcher(expected):
            # Compare the first output element against the first expected
            # list using equal_to.
            def match(actual):
                equal_to(expected[0])(actual[0])

            return match

        assert_that(result, matcher([the_list]))
        pipeline.run()

        pipeline = TestPipeline()
        pairs = [(1, 2), (3, 4), (5, 6)]
        pcoll = pipeline | 'start-pairs' >> Create(pairs)
        result = pcoll | 'to dict' >> combine.ToDict()

        def matcher():
            def match(actual):
                # Exactly one dict is expected in the output.
                equal_to([1])([len(actual)])
                # items() is available on both Python 2 and Python 3, unlike
                # the Python 2-only iteritems().
                equal_to(pairs)(actual[0].items())

            return match

        assert_that(result, matcher())
        pipeline.run()
Ejemplo n.º 7
0
 def test_dataflow_file_pattern(self):
     """Read everything matching a file pattern and compare with the data."""
     file_sizes = [5, 3, 12, 8, 8, 4]
     pattern, expected_data = write_pattern(file_sizes)
     # The six sizes above add up to 40 expected records.
     assert 40 == len(expected_data)
     p = TestPipeline()
     lines_read = p | 'Read' >> ReadFromText(pattern)
     assert_that(lines_read, equal_to(expected_data))
     p.run()
Ejemplo n.º 8
0
  def test_run_direct(self):
    """Read a four-line temp file through LineSource and check the lines."""
    path = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    expected_lines = ['aaaa', 'bbbb', 'cccc', 'dddd']

    p = TestPipeline()
    lines_read = p | beam.io.Read(LineSource(path))
    assert_that(lines_read, equal_to(expected_lines))
    p.run()
Ejemplo n.º 9
0
 def test_pardo(self):
     """Chain two Map steps: double each string, then append 'x'."""
     with self.create_pipeline() as p:
         strings = p | beam.Create(['a', 'bc'])
         # Keep the two lambdas on separate lines, as in the original chain.
         doubled = strings | beam.Map(lambda s: s * 2)
         suffixed = doubled | beam.Map(lambda s: s + 'x')
         assert_that(suffixed, equal_to(['aax', 'bcbcx']))
Ejemplo n.º 10
0
    def test_run_direct(self):
        """Read a four-line temp file through LineSource and check the lines."""
        expected_lines = ['aaaa', 'bbbb', 'cccc', 'dddd']
        temp_path = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')

        p = TestPipeline()
        source = LineSource(temp_path)
        assert_that(p | beam.io.Read(source), equal_to(expected_lines))
        p.run()
Ejemplo n.º 11
0
 def test_group_by_key(self):
     """Group key/value pairs by key and compare the sorted value lists."""
     with self.create_pipeline() as p:
         res = (p
                | beam.Create([('a', 1), ('a', 2), ('b', 3)])
                | beam.GroupByKey()
                # Sort the grouped values so the comparison does not depend
                # on their order. Indexing replaces the tuple-parameter
                # lambda, which is a SyntaxError on Python 3 (PEP 3113).
                | beam.Map(lambda kv: (kv[0], sorted(kv[1]))))
         assert_that(res, equal_to([('a', [1, 2]), ('b', [3])]))
Ejemplo n.º 12
0
 def test_dataflow_file_pattern(self):
   """Read all files matching a pattern and compare with the written data."""
   glob_pattern, records = write_pattern([5, 3, 12, 8, 8, 4])
   # The six sizes passed above add up to 40 expected records.
   assert 40 == len(records)
   p = TestPipeline()
   assert_that(p | 'Read' >> ReadFromText(glob_pattern), equal_to(records))
   p.run()
Ejemplo n.º 13
0
 def test_dataflow_single_file(self):
     """Read the file produced by write_data(5) and compare with its data."""
     source_path, written = write_data(5)
     assert 5 == len(written)
     p = TestPipeline()
     assert_that(p | 'Read' >> ReadFromText(source_path), equal_to(written))
     p.run()
Ejemplo n.º 14
0
 def test_read_gzip_empty_file(self):
     """Read a temp file created without content as GZIP; expect no output."""
     empty_path = self._create_temp_file()
     reader = ReadFromText(
         empty_path, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     p = TestPipeline()
     lines_read = p | 'Read' >> reader
     assert_that(lines_read, equal_to([]))
     p.run()
Ejemplo n.º 15
0
  def test_to_list_and_to_dict(self):
    """Verify combine.ToList and combine.ToDict, each in its own pipeline."""
    pipeline = TestPipeline()
    the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    pcoll = pipeline | 'start' >> Create(the_list)
    result = pcoll | 'to list' >> combine.ToList()

    def matcher(expected):
      # Compare the first output element against the first expected list
      # using equal_to.
      def match(actual):
        equal_to(expected[0])(actual[0])
      return match
    assert_that(result, matcher([the_list]))
    pipeline.run()

    pipeline = TestPipeline()
    pairs = [(1, 2), (3, 4), (5, 6)]
    pcoll = pipeline | 'start-pairs' >> Create(pairs)
    result = pcoll | 'to dict' >> combine.ToDict()

    def matcher():
      def match(actual):
        # Exactly one dict is expected in the output.
        equal_to([1])([len(actual)])
        # items() is available on both Python 2 and Python 3, unlike the
        # Python 2-only iteritems().
        equal_to(pairs)(actual[0].items())
      return match
    assert_that(result, matcher())
    pipeline.run()
Ejemplo n.º 16
0
 def test_read(self):
     """Write three lines to a temp file and read them with ReadFromText."""
     with tempfile.NamedTemporaryFile() as temp_file:
         # NamedTemporaryFile opens in binary mode by default, so write
         # bytes; a text literal raises TypeError on Python 3.
         temp_file.write(b'a\nb\nc')
         # Flush so the data is on disk before it is read back by name.
         temp_file.flush()
         with self.create_pipeline() as p:
             assert_that(p | beam.io.ReadFromText(temp_file.name),
                         equal_to(['a', 'b', 'c']))
Ejemplo n.º 17
0
 def test_compute_points(self):
   """Sum the per-key output of coders.compute_points over SAMPLE_RECORDS."""
   expected_totals = [('Italy', 0), ('Brasil', 6), ('Germany', 3)]
   pipeline = TestPipeline()
   records = pipeline | 'create' >> beam.Create(self.SAMPLE_RECORDS)
   points = records | 'points' >> beam.FlatMap(coders.compute_points)
   totals = points | beam.CombinePerKey(sum)
   assert_that(totals, equal_to(expected_totals))
   pipeline.run()
Ejemplo n.º 18
0
 def test_default_value_singleton_side_input(self):
   """Multiply [1, 2] by an empty singleton side input with default 10.

   The expected output [10, 20] corresponds to the default value being used.
   """
   p = self.create_pipeline()
   main = p | 'start' >> beam.Create([1, 2])
   empty_side = p | 'side' >> beam.Create([])  # Side input holds no values.
   side_view = beam.pvalue.AsSingleton(empty_side, 10)
   scaled = main | beam.FlatMap(lambda x, s: [x * s], side_view)
   assert_that(scaled, equal_to([10, 20]))
   p.run()
Ejemplo n.º 19
0
  def test_basics(self):
    """Run EstimatePiTransform(5000) and check the estimate's range."""
    pipeline = TestPipeline()
    estimate = pipeline | 'Estimate' >> estimate_pi.EstimatePiTransform(5000)

    # The check is probabilistic: per the original note, at least 500
    # thousand trials are run, so a failure is possible but very unlikely.
    lower_bound, upper_bound = 3.125, 3.155
    assert_that(estimate, in_between(lower_bound, upper_bound))
    pipeline.run()
Ejemplo n.º 20
0
    def test_basics(self):
        """Run EstimatePiTransform(5000) and check the estimate's range."""
        pi_estimator = estimate_pi.EstimatePiTransform(5000)
        pipeline = TestPipeline()
        estimate = pipeline | 'Estimate' >> pi_estimator

        # The check is probabilistic: per the original note, at least 500
        # thousand trials are run, so a failure is possible but very unlikely.
        assert_that(estimate, in_between(3.125, 3.155))
        pipeline.run()
Ejemplo n.º 21
0
 def test_read_gzip_empty_file(self):
     """Read a freshly created, empty temp file as GZIP; expect no output."""
     # Close the handle right away instead of leaking it; delete=False keeps
     # the (empty) file on disk so it can be read back by name.
     with tempfile.NamedTemporaryFile(delete=False,
                                      prefix=tempfile.template) as temp_file:
         filename = temp_file.name
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> ReadFromText(
         filename, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
     assert_that(pcoll, equal_to([]))
     pipeline.run()
Ejemplo n.º 22
0
 def test_default_value_singleton_side_input(self):
   """Multiply [1, 2] by an empty singleton side input with default 10.

   The expected output [10, 20] corresponds to the default value being used.
   """
   p = self.create_pipeline()
   numbers = p | 'start' >> beam.Create([1, 2])
   no_values = p | 'side' >> beam.Create([])  # Nothing in the side input.
   products = numbers | beam.FlatMap(
       lambda x, s: [x * s],
       beam.pvalue.AsSingleton(no_values, 10))
   assert_that(products, equal_to([10, 20]))
   p.run()
Ejemplo n.º 23
0
 def test_tuple_combine_fn(self):
     """Combine 3-tuples component-wise with max, mean and sum."""
     component_fns = combine.TupleCombineFn(max, combine.MeanCombineFn(), sum)
     rows = [('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)]
     pipeline = TestPipeline()
     combined = (pipeline
                 | Create(rows)
                 | beam.CombineGlobally(component_fns).without_defaults())
     assert_that(combined, equal_to([('c', 111.0 / 3, 99.0)]))
     pipeline.run()
Ejemplo n.º 24
0
  def test_element(self):
    """A DoFn taking only the element adds 10 to each input."""
    class TestDoFn(DoFn):
      def process(self, element):
        shifted = element + 10
        yield shifted

    p = TestPipeline()
    numbers = p | 'Create' >> Create([1, 2])
    outputs = numbers | 'Do' >> ParDo(TestDoFn())
    assert_that(outputs, equal_to([11, 12]))
    p.run()
Ejemplo n.º 25
0
 def test_iterable_side_input(self):
   """Multiply each main element by every value of an AsIter side input."""
   p = self.create_pipeline()
   main = p | 'start' >> beam.Create([1, 2])
   factors = p | 'side' >> beam.Create([3, 4])  # Two side-input values.
   factors_view = beam.pvalue.AsIter(factors)
   products = main | 'compute' >> beam.FlatMap(
       lambda x, s: [x * y for y in s], factors_view)
   assert_that(products, equal_to([3, 4, 6, 8]))
   p.run()
Ejemplo n.º 26
0
 def test_read_gzip_empty_file(self):
   """Read a temp file created without content as GZIP; expect no output."""
   path = self._create_temp_file()
   gzip_reader = ReadFromText(
       path, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
   p = TestPipeline()
   assert_that(p | 'Read' >> gzip_reader, equal_to([]))
   p.run()
Ejemplo n.º 27
0
  def test_context_param(self):
    """A DoFn can read the current element through DoFn.ContextParam."""
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        current = context.element
        yield current + 10

    p = TestPipeline()
    numbers = p | 'Create' >> Create([1, 2])
    outputs = numbers | 'Do' >> ParDo(TestDoFn())
    assert_that(outputs, equal_to([11, 12]))
    p.run()
Ejemplo n.º 28
0
  def test_timestamp_param(self):
    """Elements from Create are expected to carry MIN_TIMESTAMP."""
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        # Emit the timestamp rather than the element itself.
        yield timestamp

    p = TestPipeline()
    numbers = p | 'Create' >> Create([1, 2])
    stamps = numbers | 'Do' >> ParDo(TestDoFn())
    assert_that(stamps, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    p.run()
Ejemplo n.º 29
0
 def test_iterable_side_input(self):
   """Multiply each main element by every value of an AsIter side input."""
   p = self.create_pipeline()
   numbers = p | 'start' >> beam.Create([1, 2])
   multipliers = p | 'side' >> beam.Create([3, 4])  # Two side-input values.
   crossed = numbers | 'compute' >> beam.FlatMap(
       lambda x, s: [x * y for y in s],
       beam.pvalue.AsIter(multipliers))
   expected = [3, 4, 6, 8]
   assert_that(crossed, equal_to(expected))
   p.run()
Ejemplo n.º 30
0
  def test_element(self):
    """A DoFn taking only the element adds 10 to each input."""
    class TestDoFn(DoFn):
      def process(self, element):
        yield 10 + element

    p = TestPipeline()
    created = p | 'Create' >> Create([1, 2])
    assert_that(created | 'Do' >> ParDo(TestDoFn()), equal_to([11, 12]))
    p.run()
Ejemplo n.º 31
0
  def test_timestamp_param(self):
    """Elements from Create are expected to carry MIN_TIMESTAMP."""
    class TestDoFn(DoFn):
      def process(self, element, timestamp=DoFn.TimestampParam):
        # Emit the timestamp rather than the element itself.
        yield timestamp

    expected_stamps = [MIN_TIMESTAMP, MIN_TIMESTAMP]
    p = TestPipeline()
    created = p | 'Create' >> Create([1, 2])
    assert_that(created | 'Do' >> ParDo(TestDoFn()), equal_to(expected_stamps))
    p.run()
Ejemplo n.º 32
0
  def test_context_param(self):
    """A DoFn can read the current element through DoFn.ContextParam."""
    class TestDoFn(DoFn):
      def process(self, element, context=DoFn.ContextParam):
        yield 10 + context.element

    p = TestPipeline()
    created = p | 'Create' >> Create([1, 2])
    assert_that(created | 'Do' >> ParDo(TestDoFn()), equal_to([11, 12]))
    p.run()
Ejemplo n.º 33
0
 def test_windowing(self):
     """Group timestamped values under one key using Sessions(10) windows.

     The expected output has two groups: [1, 2] and [100, 101, 102].
     """
     with self.create_pipeline() as p:
         res = (p
                | beam.Create([1, 2, 100, 101, 102])
                # Use each value as its own timestamp, under a single key.
                | beam.Map(lambda t: TimestampedValue(('k', t), t))
                | beam.WindowInto(beam.transforms.window.Sessions(10))
                | beam.GroupByKey()
                # Sort the grouped values so the comparison does not depend
                # on their order. Indexing replaces the tuple-parameter
                # lambda, which is a SyntaxError on Python 3 (PEP 3113).
                | beam.Map(lambda kv: (kv[0], sorted(kv[1]))))
         assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
Ejemplo n.º 34
0
  def test_run_concat_direct(self):
    """Read three adjacent RangeSources via ConcatSource; expect 0..999."""
    sub_sources = [
        RangeSource(0, 10),
        RangeSource(10, 100),
        RangeSource(100, 1000),
    ]
    concat = ConcatSource(sub_sources)
    p = TestPipeline()
    values = p | beam.Read(concat)
    assert_that(values, equal_to(range(1000)))
    p.run()
Ejemplo n.º 35
0
 def test_reuse_cloned_custom_transform_instance(self):
   """Apply one CustomTransform instance twice, the second time relabeled."""
   p = TestPipeline()
   first_input = p | 'pc1' >> Create([1, 2, 3])
   second_input = p | 'pc2' >> Create([4, 5, 6])
   shared = PipelineTest.CustomTransform()
   first_output = first_input | shared
   # The second application of the same instance uses a new label.
   second_output = second_input | 'new_label' >> shared
   assert_that(first_output, equal_to([2, 3, 4]), label='r1')
   assert_that(second_output, equal_to([5, 6, 7]), label='r2')
   p.run()
Ejemplo n.º 36
0
 def test_metrics_in_source(self):
   """Check the first counter reported after reading from FakeSource."""
   p = TestPipeline()
   values = p | Read(FakeSource([1, 2, 3, 4, 5, 6]))
   assert_that(values, equal_to([1, 2, 3, 4, 5, 6]))
   run_result = p.run()
   # Only the first reported counter is inspected.
   counter = run_result.metrics().query()['counters'][0]
   counter_key = counter.key
   self.assertEqual(counter_key.step, 'Read')
   self.assertEqual(counter_key.metric.name, 'outputs')
   self.assertEqual(counter.committed, 6)
Ejemplo n.º 37
0
  def test_create(self):
    """Create works from a list and from an iterator object."""
    p = TestPipeline()
    from_list = p | 'label1' >> Create([1, 2, 3])
    assert_that(from_list, equal_to([1, 2, 3]))

    # The initial value may also be an iterator object.
    from_iter = p | 'label2' >> Create(iter((4, 5, 6)))
    plus_ten = from_iter | 'do' >> FlatMap(lambda x: [x + 10])
    assert_that(plus_ten, equal_to([14, 15, 16]), label='pcoll3')
    p.run()
Ejemplo n.º 38
0
    def test_flattened_side_input(self):
        """Use a Flatten of two PCollections as an AsList side input."""
        p = self.create_pipeline()
        main_input = p | 'main input' >> beam.Create([None])
        first_side = p | 'side1' >> beam.Create(['a'])
        second_side = p | 'side2' >> beam.Create(['b'])
        merged_side = (first_side, second_side) | beam.Flatten()
        # The single main element is ignored; the side list is emitted.
        results = main_input | beam.FlatMap(
            lambda _, ab: ab, beam.pvalue.AsList(merged_side))

        assert_that(results, equal_to(['a', 'b']))
        p.run()
Ejemplo n.º 39
0
    def test_read_auto_bzip2(self):
        """Read a '.bz2' file without passing a compression type."""
        _, lines = write_data(15)
        bz2_path = self._create_temp_file(suffix='.bz2')
        payload = '\n'.join(lines)
        with bz2.BZ2File(bz2_path, 'wb') as compressed:
            compressed.write(payload)

        p = TestPipeline()
        lines_read = p | 'Read' >> ReadFromText(bz2_path)
        assert_that(lines_read, equal_to(lines))
        p.run()
Ejemplo n.º 40
0
 def test_tuple_combine_fn(self):
   """Combine 3-tuples component-wise with max, mean and sum."""
   pipeline = TestPipeline()
   rows = pipeline | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])
   per_component = combine.TupleCombineFn(max, combine.MeanCombineFn(), sum)
   combined = rows | beam.CombineGlobally(per_component).without_defaults()
   assert_that(combined, equal_to([('c', 111.0 / 3, 99.0)]))
   pipeline.run()
Ejemplo n.º 41
0
 def test_sink_transform(self):
   """Write RECORDS with WriteToAvro, then read them back and compare."""
   with tempfile.NamedTemporaryFile() as dst:
     base_path = dst.name
     with TestPipeline() as write_pipeline:
       records = write_pipeline | beam.Create(self.RECORDS)
       # pylint: disable=expression-not-assigned
       records | avroio.WriteToAvro(base_path, self.SCHEMA)
     with TestPipeline() as read_pipeline:
       expected = [json.dumps(r) for r in self.RECORDS]
       # json used for stable sortability
       avro_rows = read_pipeline | avroio.ReadFromAvro(base_path + '*')
       readback = avro_rows | beam.Map(json.dumps)
       assert_that(readback, equal_to(expected))
Ejemplo n.º 42
0
 def test_sink_transform(self):
   """Write RECORDS with WriteToAvro, then read them back and compare."""
   with tempfile.NamedTemporaryFile() as dst:
     out_path = dst.name
     with TestPipeline() as p:
       created = p | beam.Create(self.RECORDS)
       # pylint: disable=expression-not-assigned
       created | avroio.WriteToAvro(out_path, self.SCHEMA)
     with TestPipeline() as p:
       # json used for stable sortability
       as_json = (p
                  | avroio.ReadFromAvro(out_path + '*')
                  | beam.Map(json.dumps))
       assert_that(as_json, equal_to([json.dumps(r) for r in self.RECORDS]))
Ejemplo n.º 43
0
 def test_tuple_combine_fn_without_defaults(self):
   """Apply min, mean and max to the same input via with_common_input()."""
   tuple_fn = combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
   common_input_fn = tuple_fn.with_common_input()
   pipeline = TestPipeline()
   numbers = pipeline | Create([1, 1, 2, 3])
   combined = numbers | beam.CombineGlobally(common_input_fn).without_defaults()
   assert_that(combined, equal_to([(1, 7.0 / 4, 3)]))
   pipeline.run()
Ejemplo n.º 44
0
 def test_tuple_combine_fn_without_defaults(self):
     """Apply min, mean and max to the same input via with_common_input()."""
     min_mean_max = combine.TupleCombineFn(
         min, combine.MeanCombineFn(), max).with_common_input()
     pipeline = TestPipeline()
     combined = (
         pipeline
         | Create([1, 1, 2, 3])
         | beam.CombineGlobally(min_mean_max).without_defaults())
     expected = [(1, 7.0 / 4, 3)]
     assert_that(combined, equal_to(expected))
     pipeline.run()
Ejemplo n.º 45
0
  def test_read_auto_bzip2(self):
    """Read a '.bz2' file without passing a compression type."""
    _, lines = write_data(15)
    compressed_path = self._create_temp_file(suffix='.bz2')
    joined = '\n'.join(lines)
    with bz2.BZ2File(compressed_path, 'wb') as out:
      out.write(joined)

    p = TestPipeline()
    assert_that(p | 'Read' >> ReadFromText(compressed_path), equal_to(lines))
    p.run()
Ejemplo n.º 46
0
    def test_create(self):
        """Create works from a list and from an iterator object."""
        p = TestPipeline()
        listed = p | 'label1' >> Create([1, 2, 3])
        assert_that(listed, equal_to([1, 2, 3]))

        # The initial value may also be an iterator object.
        source_iter = iter((4, 5, 6))
        iterated = p | 'label2' >> Create(source_iter)
        shifted = iterated | 'do' >> FlatMap(lambda x: [x + 10])
        assert_that(shifted, equal_to([14, 15, 16]), label='pcoll3')
        p.run()
Ejemplo n.º 47
0
 def test_metrics_in_source(self):
     """Check the first counter reported after reading from FakeSource."""
     source_values = [1, 2, 3, 4, 5, 6]
     p = TestPipeline()
     pcoll = p | Read(FakeSource([1, 2, 3, 4, 5, 6]))
     assert_that(pcoll, equal_to(source_values))
     result = p.run()
     counters = result.metrics().query()['counters']
     # Only the first reported counter is inspected.
     first = counters[0]
     self.assertEqual(first.key.step, 'Read')
     self.assertEqual(first.key.metric.name, 'outputs')
     self.assertEqual(first.committed, 6)
Ejemplo n.º 48
0
 def test_reuse_cloned_custom_transform_instance(self):
     """Apply one CustomTransform instance twice, the second time relabeled."""
     p = TestPipeline()
     low = p | 'pc1' >> Create([1, 2, 3])
     high = p | 'pc2' >> Create([4, 5, 6])
     reused = PipelineTest.CustomTransform()
     low_out = low | reused
     # The second application of the same instance uses a new label.
     high_out = high | 'new_label' >> reused
     assert_that(low_out, equal_to([2, 3, 4]), label='r1')
     assert_that(high_out, equal_to([5, 6, 7]), label='r2')
     p.run()
Ejemplo n.º 49
0
    def test_read_gzip(self):
        """Gzip 15 generated lines and read them with explicit GZIP type."""
        _, lines = write_data(15)
        gz_path = self._create_temp_file()
        content = '\n'.join(lines)
        with gzip.GzipFile(gz_path, 'wb') as gz_out:
            gz_out.write(content)

        reader = ReadFromText(
            gz_path, 0, CompressionTypes.GZIP, True, coders.StrUtf8Coder())
        p = TestPipeline()
        assert_that(p | 'Read' >> reader, equal_to(lines))
        p.run()
Ejemplo n.º 50
0
 def test_timestamped_value(self):
   """Group values 0..9 under FixedWindows(5) after timestamping them.

   Each value is stamped with itself as its timestamp; the expected output
   has two groups under 'key': [0..4] and [5..9].
   """
   p = TestPipeline()
   result = (p
             | 'start' >> Create([(k, k) for k in range(10)])
             # Use the second tuple member as the element timestamp. Indexing
             # replaces the tuple-parameter lambda, which is a SyntaxError on
             # Python 3 (PEP 3113).
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             | 'w' >> WindowInto(FixedWindows(5))
             # Give every value the same key before grouping.
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                 ('key', [5, 6, 7, 8, 9])]))
   p.run()
Ejemplo n.º 51
0
 def test_timestamped_value(self):
   """Group values 0..9 under FixedWindows(5) after timestamping them.

   Each value is stamped with itself as its timestamp; the expected output
   has two groups under 'key': [0..4] and [5..9].
   """
   p = TestPipeline()
   result = (p
             | 'start' >> Create([(k, k) for k in range(10)])
             # Use the second tuple member as the element timestamp. Indexing
             # replaces the tuple-parameter lambda, which is a SyntaxError on
             # Python 3 (PEP 3113).
             | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
             | 'w' >> WindowInto(FixedWindows(5))
             # Give every value the same key before grouping.
             | Map(lambda v: ('key', v))
             | GroupByKey())
   assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                 ('key', [5, 6, 7, 8, 9])]))
   p.run()
Ejemplo n.º 52
0
  def test_global_sample(self):
    """Take nine independent size-3 global samples and validate each one."""
    def is_good_sample(actual):
      # A single sample list is expected in the output.
      assert len(actual) == 1
      # A sample of three drawn from [1, 1, 2, 2] sorts to one of these.
      assert sorted(actual[0]) in [[1, 1, 2], [1, 2, 2]], actual

    with TestPipeline() as pipeline:
      pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])
      # range() works on both Python 2 and Python 3; xrange() exists only on
      # Python 2.
      for ix in range(9):
        # Each iteration gets its own sample and check labels.
        assert_that(
            pcoll | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3),
            is_good_sample,
            label='check-%d' % ix)
Ejemplo n.º 53
0
  def test_read_gzip_with_skip_lines(self):
    """Read a gzip file with skip_header_lines=2; expect lines[2:]."""
    _, lines = write_data(15)
    gz_path = self._create_temp_file()
    body = '\n'.join(lines)
    with gzip.GzipFile(gz_path, 'wb') as gz_out:
      gz_out.write(body)

    reader = ReadFromText(
        gz_path, 0, CompressionTypes.GZIP,
        True, coders.StrUtf8Coder(), skip_header_lines=2)
    p = TestPipeline()
    remaining = p | 'Read' >> reader
    assert_that(remaining, equal_to(lines[2:]))
    p.run()
Ejemplo n.º 54
0
  def test_flattened_side_input(self):
    """Use a Flatten of two PCollections as an AsList side input."""
    p = self.create_pipeline()
    main_input = p | 'main input' >> beam.Create([None])
    side_a = p | 'side1' >> beam.Create(['a'])
    side_b = p | 'side2' >> beam.Create(['b'])
    flattened = (side_a, side_b) | beam.Flatten()
    side_list = beam.pvalue.AsList(flattened)
    # The single main element is ignored; the side list is emitted.
    results = main_input | beam.FlatMap(lambda _, ab: ab, side_list)

    assert_that(results, equal_to(['a', 'b']))
    p.run()
Ejemplo n.º 55
0
 def test_sliding_windows(self):
   """Group three timestamped values under SlidingWindows(period=2, size=4)."""
   expected = [('key @ [-2.0, 2.0)', [1]),
               ('key @ [0.0, 4.0)', [1, 2, 3]),
               ('key @ [2.0, 6.0)', [2, 3])]
   pipeline = TestPipeline()
   keyed = self.timestamped_key_values(pipeline, 'key', 1, 2, 3)
   windowed = keyed | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
   grouped = windowed | GroupByKey()
   reified = grouped | reify_windows
   assert_that(reified, equal_to(expected))
   pipeline.run()
Ejemplo n.º 56
0
 def test_sessions(self):
   """Group six timestamped values under Sessions(10) windows."""
   expected = [('key @ [1.0, 13.0)', [1, 2, 3]),
               ('key @ [20.0, 45.0)', [20, 27, 35])]
   pipeline = TestPipeline()
   keyed = self.timestamped_key_values(pipeline, 'key', 1, 2, 3, 20, 35, 27)
   grouped = keyed | 'w' >> WindowInto(Sessions(10)) | GroupByKey()
   # sort_values is applied before reify_windows, as in the original chain.
   reified = grouped | sort_values | reify_windows
   assert_that(reified, equal_to(expected))
   pipeline.run()
Ejemplo n.º 57
0
  def test_read_bzip2(self):
    """Read a bzip2 file with compression_type set to BZIP2 explicitly."""
    _, lines = write_data(15)
    bz2_path = self._create_temp_file()
    content = '\n'.join(lines)
    with bz2.BZ2File(bz2_path, 'wb') as bz2_out:
      bz2_out.write(content)

    reader = ReadFromText(bz2_path, compression_type=CompressionTypes.BZIP2)
    p = TestPipeline()
    assert_that(p | 'Read' >> reader, equal_to(lines))
    p.run()
Ejemplo n.º 58
0
  def test_read_auto_single_file_gzip(self):
    """Read a '.gz' file through LineSource with CompressionTypes.AUTO."""
    _, lines = write_data(10)
    # Close the temp-file handle before reopening the path with gzip instead
    # of leaking it; delete=False keeps the file on disk after the close.
    with tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template, suffix='.gz') as temp_file:
      filename = temp_file.name
    with gzip.GzipFile(filename, 'wb') as f:
      f.write('\n'.join(lines))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> beam.io.Read(LineSource(
        filename,
        compression_type=CompressionTypes.AUTO))
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
Ejemplo n.º 59
0
  def test_combine_globally_with_default_side_input(self):
    """Use a global sum as a singleton side input, for empty and full input.

    Expected outputs are [0] for the empty input and [10] for [1, 2, 3, 4].
    """
    class CombineWithSideInput(PTransform):
      def expand(self, pcoll):
        total_view = pcoll | CombineGlobally(sum).as_singleton_view()
        placeholder = pcoll.pipeline | Create([None])
        # Ignore the placeholder element and emit the side-input value.
        return placeholder | Map(lambda _, s: s, total_view)

    p = TestPipeline()
    empty_input = p | 'i1' >> Create([])
    from_empty = empty_input | 'c1' >> CombineWithSideInput()
    full_input = p | 'i2' >> Create([1, 2, 3, 4])
    from_full = full_input | 'c2' >> CombineWithSideInput()
    assert_that(from_empty, equal_to([0]), label='r1')
    assert_that(from_full, equal_to([10]), label='r2')
    p.run()