def test_timestamped_with_combiners(self):
    """Check that per-key combiners are evaluated per key and per window.

    The values 0..9 are stamped with their own value as timestamp under a
    FixedWindows(5) windowing function and keyed by ``v // 5``; the per-key
    sum and mean are then asserted.
    """
    p = TestPipeline()
    result = (
        p
        # Create some initial test values.
        | 'start' >> Create([(k, k) for k in range(10)])
        # The purpose of the WindowInto transform is to establish a
        # FixedWindows windowing function for the PCollection.
        # It does not bucket elements into windows since the timestamps
        # from Create are not spaced 5 ms apart and very likely they all
        # fall into the same window.
        | 'w' >> WindowInto(FixedWindows(5))
        # Generate timestamped values using the values as timestamps.
        # Now there are values 5 ms apart and since Map propagates the
        # windowing function from input to output the output PCollection
        # will have elements falling into different 5ms windows.
        # The pair is indexed rather than unpacked in the signature because
        # ``lambda (x, t): ...`` is a syntax error on Python 3.
        | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
        # We add a 'key' to each value representing the index of the
        # window. This is important since there is no guarantee of
        # order for the elements of a PCollection.
        # Floor division keeps the key an integer on Python 2 and 3.
        | Map(lambda v: (v // 5, v)))

    # Sum all elements associated with a key and window. Although it
    # is called CombinePerKey it is really CombinePerKeyAndWindow the
    # same way GroupByKey is really GroupByKeyAndWindow.
    sum_per_window = result | CombinePerKey(sum)
    # Compute mean per key and window.
    mean_per_window = result | combiners.Mean.PerKey()

    assert_that(sum_per_window, equal_to([(0, 10), (1, 35)]),
                label='assert:sum')
    assert_that(mean_per_window, equal_to([(0, 2.0), (1, 7.0)]),
                label='assert:mean')
    p.run()
def test_dataflow_single_file(self):
    """Read the single file produced by write_data(5) and check its lines."""
    path, lines = write_data(5)
    assert len(lines) == 5

    pipeline = TestPipeline()
    read_lines = pipeline | 'Read' >> ReadFromText(path)
    assert_that(read_lines, equal_to(lines))
    pipeline.run()
def run_pipeline(self, count_implementation, factor=1):
    """Apply ``count_implementation`` to a fixed word list and check counts.

    Args:
      count_implementation: the transform applied to the PCollection of words.
      factor: multiplier applied to the expected counts (3 for 'CAT' and
        2 for 'DOG').
    """
    p = TestPipeline()
    words = p | beam.Create(['CAT', 'DOG', 'CAT', 'CAT', 'DOG'])
    counted = words | count_implementation

    expected_counts = [('CAT', (3 * factor)), ('DOG', (2 * factor))]
    assert_that(counted, equal_to(expected_counts))
    p.run()
def test_to_list_and_to_dict(self):
    """Check the contents produced by combine.ToList and combine.ToDict."""
    # --- ToList: the single output list must hold the input elements.
    pipeline = TestPipeline()
    the_list = [6, 3, 1, 1, 9, 1, 5, 2, 0, 6]
    pcoll = pipeline | 'start' >> Create(the_list)
    result = pcoll | 'to list' >> combine.ToList()

    def list_matcher(expected):
        def match(actual):
            # Compare via equal_to so element order inside the list
            # does not matter.
            equal_to(expected[0])(actual[0])
        return match

    assert_that(result, list_matcher([the_list]))
    pipeline.run()

    # --- ToDict: exactly one dict, whose items are the input pairs.
    pipeline = TestPipeline()
    pairs = [(1, 2), (3, 4), (5, 6)]
    pcoll = pipeline | 'start-pairs' >> Create(pairs)
    result = pcoll | 'to dict' >> combine.ToDict()

    def dict_matcher():
        def match(actual):
            equal_to([1])([len(actual)])
            # items() exists on both Python 2 and 3; iteritems() is
            # Python 2-only.
            equal_to(pairs)(actual[0].items())
        return match

    assert_that(result, dict_matcher())
    pipeline.run()
def test_dataflow_file_pattern(self):
    """Read every file matched by a pattern and check all lines come back."""
    lines_per_file = [5, 3, 12, 8, 8, 4]
    pattern, expected_data = write_pattern(lines_per_file)
    # 5 + 3 + 12 + 8 + 8 + 4 lines in total.
    assert len(expected_data) == 40

    pipeline = TestPipeline()
    read_lines = pipeline | 'Read' >> ReadFromText(pattern)
    assert_that(read_lines, equal_to(expected_data))
    pipeline.run()
def test_run_direct(self):
    """Read a four-line temp file through LineSource in a pipeline."""
    path = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
    source = LineSource(path)

    pipeline = TestPipeline()
    lines = pipeline | beam.io.Read(source)
    assert_that(lines, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))
    pipeline.run()
def test_pardo(self):
    """Chain two Map transforms and check the combined result."""
    with self.create_pipeline() as p:
        strings = p | beam.Create(['a', 'bc'])
        # Each lambda stays on its own line.
        doubled = strings | beam.Map(lambda e: e * 2)
        suffixed = doubled | beam.Map(lambda e: e + 'x')
        assert_that(suffixed, equal_to(['aax', 'bcbcx']))
def test_group_by_key(self):
    """Group keyed values and check each key's (sorted) value list."""
    with self.create_pipeline() as p:
        res = (
            p
            | beam.Create([('a', 1), ('a', 2), ('b', 3)])
            | beam.GroupByKey()
            # Sort the grouped values so the expectation is deterministic.
            # The pair is indexed because ``lambda (k, vs): ...`` is a
            # syntax error on Python 3.
            | beam.Map(lambda k_vs: (k_vs[0], sorted(k_vs[1]))))
        assert_that(res, equal_to([('a', [1, 2]), ('b', [3])]))
def test_read_gzip_empty_file(self):
    """Reading an empty file with GZIP compression should yield nothing."""
    empty_path = self._create_temp_file()
    read_gzip = ReadFromText(
        empty_path,
        0,
        CompressionTypes.GZIP,
        True,
        coders.StrUtf8Coder())

    pipeline = TestPipeline()
    lines = pipeline | 'Read' >> read_gzip
    assert_that(lines, equal_to([]))
    pipeline.run()
def test_read(self):
    """Write three lines to a temp file and read them back with ReadFromText."""
    with tempfile.NamedTemporaryFile() as temp_file:
        temp_file.write('a\nb\nc')
        # Flush so the content is on disk before the pipeline reads it.
        temp_file.flush()
        # The pipeline must run while the temp file still exists, so it
        # stays nested inside the outer ``with``.
        with self.create_pipeline() as p:
            lines = p | beam.io.ReadFromText(temp_file.name)
            assert_that(lines, equal_to(['a', 'b', 'c']))
def test_compute_points(self):
    """Sum the points emitted by coders.compute_points per key."""
    expected_totals = [('Italy', 0), ('Brasil', 6), ('Germany', 3)]

    p = TestPipeline()
    records = p | 'create' >> beam.Create(self.SAMPLE_RECORDS)
    points = records | 'points' >> beam.FlatMap(coders.compute_points)
    totals = points | beam.CombinePerKey(sum)
    assert_that(totals, equal_to(expected_totals))
    p.run()
def test_default_value_singleton_side_input(self):
    """An empty AsSingleton side input should fall back to its default (10)."""
    pipeline = self.create_pipeline()
    main = pipeline | 'start' >> beam.Create([1, 2])
    empty_side = pipeline | 'side' >> beam.Create([])  # 0 values in side input.

    singleton_view = beam.pvalue.AsSingleton(empty_side, 10)
    products = main | beam.FlatMap(lambda x, s: [x * s], singleton_view)
    assert_that(products, equal_to([10, 20]))
    pipeline.run()
def test_basics(self):
    """Run EstimatePiTransform and check the estimate lies near pi."""
    p = TestPipeline()
    estimate = estimate_pi.EstimatePiTransform(5000)
    result = p | 'Estimate' >> estimate

    # Note: Probabilistically speaking this test can fail with a probability
    # that is very small (VERY) given that we run at least 500 thousand trials.
    lower_bound, upper_bound = 3.125, 3.155
    assert_that(result, in_between(lower_bound, upper_bound))
    p.run()
def test_read_gzip_empty_file(self):
    """Reading an empty file with GZIP compression should yield nothing.

    The temporary file is created with ``delete=False`` so it can be reopened
    by the pipeline; its handle is closed immediately and the file is removed
    once the pipeline has run.
    """
    import os

    # Close the handle right away: only the (empty) file on disk is needed.
    with tempfile.NamedTemporaryFile(
            delete=False, prefix=tempfile.template) as handle:
        filename = handle.name
    try:
        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> ReadFromText(
            filename,
            0, CompressionTypes.GZIP,
            True, coders.StrUtf8Coder())
        assert_that(pcoll, equal_to([]))
        pipeline.run()
    finally:
        # delete=False means nobody else cleans this file up.
        os.remove(filename)
def test_tuple_combine_fn(self):
    """Combine each tuple position with its own function: max, mean, sum."""
    rows = [('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)]
    per_position = combine.TupleCombineFn(max, combine.MeanCombineFn(), sum)

    p = TestPipeline()
    result = (
        p
        | Create(rows)
        | beam.CombineGlobally(per_position).without_defaults())
    assert_that(result, equal_to([('c', 111.0 / 3, 99.0)]))
    p.run()
def test_element(self):
    """A DoFn whose process() takes only the element sees each input value."""

    class TestDoFn(DoFn):
        def process(self, element):
            yield element + 10

    pipeline = TestPipeline()
    numbers = pipeline | 'Create' >> Create([1, 2])
    shifted = numbers | 'Do' >> ParDo(TestDoFn())
    assert_that(shifted, equal_to([11, 12]))
    pipeline.run()
def test_iterable_side_input(self):
    """Multiply each main-input element by every element of an AsIter side input."""
    pipeline = self.create_pipeline()
    main = pipeline | 'start' >> beam.Create([1, 2])
    factors = pipeline | 'side' >> beam.Create([3, 4])  # 2 values in side input.

    side_view = beam.pvalue.AsIter(factors)
    products = main | 'compute' >> beam.FlatMap(
        lambda x, s: [x * y for y in s], side_view)
    assert_that(products, equal_to([3, 4, 6, 8]))
    pipeline.run()
def test_context_param(self):
    """A DoFn can read the current element through DoFn.ContextParam."""

    class TestDoFn(DoFn):
        def process(self, element, context=DoFn.ContextParam):
            yield context.element + 10

    pipeline = TestPipeline()
    numbers = pipeline | 'Create' >> Create([1, 2])
    shifted = numbers | 'Do' >> ParDo(TestDoFn())
    assert_that(shifted, equal_to([11, 12]))
    pipeline.run()
def test_timestamp_param(self):
    """Elements from Create are expected to carry MIN_TIMESTAMP."""

    class TestDoFn(DoFn):
        def process(self, element, timestamp=DoFn.TimestampParam):
            yield timestamp

    pipeline = TestPipeline()
    numbers = pipeline | 'Create' >> Create([1, 2])
    timestamps = numbers | 'Do' >> ParDo(TestDoFn())
    assert_that(timestamps, equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))
    pipeline.run()
def test_windowing(self):
    """Group timestamped values under Sessions(10) windowing.

    The values 1, 2 and 100, 101, 102 share the key 'k' but are expected to
    end up in two separate groups.
    """
    with self.create_pipeline() as p:
        res = (
            p
            | beam.Create([1, 2, 100, 101, 102])
            # Use each value as its own timestamp.
            | beam.Map(lambda t: TimestampedValue(('k', t), t))
            | beam.WindowInto(beam.transforms.window.Sessions(10))
            | beam.GroupByKey()
            # Sort the grouped values so the expectation is deterministic.
            # The pair is indexed because ``lambda (k, vs): ...`` is a
            # syntax error on Python 3.
            | beam.Map(lambda k_vs: (k_vs[0], sorted(k_vs[1]))))
        assert_that(res, equal_to([('k', [1, 2]), ('k', [100, 101, 102])]))
def test_run_concat_direct(self):
    """Read a ConcatSource of three adjacent ranges covering 0..999."""
    sub_sources = [
        RangeSource(0, 10),
        RangeSource(10, 100),
        RangeSource(100, 1000),
    ]
    source = ConcatSource(sub_sources)

    pipeline = TestPipeline()
    numbers = pipeline | beam.Read(source)
    assert_that(numbers, equal_to(range(1000)))
    pipeline.run()
def test_reuse_cloned_custom_transform_instance(self):
    """Apply one CustomTransform instance twice, the second time relabelled."""
    pipeline = TestPipeline()
    first_input = pipeline | 'pc1' >> Create([1, 2, 3])
    second_input = pipeline | 'pc2' >> Create([4, 5, 6])

    transform = PipelineTest.CustomTransform()
    first_output = first_input | transform
    # The second application of the same instance gets a new label.
    second_output = second_input | 'new_label' >> transform

    assert_that(first_output, equal_to([2, 3, 4]), label='r1')
    assert_that(second_output, equal_to([5, 6, 7]), label='r2')
    pipeline.run()
def test_metrics_in_source(self):
    """Check the first counter reported after reading from FakeSource."""
    values = [1, 2, 3, 4, 5, 6]

    pipeline = TestPipeline()
    pcoll = pipeline | Read(FakeSource([1, 2, 3, 4, 5, 6]))
    assert_that(pcoll, equal_to(values))
    res = pipeline.run()

    counters = res.metrics().query()['counters']
    outputs_counter = counters[0]
    # The counter is expected to belong to the 'Read' step, be named
    # 'outputs', and have a committed value of 6.
    self.assertEqual(outputs_counter.key.step, 'Read')
    self.assertEqual(outputs_counter.key.metric.name, 'outputs')
    self.assertEqual(outputs_counter.committed, 6)
def test_create(self):
    """Create should accept both a list and an iterator as initial values."""
    pipeline = TestPipeline()

    from_list = pipeline | 'label1' >> Create([1, 2, 3])
    assert_that(from_list, equal_to([1, 2, 3]))

    # Test if initial value is an iterator object.
    from_iterator = pipeline | 'label2' >> Create(iter((4, 5, 6)))
    shifted = from_iterator | 'do' >> FlatMap(lambda x: [x + 10])
    assert_that(shifted, equal_to([14, 15, 16]), label='pcoll3')

    pipeline.run()
def test_flattened_side_input(self):
    """Use the Flatten of two PCollections as an AsList side input."""
    pipeline = self.create_pipeline()
    main_input = pipeline | 'main input' >> beam.Create([None])

    side_a = pipeline | 'side1' >> beam.Create(['a'])
    side_b = pipeline | 'side2' >> beam.Create(['b'])
    flattened = (side_a, side_b) | beam.Flatten()

    # The main element is ignored; the side-input list is emitted as-is.
    results = main_input | beam.FlatMap(
        lambda _, ab: ab, beam.pvalue.AsList(flattened))
    assert_that(results, equal_to(['a', 'b']))
    pipeline.run()
def test_read_auto_bzip2(self):
    """Read a '.bz2' file without passing a compression type explicitly."""
    _, lines = write_data(15)
    path = self._create_temp_file(suffix='.bz2')
    payload = '\n'.join(lines)
    with bz2.BZ2File(path, 'wb') as compressed:
        compressed.write(payload)

    pipeline = TestPipeline()
    read_lines = pipeline | 'Read' >> ReadFromText(path)
    assert_that(read_lines, equal_to(lines))
    pipeline.run()
def test_tuple_combine_fn(self):
    """TupleCombineFn applies max, mean and sum to the respective positions."""
    p = TestPipeline()
    rows = p | Create([('a', 100, 0.0), ('b', 10, -1), ('c', 1, 100)])

    combine_fn = combine.TupleCombineFn(max, combine.MeanCombineFn(), sum)
    combined = rows | beam.CombineGlobally(combine_fn).without_defaults()

    expected = [('c', 111.0 / 3, 99.0)]
    assert_that(combined, equal_to(expected))
    p.run()
def test_sink_transform(self):
    """Write self.RECORDS with WriteToAvro and read them back with ReadFromAvro."""
    with tempfile.NamedTemporaryFile() as dst:
        path = dst.name

        with TestPipeline() as p:
            records = p | beam.Create(self.RECORDS)
            # pylint: disable=expression-not-assigned
            records | avroio.WriteToAvro(path, self.SCHEMA)

        with TestPipeline() as p:
            # json used for stable sortability
            expected = [json.dumps(r) for r in self.RECORDS]
            readback = p | avroio.ReadFromAvro(path + '*') | beam.Map(json.dumps)
            assert_that(readback, equal_to(expected))
def test_tuple_combine_fn_without_defaults(self):
    """With with_common_input(), min, mean and max are all taken over one input."""
    combine_fn = combine.TupleCombineFn(
        min, combine.MeanCombineFn(), max).with_common_input()

    p = TestPipeline()
    numbers = p | Create([1, 1, 2, 3])
    combined = numbers | beam.CombineGlobally(combine_fn).without_defaults()
    assert_that(combined, equal_to([(1, 7.0 / 4, 3)]))
    p.run()
def test_tuple_combine_fn_without_defaults(self):
    """Compute (min, mean, max) over the same input via with_common_input()."""
    p = TestPipeline()

    stats_fn = combine.TupleCombineFn(min, combine.MeanCombineFn(), max)
    global_stats = beam.CombineGlobally(
        stats_fn.with_common_input()).without_defaults()

    result = p | Create([1, 1, 2, 3]) | global_stats
    expected = [(1, 7.0 / 4, 3)]
    assert_that(result, equal_to(expected))
    p.run()
def test_read_gzip(self):
    """Write 15 lines into a gzip file and read them back with GZIP set."""
    _, lines = write_data(15)
    path = self._create_temp_file()
    payload = '\n'.join(lines)
    with gzip.GzipFile(path, 'wb') as compressed:
        compressed.write(payload)

    read_gzip = ReadFromText(
        path,
        0,
        CompressionTypes.GZIP,
        True,
        coders.StrUtf8Coder())

    pipeline = TestPipeline()
    read_lines = pipeline | 'Read' >> read_gzip
    assert_that(read_lines, equal_to(lines))
    pipeline.run()
def test_timestamped_value(self):
    """Values stamped with themselves are grouped into FixedWindows(5) buckets.

    The values 0..9 share the key 'key' and are expected to come out of
    GroupByKey as two groups: 0..4 and 5..9.
    """
    p = TestPipeline()
    result = (
        p
        | 'start' >> Create([(k, k) for k in range(10)])
        # The pair is indexed because ``lambda (x, t): ...`` is a syntax
        # error on Python 3.
        | Map(lambda x_t: TimestampedValue(x_t[0], x_t[1]))
        | 'w' >> WindowInto(FixedWindows(5))
        | Map(lambda v: ('key', v))
        | GroupByKey())
    assert_that(result, equal_to([('key', [0, 1, 2, 3, 4]),
                                  ('key', [5, 6, 7, 8, 9])]))
    p.run()
def test_global_sample(self):
    """Take nine independent size-3 samples of [1, 1, 2, 2] and check each."""

    def is_good_sample(actual):
        # Exactly one sample list is expected, and three elements drawn
        # from [1, 1, 2, 2] can only be one of these two multisets.
        assert len(actual) == 1
        assert sorted(actual[0]) in [[1, 1, 2], [1, 2, 2]], actual

    with TestPipeline() as pipeline:
        pcoll = pipeline | 'start' >> Create([1, 1, 2, 2])
        # range() rather than the Python 2-only xrange(); for nine
        # iterations the two are equivalent.
        for ix in range(9):
            assert_that(
                pcoll | 'sample-%d' % ix >> combine.Sample.FixedSizeGlobally(3),
                is_good_sample,
                label='check-%d' % ix)
def test_read_gzip_with_skip_lines(self):
    """Read a gzip file with skip_header_lines=2 and expect lines[2:]."""
    _, lines = write_data(15)
    path = self._create_temp_file()
    payload = '\n'.join(lines)
    with gzip.GzipFile(path, 'wb') as compressed:
        compressed.write(payload)

    read_gzip = ReadFromText(
        path,
        0,
        CompressionTypes.GZIP,
        True,
        coders.StrUtf8Coder(),
        skip_header_lines=2)

    pipeline = TestPipeline()
    read_lines = pipeline | 'Read' >> read_gzip
    assert_that(read_lines, equal_to(lines[2:]))
    pipeline.run()
def test_flattened_side_input(self):
    """A Flatten of two single-element PCollections works as an AsList side input."""
    pipeline = self.create_pipeline()
    main_input = pipeline | 'main input' >> beam.Create([None])

    first_side = pipeline | 'side1' >> beam.Create(['a'])
    second_side = pipeline | 'side2' >> beam.Create(['b'])
    side_input = (first_side, second_side) | beam.Flatten()
    side_view = beam.pvalue.AsList(side_input)

    # The main element is ignored; the side-input list is emitted as-is.
    results = main_input | beam.FlatMap(lambda _, ab: ab, side_view)
    assert_that(results, equal_to(['a', 'b']))
    pipeline.run()
def test_sliding_windows(self):
    """Group timestamped values 1, 2, 3 under SlidingWindows(period=2, size=4)."""
    expected = [
        ('key @ [-2.0, 2.0)', [1]),
        ('key @ [0.0, 4.0)', [1, 2, 3]),
        ('key @ [2.0, 6.0)', [2, 3]),
    ]

    p = TestPipeline()
    pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3)
    windowed = pcoll | 'w' >> WindowInto(SlidingWindows(period=2, size=4))
    result = windowed | GroupByKey() | reify_windows
    assert_that(result, equal_to(expected))
    p.run()
def test_sessions(self):
    """Group timestamped values under Sessions(10) and check the two sessions."""
    expected = [
        ('key @ [1.0, 13.0)', [1, 2, 3]),
        ('key @ [20.0, 45.0)', [20, 27, 35]),
    ]

    p = TestPipeline()
    pcoll = self.timestamped_key_values(p, 'key', 1, 2, 3, 20, 35, 27)
    windowed = pcoll | 'w' >> WindowInto(Sessions(10))
    grouped = windowed | GroupByKey()
    result = grouped | sort_values | reify_windows
    assert_that(result, equal_to(expected))
    p.run()
def test_read_bzip2(self):
    """Write 15 lines into a bzip2 file and read them back with BZIP2 set."""
    _, lines = write_data(15)
    path = self._create_temp_file()
    payload = '\n'.join(lines)
    with bz2.BZ2File(path, 'wb') as compressed:
        compressed.write(payload)

    read_bzip2 = ReadFromText(path, compression_type=CompressionTypes.BZIP2)

    pipeline = TestPipeline()
    read_lines = pipeline | 'Read' >> read_bzip2
    assert_that(read_lines, equal_to(lines))
    pipeline.run()
def test_read_auto_single_file_gzip(self):
    """Read a '.gz' file through LineSource with CompressionTypes.AUTO.

    The temporary file is created with ``delete=False`` so it can be reopened
    for writing and by the pipeline; its original handle is closed
    immediately and the file is removed once the pipeline has run.
    """
    import os

    _, lines = write_data(10)
    # Only the path is needed; close the handle instead of leaking it.
    with tempfile.NamedTemporaryFile(
            delete=False, prefix=tempfile.template, suffix='.gz') as handle:
        filename = handle.name
    try:
        with gzip.GzipFile(filename, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> beam.io.Read(LineSource(
            filename,
            compression_type=CompressionTypes.AUTO))
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
    finally:
        # delete=False means nobody else cleans this file up.
        os.remove(filename)
def test_combine_globally_with_default_side_input(self):
    """A global sum used as a singleton side input, for empty and non-empty input.

    The empty input is expected to produce 0 and [1, 2, 3, 4] to produce 10.
    """

    class CombineWithSideInput(PTransform):
        def expand(self, pcoll):
            # Global sum exposed as a singleton view, then emitted once
            # through a one-element main input.
            side = pcoll | CombineGlobally(sum).as_singleton_view()
            main = pcoll.pipeline | Create([None])
            return main | Map(lambda _, s: s, side)

    p = TestPipeline()
    empty_input = p | 'i1' >> Create([])
    result1 = empty_input | 'c1' >> CombineWithSideInput()
    filled_input = p | 'i2' >> Create([1, 2, 3, 4])
    result2 = filled_input | 'c2' >> CombineWithSideInput()

    assert_that(result1, equal_to([0]), label='r1')
    assert_that(result2, equal_to([10]), label='r2')
    p.run()