def test_non_liftable_combine(self):
  test_options = PipelineOptions(flags=['--allow_unsafe_triggers'])
  run_combine(
      TestPipeline(runner=self.runner(), options=test_options),
      lift_combiners=False)
  self._assert_teardown_called()

def test_compute_top_sessions(self):
  with TestPipeline() as p:
    edits = p | beam.Create(self.EDITS)
    result = edits | top_wikipedia_sessions.ComputeTopSessions(1.0)
    assert_that(result, equal_to(self.EXPECTED))

def test_forwards_batch_args(self):
  examples = list(range(100))
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'start' >> beam.Create(examples)
    actual = pcoll | base.RunInference(FakeModelHandlerNeedsBigBatch())
    assert_that(actual, equal_to(examples), label='assert:inferences')

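# FakeModelHandlerNeedsBigBatch is defined elsewhere in this test module. The
# sketch below is an assumption about its shape, not the actual implementation:
# a handler that requests a large minimum batch size via batch_elements_kwargs()
# and fails inside run_inference() if that batching hint is not forwarded.
class _FakeModelHandlerNeedsBigBatchSketch(base.ModelHandler):
  def load_model(self):
    return None  # no real model is needed for this sketch

  def batch_elements_kwargs(self):
    # Ask RunInference to batch (nearly) all examples together.
    return {'min_batch_size': 9999}

  def run_inference(self, batch, model, inference_args=None):
    if len(batch) < 100:
      raise ValueError('batching kwargs were not forwarded')
    return batch  # echo the examples back as "predictions"
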
def test_singleton(self):
  with TestPipeline() as p:
    data = [389]
    pc = p | Create(data)
    quantiles = pc | beam.ApproximateQuantiles.Globally(5)
    assert_that(quantiles, equal_to([[389, 389, 389, 389, 389]]))

def setUp(self):
  self.pipeline = TestPipeline(is_integration_test=True)
  self.inputOptions = json.loads(self.pipeline.get_option('input_options'))

def test_combine_globally_with_default(self):
  with TestPipeline() as p:
    assert_that(p | Create([]) | CombineGlobally(sum), equal_to([0]))

def test_globally_empty(self):
  l = []
  with TestPipeline() as p:
    pc = p | Create(l) | Map(lambda x: x)
    latest = pc | combine.Latest.Globally()
    assert_that(latest, equal_to([None]))

def test_tostring_iterables_with_delimeter(self):
  with TestPipeline() as p:
    data = [("one", "two", "three"), ("four", "five", "six")]
    result = (p | beam.Create(data) | util.ToString.Iterables("\t"))
    assert_that(result, equal_to(["one\ttwo\tthree", "four\tfive\tsix"]))

def test_tostring_kvs(self):
  with TestPipeline() as p:
    result = (p | beam.Create([("one", 1), ("two", 2)]) | util.ToString.Kvs())
    assert_that(result, equal_to(["one,1", "two,2"]))

def test_tostring_elements(self):
  with TestPipeline() as p:
    result = (p | beam.Create([1, 1, 2, 3]) | util.ToString.Element())
    assert_that(result, equal_to(["1", "1", "2", "3"]))

def test_tostring_iterables(self):
  with TestPipeline() as p:
    result = (
        p
        | beam.Create([("one", "two", "three"), ("four", "five", "six")])
        | util.ToString.Iterables())
    assert_that(result, equal_to(["one,two,three", "four,five,six"]))

def test_callable_k(self):
  with TestPipeline() as p:
    pc = p | beam.Create(self.l)
    with_keys = pc | util.WithKeys(lambda x: x * x)
    assert_that(with_keys, equal_to([(1, 1), (4, 2), (9, 3)]))

def test_reshuffle_contents_unchanged(self):
  pipeline = TestPipeline()
  data = [(1, 1), (2, 1), (3, 1), (1, 2), (2, 2), (1, 3)]
  result = (pipeline | beam.Create(data) | beam.Reshuffle())
  assert_that(result, equal_to(data))
  pipeline.run()

def test_combining_value_state(self):
  run_pardo(TestPipeline(runner=self.runner()))
  self._assert_teardown_called()

def test_create_transform(self):
  with TestPipeline() as p:
    assert_that(p | Create(range(10)), equal_to(range(10)))

def test_tostring_kvs_delimeter(self):
  with TestPipeline() as p:
    result = (
        p | beam.Create([("one", 1), ("two", 2)]) | util.ToString.Kvs("\t"))
    assert_that(result, equal_to(["one\t1", "two\t2"]))

def test_empty_global_top(self):
  with TestPipeline() as p:
    assert_that(p | beam.Create([]) | combine.Top.Largest(10), equal_to([[]]))

def test_leader_board_users(self):
  test_options = PipelineOptions(flags=['--allow_unsafe_triggers'])
  with TestPipeline(options=test_options) as p:
    result = (
        self.create_data(p)
        | leader_board.CalculateUserScores(allowed_lateness=120))
    assert_that(result, equal_to([]))

def test_combine_globally_without_default(self):
  with TestPipeline() as p:
    result = p | Create([]) | CombineGlobally(sum).without_defaults()
    assert_that(result, equal_to([]))

def test_create_singleton_pcollection(self):
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'label' >> Create([[1, 2, 3]])
    assert_that(pcoll, equal_to([[1, 2, 3]]))

def test_per_key_empty(self):
  l = []
  with TestPipeline() as p:
    pc = p | Create(l) | Map(lambda x: x)
    latest = pc | combine.Latest.PerKey()
    assert_that(latest, equal_to([]))

def test_fake_read(self):
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'read' >> Read(FakeSource([1, 2, 3]))
    assert_that(pcoll, equal_to([1, 2, 3]))

def test_log_distribution(self):
  with TestPipeline() as p:
    # int(log(x)) for x in [1, 1000) is heavily skewed towards its largest
    # values, so every quantile above the minimum lands on 5 or 6.
    data = [int(math.log(x)) for x in range(1, 1000)]
    pc = p | Create(data)
    quantiles = pc | beam.ApproximateQuantiles.Globally(5)
    assert_that(quantiles, equal_to([[0, 5, 6, 6, 6]]))

def test_apply_custom_transform(self):
  with TestPipeline() as pipeline:
    pcoll = pipeline | 'pcoll' >> Create([1, 2, 3])
    result = pcoll | PipelineTest.CustomTransform()
    assert_that(result, equal_to([2, 3, 4]))

def test_output():
  options = PipelineOptions()
  options.view_as(StandardOptions).streaming = True
  test_pipeline = TestPipeline(options=options)

  def ts(second):
    # Event/watermark times on 2021-03-01, expressed as UTC epoch seconds.
    return datetime(2021, 3, 1, 0, 0, second, 0, tzinfo=pytz.UTC).timestamp()

  # One "event" per second from 00:00:01 to 00:00:20, with the watermark
  # advanced at every 5-second boundary and finally to 00:00:25.
  test_stream = TestStream()
  for second in range(1, 21):
    if second % 5 == 0:
      test_stream = test_stream.advance_watermark_to(ts(second))
    test_stream = test_stream.add_elements(
        elements=["event"], event_timestamp=ts(second))
  test_stream = test_stream.advance_watermark_to(
      ts(25)).advance_watermark_to_infinity()

  events = test_pipeline | test_stream
  results = apply_transform(events)

  # Expected element counts per 5-second window.
  answers = {
      window.IntervalWindow(ts(0), ts(5)): [4],
      window.IntervalWindow(ts(5), ts(10)): [5],
      window.IntervalWindow(ts(10), ts(15)): [5],
      window.IntervalWindow(ts(15), ts(20)): [5],
      window.IntervalWindow(ts(20), ts(25)): [1],
  }

  assert_that(
      results,
      equal_to_per_window(answers),
      label='count assert per window')

  test_pipeline.run()

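# `apply_transform` is defined elsewhere in this module. The expected answers
# above only make sense if it buckets the events into 5-second fixed windows
# and counts the elements in each window; the sketch below is an assumption
# about its shape, not the actual helper.
def _apply_transform_sketch(events):
  return (
      events
      | beam.WindowInto(beam.window.FixedWindows(5))
      | beam.CombineGlobally(
          beam.combiners.CountCombineFn()).without_defaults())
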
def test_ptransform_override_multiple_outputs(self):
  class MultiOutputComposite(PTransform):
    def __init__(self):
      self.output_tags = set()

    def expand(self, pcoll):
      def mux_input(x):
        x = x * 2
        if isinstance(x, int):
          yield TaggedOutput('numbers', x)
        else:
          yield TaggedOutput('letters', x)

      multi = pcoll | 'MyReplacement' >> beam.ParDo(mux_input).with_outputs()
      letters = multi.letters | 'LettersComposite' >> beam.Map(lambda x: x * 3)
      numbers = multi.numbers | 'NumbersComposite' >> beam.Map(lambda x: x * 5)
      return {
          'letters': letters,
          'numbers': numbers,
      }

  class MultiOutputOverride(PTransformOverride):
    def matches(self, applied_ptransform):
      return applied_ptransform.full_label == 'MyMultiOutput'

    def get_replacement_transform(self, ptransform):
      return MultiOutputComposite()

  def mux_input(x):
    if isinstance(x, int):
      yield TaggedOutput('numbers', x)
    else:
      yield TaggedOutput('letters', x)

  with TestPipeline() as p:
    multi = (
        p
        | beam.Create([1, 2, 3, 'a', 'b', 'c'])
        | 'MyMultiOutput' >> beam.ParDo(mux_input).with_outputs())
    letters = multi.letters | 'MyLetters' >> beam.Map(lambda x: x)
    numbers = multi.numbers | 'MyNumbers' >> beam.Map(lambda x: x)

    # Assert that the PCollection replacement worked correctly and that
    # elements are flowing through. The replacement transform first
    # multiplies by 2, then the leaf nodes inside the composite multiply by
    # an additional 3 and 5. Use prime numbers to ensure that each
    # transform is getting executed once.
    assert_that(
        letters,
        equal_to(['a' * 2 * 3, 'b' * 2 * 3, 'c' * 2 * 3]),
        label='assert letters')
    assert_that(
        numbers,
        equal_to([1 * 2 * 5, 2 * 2 * 5, 3 * 2 * 5]),
        label='assert numbers')

    # Do the replacement and run the element assertions.
    p.replace_all([MultiOutputOverride()])

    # The following checks the graph to make sure the replacement occurred.
    visitor = PipelineTest.Visitor(visited=[])
    p.visit(visitor)
    pcollections = visitor.visited
    composites = visitor.enter_composite

    # Assert the replacement is in the composite list and retrieve the
    # AppliedPTransform.
    self.assertIn(
        MultiOutputComposite, [t.transform.__class__ for t in composites])
    multi_output_composite = list(
        filter(
            lambda t: t.transform.__class__ == MultiOutputComposite,
            composites))[0]

    # Assert that all of the replacement PCollections are in the graph.
    for output in multi_output_composite.outputs.values():
      self.assertIn(output, pcollections)

    # Assert that all of the "old"/replaced PCollections are not in the graph.
    self.assertNotIn(multi[None], visitor.visited)
    self.assertNotIn(multi.letters, visitor.visited)
    self.assertNotIn(multi.numbers, visitor.visited)

def test_multiple_outputs_with_watermark_advancement(self):
  """Tests that the TestStream can independently control output watermarks."""

  # Purposely set the watermark of numbers to 20 then letters to 5 to test
  # that the watermark advancement is per PCollection.
  #
  # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
  # emitted at different times so that they will have different windows. The
  # watermark advancement is checked by checking their windows. If the
  # watermark does not advance, then the windows will be [-inf, -inf). If the
  # windows do not advance separately, then the PCollections will both be
  # windowed in [15, 30).
  letters_elements = [
      TimestampedValue('a', 6),
      TimestampedValue('b', 7),
      TimestampedValue('c', 8),
  ]
  numbers_elements = [
      TimestampedValue('1', 21),
      TimestampedValue('2', 22),
      TimestampedValue('3', 23),
  ]
  test_stream = (
      TestStream().advance_watermark_to(0, tag='letters')
      .advance_watermark_to(0, tag='numbers')
      .advance_watermark_to(20, tag='numbers')
      .advance_watermark_to(5, tag='letters')
      .add_elements(letters_elements, tag='letters')
      .advance_watermark_to(10, tag='letters')
      .add_elements(numbers_elements, tag='numbers')
      .advance_watermark_to(30, tag='numbers'))

  options = StandardOptions(streaming=True)
  p = TestPipeline(options=options)

  main = p | test_stream

  # Use an AfterWatermark trigger with an early firing to test that the
  # watermark is advancing properly and that the element is being emitted in
  # the correct window.
  letters = (
      main['letters']
      | 'letter windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'letter with key' >> beam.Map(lambda x: ('k', x))
      | 'letter gbk' >> beam.GroupByKey())

  numbers = (
      main['numbers']
      | 'number windows' >> beam.WindowInto(
          FixedWindows(15),
          trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
          accumulation_mode=trigger.AccumulationMode.DISCARDING)
      | 'number with key' >> beam.Map(lambda x: ('k', x))
      | 'number gbk' >> beam.GroupByKey())

  # The letters were emitted when the watermark was at 5, thus we expect to
  # see the elements in the [0, 15) window. We used an early trigger to make
  # sure that the ON_TIME empty pane was also emitted with a TestStream.
  # This pane has no data because the early trigger causes the elements to
  # fire before the end of the window and because the accumulation mode
  # discards any data after the trigger fired.
  expected_letters = {
      window.IntervalWindow(0, 15): [
          ('k', ['a', 'b', 'c']),
          ('k', []),
      ],
  }

  # Same here, except the numbers were emitted at watermark = 20, thus they
  # are in the [15, 30) window.
  expected_numbers = {
      window.IntervalWindow(15, 30): [
          ('k', ['1', '2', '3']),
          ('k', []),
      ],
  }

  assert_that(
      letters,
      equal_to_per_window(expected_letters),
      label='letters assert per window')
  assert_that(
      numbers,
      equal_to_per_window(expected_numbers),
      label='numbers assert per window')

  p.run()

def test_timestamp_param_map(self):
  with TestPipeline() as p:
    # Elements produced by Create carry the default MIN_TIMESTAMP.
    assert_that(
        p | Create([1, 2]) | beam.Map(lambda _, t=DoFn.TimestampParam: t),
        equal_to([MIN_TIMESTAMP, MIN_TIMESTAMP]))

def test_constant_k(self):
  with TestPipeline() as p:
    pc = p | beam.Create(self.l)
    with_keys = pc | util.WithKeys('k')
    assert_that(with_keys, equal_to([('k', 1), ('k', 2), ('k', 3)]))

def test_combine(self):
  run_combine(TestPipeline(runner=self.runner()))
  self._assert_teardown_called()