Example no. 1
  def test_param_windowed_value_coder(self):
    from apache_beam.transforms.window import IntervalWindow
    from apache_beam.utils.windowed_value import PaneInfo
    wv = windowed_value.create(
        b'',
        # Milliseconds to microseconds
        1000 * 1000,
        (IntervalWindow(11, 21),),
        PaneInfo(True, False, 1, 2, 3))
    windowed_value_coder = coders.WindowedValueCoder(
        coders.BytesCoder(), coders.IntervalWindowCoder())
    payload = windowed_value_coder.encode(wv)
    coder = coders.ParamWindowedValueCoder(
        payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])

    # Test binary representation
    self.assertEqual(b'\x01',
                     coder.encode(window.GlobalWindows.windowed_value(1)))

    # Test unnested
    self.check_coder(
        coders.ParamWindowedValueCoder(
            payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
        windowed_value.WindowedValue(
            3,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
        windowed_value.WindowedValue(
            1,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)))

    # Test nested
    self.check_coder(
        coders.TupleCoder((
            coders.ParamWindowedValueCoder(
                payload, [
                    coders.FloatCoder(),
                    coders.IntervalWindowCoder()]),
            coders.ParamWindowedValueCoder(
                payload, [
                    coders.StrUtf8Coder(),
                    coders.IntervalWindowCoder()]))),
        (windowed_value.WindowedValue(
            1.5,
            1,
            (window.IntervalWindow(11, 21),),
            PaneInfo(True, False, 1, 2, 3)),
         windowed_value.WindowedValue(
             "abc",
             1,
             (window.IntervalWindow(11, 21),),
             PaneInfo(True, False, 1, 2, 3))))
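For reference, the positional PaneInfo arguments used throughout these examples map to (is_first, is_last, timing, index, nonspeculative_index). A minimal sketch, assuming the PaneInfoTiming integer values of current Beam sources (EARLY=0, ON_TIME=1, LATE=2, UNKNOWN=3):

from apache_beam.utils.windowed_value import PaneInfo, PaneInfoTiming

# The same pane as PaneInfo(True, False, 1, 2, 3) above, spelled with keywords.
pane = PaneInfo(
    is_first=True,
    is_last=False,
    timing=PaneInfoTiming.ON_TIME,  # integer value 1
    index=2,
    nonspeculative_index=3)

assert pane.is_first and not pane.is_last
assert pane.index == 2 and pane.nonspeculative_index == 3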
Example no. 2
    def test_windowed_values_interpreted_correctly(self):
        windowed_value = WindowedValueHolder(
            WindowedValue('a', Timestamp(5),
                          [beam.window.IntervalWindow(5, 10)],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)))
        test_stream = (TestStream()
                       .advance_processing_time(10)
                       .advance_watermark_to(10)
                       .add_elements([windowed_value])
                       .advance_watermark_to_infinity())  # yapf: disable

        class RecordFn(beam.DoFn):
            def process(self,
                        element=beam.DoFn.ElementParam,
                        timestamp=beam.DoFn.TimestampParam,
                        window=beam.DoFn.WindowParam):
                yield (element, timestamp, window)

        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            my_record_fn = RecordFn()
            records = p | test_stream | beam.ParDo(my_record_fn)

            assert_that(
                records,
                equal_to([
                    ('a', timestamp.Timestamp(5),
                     beam.window.IntervalWindow(5, 10)),
                ]))
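The WindowedValueHolder above lets TestStream emit an element with an explicit timestamp, window, and pane instead of the defaults. A minimal sketch, constructing and inspecting the same WindowedValue by hand:

from apache_beam.transforms.window import IntervalWindow
from apache_beam.utils.timestamp import Timestamp
from apache_beam.utils.windowed_value import (
    PaneInfo, PaneInfoTiming, WindowedValue)

wv = WindowedValue(
    'a',
    Timestamp(5),
    [IntervalWindow(5, 10)],
    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))

print(wv.value)      # 'a'
print(wv.timestamp)  # Timestamp(5)
print(wv.windows)    # [IntervalWindow(5, 10)]
print(wv.pane_info)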
Example no. 3

  def test_pane_info_formatter(self):
    self.assertEqual(
        'Pane 0: Final Early',
        pv.pane_info_formatter(
            PaneInfo(is_first=False,
                     is_last=True,
                     timing=PaneInfoTiming.EARLY,
                     index=0,
                     nonspeculative_index=0)))
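Inferred only from the expected string above, a hypothetical re-implementation of such a formatter could look like the sketch below. The real pv.pane_info_formatter lives in Beam's interactive utilities and may differ; PaneInfoTiming.to_string is assumed to return the timing name (e.g. 'EARLY'):

from apache_beam.utils.windowed_value import PaneInfoTiming

def pane_info_formatter(pane_info):
  # Hypothetical: label first/last panes, then append the timing, yielding
  # strings such as 'Pane 0: Final Early' for the PaneInfo above.
  labels = []
  if pane_info.is_first:
    labels.append('First')
  if pane_info.is_last:
    labels.append('Final')
  labels.append(PaneInfoTiming.to_string(pane_info.timing).capitalize())
  return 'Pane %d: %s' % (pane_info.index, ' '.join(labels))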
Example no. 4
  def test_instance_check_windowed_value_holder(self):
    windowed_value = WindowedValue(
        'a', Timestamp(5), [beam.window.IntervalWindow(5, 10)],
        PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
    self.assertTrue(
        isinstance(WindowedValueHolder(windowed_value), WindowedValueHolder))
    self.assertTrue(
        isinstance(
            beam.Row(windowed_value=windowed_value,
                     urn=common_urns.coders.ROW.urn), WindowedValueHolder))
    self.assertFalse(
        isinstance(beam.Row(windowed_value=windowed_value),
                   WindowedValueHolder))
    self.assertFalse(isinstance(windowed_value, WindowedValueHolder))
    self.assertFalse(
        isinstance(beam.Row(x=windowed_value), WindowedValueHolder))
    self.assertFalse(
        isinstance(beam.Row(windowed_value=1), WindowedValueHolder))
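The beam.Row assertions above work because WindowedValueHolder customizes instance checks: a Row with a windowed_value field holding a WindowedValue plus a urn field also counts as a holder. An illustrative, purely hypothetical version of such a check:

from apache_beam.utils.windowed_value import WindowedValue

class _HolderLikeMeta(type):
  # Hypothetical: accept any object carrying a WindowedValue in a
  # 'windowed_value' attribute together with a 'urn' attribute.
  def __instancecheck__(cls, obj):
    return (
        hasattr(obj, 'windowed_value') and hasattr(obj, 'urn') and
        isinstance(obj.windowed_value, WindowedValue))

class HolderLike(metaclass=_HolderLikeMeta):
  pass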
Example no. 5

    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        ib.watch(locals())

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate to millisecond precision, because the window coder encodes
        # timestamps in millis before they are upcast back to micros.
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True)
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)
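The millisecond truncation used for end_of_window can be checked in isolation; a minimal sketch:

from apache_beam.transforms.window import GlobalWindow

# Floor the global window's max timestamp to whole milliseconds, mirroring
# the round trip through the millis-based window coder.
micros = GlobalWindow().max_timestamp().micros
end_of_window = micros // 1000 * 1000
assert end_of_window % 1000 == 0
assert 0 <= micros - end_of_window < 1000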
Example no. 6

    def test_streaming_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # Create a fake limiter that cancels the background caching job (BCJ)
        # once the main job has received the expected number of results.
        class FakeLimiter:
            def __init__(self, p, pcoll):
                self.p = p
                self.pcoll = pcoll

            def is_triggered(self):
                result = ie.current_env().pipeline_result(self.p)
                if result:
                    try:
                        results = result.get(self.pcoll)
                    except ValueError:
                        return False
                    return len(results) >= 10
                return False

        # This sets the limiters to stop reading when the test receives 10 elements.
        ie.current_env().options.capture_control.set_limiters_for_test(
            [FakeLimiter(p, data)])

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('or', 0, [IntervalWindow(0, 10)], pane_info),
            ('not', 0, [IntervalWindow(0, 10)], pane_info),
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed through correctly, so that
        # the data is also aggregated correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, include_window_info=True)

        # The group by key has no guarantee of order. So we post-process the DF by
        # sorting so we can test equality.
        sorted_counts_df = (counts_df
                            .sort_values(['event_time', 0], ascending=True)
                            .reset_index(drop=True)) # yapf: disable
        pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
Example no. 7
    def test_streaming_wordcount(self):
        self.skipTest('[BEAM-9601] Test is breaking PreCommits')

        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        ib.options.capture_duration = timedelta(seconds=1)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('be', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('or', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('not', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('to', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('be', 0, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [beam.window.IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed through correctly, so that
        # the data is also aggregated correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('to', 2, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('be', 2, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [beam.window.IntervalWindow(0, 10)], pane_info),
            ('that', 1, 29999999, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('is', 1, 29999999, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [beam.window.IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [beam.window.IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, include_window_info=True)
        pd.testing.assert_frame_equal(expected_counts_df, counts_df)
Example no. 8

class StandardCodersTest(unittest.TestCase):

    _urn_to_json_value_parser = {
        'beam:coder:bytes:v1':
        lambda x: x.encode('utf-8'),
        'beam:coder:bool:v1':
        lambda x: x,
        'beam:coder:string_utf8:v1':
        lambda x: x,
        'beam:coder:varint:v1':
        lambda x: x,
        'beam:coder:kv:v1':
        lambda x, key_parser, value_parser:
        (key_parser(x['key']), value_parser(x['value'])),
        'beam:coder:interval_window:v1':
        lambda x: IntervalWindow(
            start=Timestamp(micros=(x['end'] - x['span']) * 1000),
            end=Timestamp(micros=x['end'] * 1000)),
        'beam:coder:iterable:v1':
        lambda x, parser: list(map(parser, x)),
        'beam:coder:global_window:v1':
        lambda x: window.GlobalWindow(),
        'beam:coder:windowed_value:v1':
        lambda x, value_parser, window_parser: windowed_value.create(
            value_parser(x['value']), x['timestamp'] * 1000,
            tuple([window_parser(w) for w in x['windows']])),
        'beam:coder:param_windowed_value:v1':
        lambda x, value_parser, window_parser: windowed_value.create(
            value_parser(x['value']), x['timestamp'] * 1000,
            tuple([window_parser(w) for w in x['windows']]),
            PaneInfo(x['pane']['is_first'], x['pane']['is_last'],
                     PaneInfoTiming.from_string(x['pane']['timing']),
                     x['pane']['index'], x['pane']['on_time_index'])),
        'beam:coder:timer:v1':
        lambda x, value_parser, window_parser: userstate.Timer(
            user_key=value_parser(x['userKey']),
            dynamic_timer_tag=x['dynamicTimerTag'],
            clear_bit=x['clearBit'],
            windows=tuple([window_parser(w) for w in x['windows']]),
            fire_timestamp=None,
            hold_timestamp=None,
            paneinfo=None)
        if x['clearBit'] else userstate.Timer(
            user_key=value_parser(x['userKey']),
            dynamic_timer_tag=x['dynamicTimerTag'],
            clear_bit=x['clearBit'],
            fire_timestamp=Timestamp(micros=x['fireTimestamp'] * 1000),
            hold_timestamp=Timestamp(micros=x['holdTimestamp'] * 1000),
            windows=tuple([window_parser(w) for w in x['windows']]),
            paneinfo=PaneInfo(x['pane']['is_first'], x['pane']['is_last'],
                              PaneInfoTiming.from_string(x['pane']['timing']),
                              x['pane']['index'], x['pane']['on_time_index'])),
        'beam:coder:double:v1':
        parse_float,
        'beam:coder:sharded_key:v1':
        lambda x, value_parser: ShardedKey(
            key=value_parser(x['key']), shard_id=x['shardId'].encode('utf-8')),
        'beam:coder:custom_window:v1':
        lambda x, window_parser: window_parser(x['window'])
    }

    def test_standard_coders(self):
        for name, spec in _load_test_cases(STANDARD_CODERS_YAML):
            logging.info('Executing %s test.', name)
            self._run_standard_coder(name, spec)

    def _run_standard_coder(self, name, spec):
        def assert_equal(actual, expected):
            """Handle nan values which self.assertEqual fails on."""
            if (isinstance(actual, float) and isinstance(expected, float)
                    and math.isnan(actual) and math.isnan(expected)):
                return
            self.assertEqual(actual, expected)

        coder = self.parse_coder(spec['coder'])
        parse_value = self.json_value_parser(spec['coder'])
        nested_list = [spec['nested']] if 'nested' in spec else [True, False]
        for nested in nested_list:
            for expected_encoded, json_value in spec['examples'].items():
                value = parse_value(json_value)
                expected_encoded = expected_encoded.encode('latin1')
                if not spec['coder'].get('non_deterministic', False):
                    actual_encoded = encode_nested(coder, value, nested)
                    if self.fix and actual_encoded != expected_encoded:
                        self.to_fix[spec['index'],
                                    expected_encoded] = actual_encoded
                    else:
                        self.assertEqual(expected_encoded, actual_encoded)
                        decoded = decode_nested(coder, expected_encoded,
                                                nested)
                        assert_equal(decoded, value)
                else:
                    # Only verify decoding for a non-deterministic coder
                    self.assertEqual(
                        decode_nested(coder, expected_encoded, nested), value)

    def parse_coder(self, spec):
        context = pipeline_context.PipelineContext()
        coder_id = str(hash(str(spec)))
        component_ids = [
            context.coders.get_id(self.parse_coder(c))
            for c in spec.get('components', ())
        ]
        context.coders.put_proto(
            coder_id,
            beam_runner_api_pb2.Coder(spec=beam_runner_api_pb2.FunctionSpec(
                urn=spec['urn'],
                payload=spec.get('payload', '').encode('latin1')),
                                      component_coder_ids=component_ids))
        return context.coders.get_by_id(coder_id)

    def json_value_parser(self, coder_spec):
        # TODO: integrate this with the logic for the other parsers
        if coder_spec['urn'] == 'beam:coder:row:v1':
            schema = schema_pb2.Schema.FromString(
                coder_spec['payload'].encode('latin1'))
            return value_parser_from_schema(schema)

        component_parsers = [
            self.json_value_parser(c)
            for c in coder_spec.get('components', ())
        ]
        return lambda x: self._urn_to_json_value_parser[coder_spec['urn']](
            x, *component_parsers)

    # Used when --fix is passed.

    fix = False
    to_fix = {}  # type: Dict[Tuple[int, bytes], bytes]

    @classmethod
    def tearDownClass(cls):
        if cls.fix and cls.to_fix:
            print("FIXING", len(cls.to_fix), "TESTS")
            doc_sep = '\n---\n'
            docs = open(STANDARD_CODERS_YAML).read().split(doc_sep)

            def quote(s):
                return json.dumps(s.decode('latin1')).replace(r'\u0000', r'\0')

            for (doc_ix,
                 expected_encoded), actual_encoded in cls.to_fix.items():
                print(quote(expected_encoded), "->", quote(actual_encoded))
                docs[doc_ix] = docs[doc_ix].replace(
                    quote(expected_encoded) + ':',
                    quote(actual_encoded) + ':')
            open(STANDARD_CODERS_YAML, 'w').write(doc_sep.join(docs))
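encode_nested and decode_nested are helpers defined elsewhere in this test module. A sketch of what they plausibly do, assuming the stream helpers in apache_beam.coders.coder_impl:

from apache_beam.coders import coder_impl

def encode_nested(coder, value, nested=True):
  # Encode via the coder's impl so the 'nested' flag controls length-prefixing.
  out = coder_impl.create_OutputStream()
  coder.get_impl().encode_to_stream(value, out, nested)
  return out.get()

def decode_nested(coder, encoded, nested=True):
  return coder.get_impl().decode_from_stream(
      coder_impl.create_InputStream(encoded), nested)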
Example no. 9
  def test_pane_info_formatter(self):
    pane_info = PaneInfo(
        is_first=False,
        is_last=True,
        timing=PaneInfoTiming.EARLY,
        index=0,
        nonspeculative_index=0)
    self.assertEqual('Pane 0: Final Early', pv.pane_info_formatter(pane_info))