def test_sharded_key_coder(self):
  key_and_coders = [(b'', b'\x00', coders.BytesCoder()),
                    (b'key', b'\x03key', coders.BytesCoder()),
                    ('key', b'\x03\x6b\x65\x79', coders.StrUtf8Coder()),
                    (('k', 1),
                     b'\x01\x6b\x01',
                     coders.TupleCoder(
                         (coders.StrUtf8Coder(), coders.VarIntCoder())))]

  for key, bytes_repr, key_coder in key_and_coders:
    coder = coders.ShardedKeyCoder(key_coder)

    # Verify cloud object representation
    self.assertEqual({
        '@type': 'kind:sharded_key',
        'component_encodings': [key_coder.as_cloud_object()]
    }, coder.as_cloud_object())

    self.assertEqual(b'\x00' + bytes_repr, coder.encode(ShardedKey(key, b'')))
    self.assertEqual(
        b'\x03123' + bytes_repr, coder.encode(ShardedKey(key, b'123')))

    # Test unnested
    self.check_coder(coder, ShardedKey(key, b''))
    self.check_coder(coder, ShardedKey(key, b'123'))

    for other_key, _, other_key_coder in key_and_coders:
      other_coder = coders.ShardedKeyCoder(other_key_coder)
      # Test nested
      self.check_coder(
          coders.TupleCoder((coder, other_coder)),
          (ShardedKey(key, b''), ShardedKey(other_key, b'')))
      self.check_coder(
          coders.TupleCoder((coder, other_coder)),
          (ShardedKey(key, b'123'), ShardedKey(other_key, b'')))
def __init__(self,
             file_name,  # type: str
             range_tracker,  # type: range_trackers.OffsetRangeTracker
             file_pattern,  # type: str
             compression_type,  # type: str
             allow_malformed_records,  # type: bool
             representative_header_lines=None,  # type: List[str]
             splittable_bgzf=False,  # type: bool
             **kwargs  # type: **str
            ):
  # type: (...) -> None
  # If `representative_header_lines` is given, header lines in `file_name`
  # are ignored; refer to _process_header_lines() logic.
  self._representative_header_lines = representative_header_lines
  self._file_name = file_name
  self._allow_malformed_records = allow_malformed_records

  if splittable_bgzf:
    text_source = bgzf.BGZFBlockSource(
        file_name,
        range_tracker,
        representative_header_lines,
        compression_type,
        header_processor_fns=(
            lambda x: not x.strip() or x.startswith('#'),
            self._process_header_lines),
        **kwargs)
  elif compression_type == filesystems.CompressionTypes.GZIP:
    text_source = bgzf.BGZFSource(
        file_pattern,
        0,  # min_bundle_size
        compression_type,
        True,  # strip_trailing_newlines
        coders.StrUtf8Coder(),  # coder
        validate=False,
        header_processor_fns=(
            lambda x: not x.strip() or x.startswith('#'),
            self._process_header_lines),
        **kwargs)
  else:
    text_source = textio._TextSource(
        file_pattern,
        0,  # min_bundle_size
        compression_type,
        True,  # strip_trailing_newlines
        coders.StrUtf8Coder(),  # coder
        validate=False,
        header_processor_fns=(
            lambda x: not x.strip() or x.startswith('#'),
            self._process_header_lines),
        **kwargs)

  self._text_lines = text_source.read_records(self._file_name, range_tracker)
def test_sharded_key_coder(self):
  key_and_coders = [(b'', b'\x00', coders.BytesCoder()),
                    (b'key', b'\x03key', coders.BytesCoder()),
                    ('key', b'\x03\x6b\x65\x79', coders.StrUtf8Coder()),
                    (('k', 1),
                     b'\x01\x6b\x01',
                     coders.TupleCoder(
                         (coders.StrUtf8Coder(), coders.VarIntCoder())))]

  for key, bytes_repr, key_coder in key_and_coders:
    coder = coders.ShardedKeyCoder(key_coder)

    # Verify cloud object representation
    self.assertEqual({
        '@type': 'kind:sharded_key',
        'component_encodings': [key_coder.as_cloud_object()]
    }, coder.as_cloud_object())

    # Test str repr
    self.assertEqual('%s' % coder, 'ShardedKeyCoder[%s]' % key_coder)

    self.assertEqual(b'\x00' + bytes_repr, coder.encode(ShardedKey(key, b'')))
    self.assertEqual(
        b'\x03123' + bytes_repr, coder.encode(ShardedKey(key, b'123')))

    # Test unnested
    self.check_coder(coder, ShardedKey(key, b''))
    self.check_coder(coder, ShardedKey(key, b'123'))

    # Test type hints
    self.assertTrue(
        isinstance(
            coder.to_type_hint(), sharded_key_type.ShardedKeyTypeConstraint))
    key_type = coder.to_type_hint().key_type
    if isinstance(key_type, typehints.TupleConstraint):
      self.assertEqual(key_type.tuple_types, (type(key[0]), type(key[1])))
    else:
      self.assertEqual(key_type, type(key))
    self.assertEqual(
        coders.ShardedKeyCoder.from_type_hint(
            coder.to_type_hint(), typecoders.CoderRegistry()),
        coder)

    for other_key, _, other_key_coder in key_and_coders:
      other_coder = coders.ShardedKeyCoder(other_key_coder)
      # Test nested
      self.check_coder(
          coders.TupleCoder((coder, other_coder)),
          (ShardedKey(key, b''), ShardedKey(other_key, b'')))
      self.check_coder(
          coders.TupleCoder((coder, other_coder)),
          (ShardedKey(key, b'123'), ShardedKey(other_key, b'')))
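# Standalone usage sketch (not part of the test above): a ShardedKeyCoder
# round trip. Assumes a recent apache_beam where ShardedKey is exposed in
# apache_beam.utils.sharded_key; the key 'user-1' and shard id are
# illustrative values only.
from apache_beam.coders import coders
from apache_beam.utils.sharded_key import ShardedKey

sharded_coder = coders.ShardedKeyCoder(coders.StrUtf8Coder())
encoded = sharded_coder.encode(ShardedKey('user-1', b'\x01'))
assert sharded_coder.decode(encoded) == ShardedKey('user-1', b'\x01')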
def test_deduplication_with_event_time(self):
  deduplicate_duration = 60
  with self.create_pipeline() as p:
    test_stream = (
        TestStream(coder=coders.StrUtf8Coder()).with_output_types(
            str).advance_watermark_to(0).add_elements([
                window.TimestampedValue('k1', 0),
                window.TimestampedValue('k2', 20),
                window.TimestampedValue('k3', 30)
            ]).advance_watermark_to(30).add_elements([
                window.TimestampedValue('k1', 40),
                window.TimestampedValue('k2', 50),
                window.TimestampedValue('k3', 60)
            ]).advance_watermark_to(deduplicate_duration).add_elements(
                [window.TimestampedValue('k1', 70)
                 ]).advance_watermark_to_infinity())
    res = (
        p
        | test_stream
        | deduplicate.Deduplicate(
            event_time_duration=Duration(deduplicate_duration))
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))

    assert_that(
        res,
        equal_to([('k1', Timestamp(0)), ('k2', Timestamp(20)),
                  ('k3', Timestamp(30)), ('k1', Timestamp(70))]))
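# Hedged usage sketch for Deduplicate outside a TestStream context: the
# transform also applies to a bounded PCollection, where duplicates arriving
# within the event-time duration are dropped. Exact timer semantics depend on
# the runner; this sketch assumes the DirectRunner.
import apache_beam as beam
from apache_beam.transforms import deduplicate
from apache_beam.utils.timestamp import Duration

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['k1', 'k2', 'k1'])
      | deduplicate.Deduplicate(event_time_duration=Duration(60))
      | beam.Map(print))  # expected to print 'k1' and 'k2' once each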
def __init__(self,
             file_name,
             range_tracker,
             file_pattern,
             compression_type,
             allow_malformed_records,
             **kwargs):
  self._header_lines = []
  self._last_record = None
  self._file_name = file_name
  self._allow_malformed_records = allow_malformed_records

  text_source = _TextSource(
      file_pattern,
      0,  # min_bundle_size
      compression_type,
      True,  # strip_trailing_newlines
      coders.StrUtf8Coder(),  # coder
      validate=False,
      header_processor_fns=(lambda x: x.startswith('#'),
                            self._store_header_lines),
      **kwargs)

  self._text_lines = text_source.read_records(self._file_name, range_tracker)

  try:
    self._vcf_reader = vcf.Reader(fsock=self._create_generator())
  except SyntaxError as e:
    raise ValueError(
        'Invalid VCF header in %s: %s' % (self._file_name, str(e)))
def __init__(self,
             file_name,
             range_tracker,
             file_pattern,
             compression_type,
             allow_malformed_records,
             **kwargs):
  self._header_lines = []
  self._last_record = None
  self._file_name = file_name
  self._allow_malformed_records = allow_malformed_records

  text_source = TextSource(
      file_pattern,
      0,  # min_bundle_size
      compression_type,
      True,  # strip_trailing_newlines
      coders.StrUtf8Coder(),  # coder
      validate=False,
      header_processor_fns=(lambda x: x.startswith('#'),
                            self._store_header_lines),
      **kwargs)

  self._text_lines = text_source.read_records(self._file_name, range_tracker)

  try:
    self._vcf_reader = vcf.Reader(fsock=self._create_generator())
  except SyntaxError:
    # Throw the exception inside the generator to ensure the file is properly
    # closed (it's opened inside TextSource.read_records). Note that
    # traceback.format_exc() takes no exception argument; it formats the
    # exception currently being handled.
    self._text_lines.throw(
        ValueError(
            'An exception was raised when reading header from VCF '
            'file %s: %s' % (self._file_name, traceback.format_exc())))
def test_map_coder(self):
  self.check_coder(
      coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder()),
      {1: "one", 300: "three hundred"},
      {},
      {i: str(i) for i in range(5000)})
def test_tuple_coder(self):
  kv_coder = coders.TupleCoder((coders.VarIntCoder(), coders.BytesCoder()))
  # Verify cloud object representation
  self.assertEqual({
      '@type': 'kind:pair',
      'is_pair_like': True,
      'component_encodings': [
          coders.VarIntCoder().as_cloud_object(),
          coders.BytesCoder().as_cloud_object()
      ],
  }, kv_coder.as_cloud_object())
  # Test binary representation (bytes literals: the BytesCoder component
  # requires bytes values under Python 3)
  self.assertEqual(b'\x04abc', kv_coder.encode((4, b'abc')))
  # Test unnested
  self.check_coder(kv_coder, (1, b'a'), (-2, b'a' * 100), (300, b'abc\0' * 5))
  # Test nested
  self.check_coder(
      coders.TupleCoder((
          coders.TupleCoder((coders.PickleCoder(), coders.VarIntCoder())),
          coders.StrUtf8Coder())),
      ((1, 2), 'a'),
      ((-2, 5), u'a\u0101' * 100),
      ((300, 1), 'abc\0' * 5))
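# Round-trip sketch mirroring the assertions above: each component is encoded
# with its own coder, and the final BytesCoder component is written raw in the
# outer context, so the concatenation is b'\x04' + b'abc'.
from apache_beam.coders import coders

kv_coder = coders.TupleCoder((coders.VarIntCoder(), coders.BytesCoder()))
assert kv_coder.encode((4, b'abc')) == b'\x04abc'
assert kv_coder.decode(b'\x04abc') == (4, b'abc')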
def to_runner_api(self, context):
  # type: (PipelineContext) -> beam_runner_api_pb2.TimerFamilySpec
  return beam_runner_api_pb2.TimerFamilySpec(
      time_domain=TimeDomain.to_runner_api(self.time_domain),
      timer_family_coder_id=context.coders.get_id(
          coders._TimerCoder(coders.StrUtf8Coder(),
                             coders.GlobalWindowCoder())))
def __init__(
    self,
    min_bundle_size=0,
    desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
    compression_type=CompressionTypes.AUTO,
    strip_trailing_newlines=True,
    coder=coders.StrUtf8Coder(),  # type: coders.Coder
    skip_header_lines=0,
    with_filename=False,
    delimiter=None,
    escapechar=None,
    **kwargs):
  """Initialize the ``ReadAllFromText`` transform.

  Args:
    min_bundle_size: Minimum size of bundles that should be generated when
      splitting this source into bundles. See ``FileBasedSource`` for more
      details.
    desired_bundle_size: Desired size of bundles that should be generated
      when splitting this source into bundles. See ``FileBasedSource`` for
      more details.
    compression_type: Used to handle compressed input files. Typical value
      is ``CompressionTypes.AUTO``, in which case the underlying file_path's
      extension will be used to detect the compression.
    strip_trailing_newlines: Indicates whether this source should remove
      the newline char in each line it reads before decoding that line.
    validate: flag to verify that the files exist during the pipeline
      creation time.
    skip_header_lines: Number of header lines to skip. Same number is
      skipped from each source file. Must be 0 or higher. Large number of
      skipped lines might impact performance.
    coder: Coder used to decode each line.
    with_filename: If True, returns a Key Value with the key being the file
      name and the value being the actual data. If False, it only returns
      the data.
    delimiter (bytes): Optional: delimiter to split records. Must not
      self-overlap, because self-overlapping delimiters cause ambiguous
      parsing.
    escapechar (bytes): Optional: a single byte to escape the records
      delimiter, can also escape itself.
  """
  super().__init__(**kwargs)
  source_from_file = partial(
      _create_text_source,
      min_bundle_size=min_bundle_size,
      compression_type=compression_type,
      strip_trailing_newlines=strip_trailing_newlines,
      coder=coder,
      skip_header_lines=skip_header_lines,
      delimiter=delimiter,
      escapechar=escapechar)
  self._desired_bundle_size = desired_bundle_size
  self._min_bundle_size = min_bundle_size
  self._compression_type = compression_type
  self._read_all_files = ReadAllFiles(
      True,
      compression_type,
      desired_bundle_size,
      min_bundle_size,
      source_from_file,
      with_filename)
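# Hedged usage sketch: unlike ReadFromText, ReadAllFromText takes its file
# patterns from a PCollection, which suits cases where the file list is only
# known at pipeline runtime. The paths below are hypothetical.
import apache_beam as beam
from apache_beam.io.textio import ReadAllFromText

with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create(['/tmp/logs-a*.txt', '/tmp/logs-b*.txt'])
      | ReadAllFromText()
      | beam.Map(print))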
def test_param_windowed_value_coder(self):
  from apache_beam.transforms.window import IntervalWindow
  from apache_beam.utils.windowed_value import PaneInfo
  wv = windowed_value.create(
      b'',
      # Milliseconds to microseconds
      1000 * 1000,
      (IntervalWindow(11, 21), ),
      PaneInfo(True, False, 1, 2, 3))
  windowed_value_coder = coders.WindowedValueCoder(
      coders.BytesCoder(), coders.IntervalWindowCoder())
  payload = windowed_value_coder.encode(wv)
  coder = coders.ParamWindowedValueCoder(
      payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()])

  # Test binary representation
  self.assertEqual(
      b'\x01', coder.encode(window.GlobalWindows.windowed_value(1)))

  # Test unnested
  self.check_coder(
      coders.ParamWindowedValueCoder(
          payload, [coders.VarIntCoder(), coders.IntervalWindowCoder()]),
      windowed_value.WindowedValue(
          3, 1, (window.IntervalWindow(11, 21), ),
          PaneInfo(True, False, 1, 2, 3)),
      windowed_value.WindowedValue(
          1, 1, (window.IntervalWindow(11, 21), ),
          PaneInfo(True, False, 1, 2, 3)))

  # Test nested
  self.check_coder(
      coders.TupleCoder((
          coders.ParamWindowedValueCoder(
              payload,
              [coders.FloatCoder(), coders.IntervalWindowCoder()]),
          coders.ParamWindowedValueCoder(
              payload,
              [coders.StrUtf8Coder(), coders.IntervalWindowCoder()]))),
      (windowed_value.WindowedValue(
          1.5, 1, (window.IntervalWindow(11, 21), ),
          PaneInfo(True, False, 1, 2, 3)),
       windowed_value.WindowedValue(
           "abc", 1, (window.IntervalWindow(11, 21), ),
           PaneInfo(True, False, 1, 2, 3))))
def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=filesystem.CompressionTypes.AUTO,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder(),
             buffer_size=DEFAULT_READ_BUFFER_SIZE,
             validate=True):
  super(self.__class__, self).__init__(
      file_pattern,
      min_bundle_size,
      compression_type,
      strip_trailing_newlines,
      coder,
      buffer_size,
      validate,
      1)  # skip_header_lines (assumed from _TextSource's signature)
def __init__(
    self,
    file_pattern=None,
    min_bundle_size=0,
    compression_type=CompressionTypes.AUTO,
    strip_trailing_newlines=True,
    coder=coders.StrUtf8Coder(),  # type: coders.Coder
    validate=True,
    skip_header_lines=0,
    delimiter=None,
    escapechar=None,
    **kwargs):
  """Initialize the :class:`ReadFromText` transform.

  Args:
    file_pattern (str): The file path to read from as a local file path or
      a GCS ``gs://`` path. The path can contain glob characters
      (``*``, ``?``, and ``[...]`` sets).
    min_bundle_size (int): Minimum size of bundles that should be generated
      when splitting this source into bundles. See
      :class:`~apache_beam.io.filebasedsource.FileBasedSource` for more
      details.
    compression_type (str): Used to handle compressed input files.
      Typical value is :attr:`CompressionTypes.AUTO
      <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
      underlying file_path's extension will be used to detect the
      compression.
    strip_trailing_newlines (bool): Indicates whether this source should
      remove the newline char in each line it reads before decoding that
      line.
    validate (bool): flag to verify that the files exist during the
      pipeline creation time.
    skip_header_lines (int): Number of header lines to skip. Same number is
      skipped from each source file. Must be 0 or higher. Large number of
      skipped lines might impact performance.
    coder (~apache_beam.coders.coders.Coder): Coder used to decode each
      line.
    delimiter (bytes): Optional: delimiter to split records. Must not
      self-overlap, because self-overlapping delimiters cause ambiguous
      parsing.
    escapechar (bytes): Optional: a single byte to escape the records
      delimiter, can also escape itself.
  """
  super().__init__(**kwargs)
  self._source = self._source_class(
      file_pattern,
      min_bundle_size,
      compression_type,
      strip_trailing_newlines,
      coder,
      validate=validate,
      skip_header_lines=skip_header_lines,
      delimiter=delimiter,
      escapechar=escapechar)
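# Hedged usage sketch for ReadFromText; the glob path and header count are
# hypothetical.
import apache_beam as beam
from apache_beam.io.textio import ReadFromText

with beam.Pipeline() as p:
  lines = p | ReadFromText('/tmp/input*.csv', skip_header_lines=1)
  _ = lines | beam.Map(print)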
def test_map_coder(self):
  values = [
      {
          1: "one", 300: "three hundred"
      },  # force yapf to be nice
      {},
      {i: str(i) for i in range(5000)}
  ]
  map_coder = coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder())
  self.check_coder(map_coder, *values)
  self.check_coder(map_coder.as_deterministic_coder("label"), *values)
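# Round-trip sketch for MapCoder. MapCoder is an internal coder (note the
# deterministic variant exercised above, which is what key-grouping requires),
# so its API may shift between Beam versions.
from apache_beam.coders import coders

map_coder = coders.MapCoder(coders.VarIntCoder(), coders.StrUtf8Coder())
assert map_coder.decode(map_coder.encode({1: 'one'})) == {1: 'one'}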
def __init__(
    self,
    min_bundle_size=0,
    desired_bundle_size=DEFAULT_DESIRED_BUNDLE_SIZE,
    compression_type=CompressionTypes.AUTO,
    strip_trailing_newlines=True,
    coder=coders.StrUtf8Coder(),  # type: coders.Coder
    skip_header_lines=0,
    **kwargs):
  """Initialize the ``ReadAllFromText`` transform.

  Args:
    min_bundle_size: Minimum size of bundles that should be generated when
      splitting this source into bundles. See ``FileBasedSource`` for more
      details.
    desired_bundle_size: Desired size of bundles that should be generated
      when splitting this source into bundles. See ``FileBasedSource`` for
      more details.
    compression_type: Used to handle compressed input files. Typical value
      is ``CompressionTypes.AUTO``, in which case the underlying file_path's
      extension will be used to detect the compression.
    strip_trailing_newlines: Indicates whether this source should remove
      the newline char in each line it reads before decoding that line.
    validate: flag to verify that the files exist during the pipeline
      creation time.
    skip_header_lines: Number of header lines to skip. Same number is
      skipped from each source file. Must be 0 or higher. Large number of
      skipped lines might impact performance.
    coder: Coder used to decode each line.
  """
  super(ReadAllFromText, self).__init__(**kwargs)
  source_from_file = partial(
      _create_text_source,
      min_bundle_size=min_bundle_size,
      compression_type=compression_type,
      strip_trailing_newlines=strip_trailing_newlines,
      coder=coder,
      skip_header_lines=skip_header_lines)
  self._desired_bundle_size = desired_bundle_size
  self._min_bundle_size = min_bundle_size
  self._compression_type = compression_type
  self._read_all_files = ReadAllFiles(
      True,
      compression_type,
      desired_bundle_size,
      min_bundle_size,
      source_from_file)
def test_windowed_value_coder(self):
  coder = coders.WindowedValueCoder(
      coders.VarIntCoder(), coders.GlobalWindowCoder())
  # Verify cloud object representation
  self.assertEqual({
      '@type': 'kind:windowed_value',
      'is_wrapper': True,
      'component_encodings': [
          coders.VarIntCoder().as_cloud_object(),
          coders.GlobalWindowCoder().as_cloud_object(),
      ],
  }, coder.as_cloud_object())
  # Test binary representation
  self.assertEqual(
      b'\x7f\xdf;dZ\x1c\xac\t\x00\x00\x00\x01\x0f\x01',
      coder.encode(window.GlobalWindows.windowed_value(1)))
  # Test decoding large timestamp
  self.assertEqual(
      coder.decode(b'\x7f\xdf;dZ\x1c\xac\x08\x00\x00\x00\x01\x0f\x00'),
      windowed_value.create(0, MIN_TIMESTAMP.micros, (GlobalWindow(), )))
  # Test unnested
  self.check_coder(
      coders.WindowedValueCoder(coders.VarIntCoder()),
      windowed_value.WindowedValue(3, -100, ()),
      windowed_value.WindowedValue(-1, 100, (1, 2, 3)))
  # Test Global Window
  self.check_coder(
      coders.WindowedValueCoder(
          coders.VarIntCoder(), coders.GlobalWindowCoder()),
      window.GlobalWindows.windowed_value(1))
  # Test nested
  self.check_coder(
      coders.TupleCoder((
          coders.WindowedValueCoder(coders.FloatCoder()),
          coders.WindowedValueCoder(coders.StrUtf8Coder()))),
      (windowed_value.WindowedValue(1.5, 0, ()),
       windowed_value.WindowedValue("abc", 10, ('window', ))))
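# Round-trip sketch for WindowedValueCoder with the global window, mirroring
# the "Test Global Window" case above.
from apache_beam.coders import coders
from apache_beam.transforms import window

wv_coder = coders.WindowedValueCoder(
    coders.VarIntCoder(), coders.GlobalWindowCoder())
wv = window.GlobalWindows.windowed_value(1)
assert wv_coder.decode(wv_coder.encode(wv)) == wv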
def test_timer_coder(self):
  self.check_coder(
      coders._TimerCoder(coders.StrUtf8Coder(), coders.GlobalWindowCoder()),
      *[
          userstate.Timer(
              user_key="key",
              dynamic_timer_tag="tag",
              windows=(GlobalWindow(), ),
              clear_bit=True,
              fire_timestamp=None,
              hold_timestamp=None,
              paneinfo=None),
          userstate.Timer(
              user_key="key",
              dynamic_timer_tag="tag",
              windows=(GlobalWindow(), ),
              clear_bit=False,
              fire_timestamp=timestamp.Timestamp.of(123),
              hold_timestamp=timestamp.Timestamp.of(456),
              paneinfo=windowed_value.PANE_INFO_UNKNOWN)
      ])
def __init__(self,
             file_name,
             block,
             header_lines,
             compression_type,
             header_processor_fns,
             strip_trailing_newlines=True,
             min_bundle_size=0,
             coder=coders.StrUtf8Coder(),
             validate=True):
  """A source for reading a single BGZF block."""
  super(BGZFBlockSource, self).__init__(
      file_name,
      min_bundle_size,
      compression_type,
      strip_trailing_newlines,
      coder,
      validate=validate,
      header_processor_fns=header_processor_fns)
  self._block = block
  self._header_lines = header_lines
def test_deduplication_in_different_windows(self):
  with self.create_pipeline() as p:
    test_stream = (
        TestStream(coder=coders.StrUtf8Coder()).advance_watermark_to(
            0).add_elements([
                window.TimestampedValue('k1', 0),
                window.TimestampedValue('k2', 10),
                window.TimestampedValue('k3', 20),
                window.TimestampedValue('k1', 30),
                window.TimestampedValue('k2', 40),
                window.TimestampedValue('k3', 50),
                window.TimestampedValue('k4', 60),
                window.TimestampedValue('k5', 70),
                window.TimestampedValue('k6', 80)
            ]).advance_watermark_to_infinity())

    res = (
        p
        | test_stream
        | beam.WindowInto(window.FixedWindows(30))
        | deduplicate.Deduplicate(processing_time_duration=10 * 60)
        | beam.Map(lambda e, ts=beam.DoFn.TimestampParam: (e, ts)))
    # Deduplication should happen per window.
    expect_unique_keys_per_window = {
        window.IntervalWindow(0, 30): [('k1', Timestamp(0)),
                                       ('k2', Timestamp(10)),
                                       ('k3', Timestamp(20))],
        window.IntervalWindow(30, 60): [('k1', Timestamp(30)),
                                        ('k2', Timestamp(40)),
                                        ('k3', Timestamp(50))],
        window.IntervalWindow(60, 90): [('k4', Timestamp(60)),
                                        ('k5', Timestamp(70)),
                                        ('k6', Timestamp(80))],
    }
    assert_that(
        res,
        equal_to_per_window(expect_unique_keys_per_window),
        use_global_window=False,
        label='assert per window')
def __init__(self,
             file_pattern=None,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             strip_trailing_newlines=True,
             coder=coders.StrUtf8Coder(),
             validate=True,
             skip_header_lines=0,
             **kwargs):
  """Initialize the ``ReadFromText`` transform.

  Args:
    file_pattern: The file path to read from as a local file path or a GCS
      ``gs://`` path. The path can contain glob characters
      ``(*, ?, and [...] sets)``.
    min_bundle_size: Minimum size of bundles that should be generated when
      splitting this source into bundles. See ``FileBasedSource`` for more
      details.
    compression_type: Used to handle compressed input files. Typical value
      is ``CompressionTypes.AUTO``, in which case the underlying file_path's
      extension will be used to detect the compression.
    strip_trailing_newlines: Indicates whether this source should remove
      the newline char in each line it reads before decoding that line.
    validate: flag to verify that the files exist during the pipeline
      creation time.
    skip_header_lines: Number of header lines to skip. Same number is
      skipped from each source file. Must be 0 or higher. Large number of
      skipped lines might impact performance.
    coder: Coder used to decode each line.
  """
  super(ReadFromText, self).__init__(**kwargs)
  self._source = _TextSource(
      file_pattern,
      min_bundle_size,
      compression_type,
      strip_trailing_newlines,
      coder,
      validate=validate,
      skip_header_lines=skip_header_lines)
def test_nested_observables(self):
  class FakeObservableIterator(observable.ObservableMixin):
    def __iter__(self):
      return iter([1, 2, 3])

  # Coder for elements from the observable iterator.
  elem_coder = coders.VarIntCoder()
  iter_coder = coders.TupleSequenceCoder(elem_coder)

  # Test nested WindowedValue observable.
  coder = coders.WindowedValueCoder(iter_coder)
  observ = FakeObservableIterator()
  value = windowed_value.WindowedValue(observ, 0, ())
  self.assertEqual(
      coder.get_impl().get_estimated_size_and_observables(value)[1],
      [(observ, elem_coder.get_impl())])

  # Test nested tuple observable.
  coder = coders.TupleCoder((coders.StrUtf8Coder(), iter_coder))
  value = (u'123', observ)
  self.assertEqual(
      coder.get_impl().get_estimated_size_and_observables(value)[1],
      [(observ, elem_coder.get_impl())])
def __init__(self,
             file_name,  # type: str
             range_tracker,  # type: range_trackers.OffsetRangeTracker
             file_pattern,  # type: str
             compression_type,  # type: str
             allow_malformed_records,  # type: bool
             representative_header_lines=None,  # type: List[str]
             **kwargs  # type: **str
            ):
  # type: (...) -> None
  # If `representative_header_lines` is given, header lines in `file_name`
  # are ignored.
  self._header_lines = []
  self._representative_header_lines = representative_header_lines
  self._last_record = None
  self._file_name = file_name
  self._allow_malformed_records = allow_malformed_records

  text_source = textio._TextSource(
      file_pattern,
      0,  # min_bundle_size
      compression_type,
      True,  # strip_trailing_newlines
      coders.StrUtf8Coder(),  # coder
      validate=False,
      header_processor_fns=(lambda x: x.startswith('#'),
                            self._store_header_lines),
      **kwargs)

  self._text_lines = text_source.read_records(self._file_name, range_tracker)

  try:
    self._vcf_reader = vcf.Reader(fsock=self._create_generator())
  except SyntaxError as e:
    raise ValueError(
        'Invalid VCF header in %s: %s' % (self._file_name, str(e)))
def test_utf8_coder(self):
  self.check_coder(coders.StrUtf8Coder(), 'a', u'ab\u00FF', u'\u0101\0')
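# Minimal round trip showing what check_coder verifies for these values:
# encode to UTF-8 bytes, decode back, and compare for equality.
from apache_beam.coders import coders

utf8_coder = coders.StrUtf8Coder()
for value in ('a', u'ab\u00FF', u'\u0101\0'):
  assert utf8_coder.decode(utf8_coder.encode(value)) == value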