def test_seek_set(self):
  """Absolute (SEEK_SET) seeks in a compressed stream must mirror the same
  seeks into a BytesIO over the uncompressed content.

  Previously used StringIO and a negative seek position, which relied on
  Python 2 cStringIO clamping behavior; io.BytesIO raises ValueError on a
  negative SEEK_SET offset, so that case is dropped and the reference
  position is capped instead.
  """
  for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference_fd = BytesIO(self.content)
      # Note: BytesIO's tell() reports out of bound positions (if we seek
      # beyond the file), therefore we need to cap it to max_position.
      # _CompressedFile.tell() always stays within the bounds of the
      # uncompressed content.
      for seek_position in (0, 1, len(self.content) - 1, len(self.content),
                            len(self.content) + 1):
        compressed_fd.seek(seek_position, os.SEEK_SET)
        reference_fd.seek(seek_position, os.SEEK_SET)
        uncompressed_line = compressed_fd.readline()
        reference_line = reference_fd.readline()
        self.assertEqual(uncompressed_line, reference_line)
        uncompressed_position = compressed_fd.tell()
        reference_position = min(reference_fd.tell(), len(self.content))
        self.assertEqual(uncompressed_position, reference_position)
def test_seek_set(self):
  """Absolute seeking in compressed streams matches a BytesIO reference
  over the same uncompressed content."""
  compression_types = [
      CompressionTypes.BZIP2, CompressionTypes.DEFLATE, CompressionTypes.GZIP
  ]
  for compression_type in compression_types:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference_fd = BytesIO(self.content)
      # Note: BytesIO's tell() reports out of bound positions (if we seek
      # beyond the file), therefore we need to cap it to max_position.
      # _CompressedFile.tell() always stays within the bounds of the
      # uncompressed content.
      # Negative seek position argument is not supported for BytesIO with
      # whence set to SEEK_SET.
      content_len = len(self.content)
      seek_positions = (0, 1, content_len - 1, content_len, content_len + 1)
      for seek_position in seek_positions:
        compressed_fd.seek(seek_position, os.SEEK_SET)
        reference_fd.seek(seek_position, os.SEEK_SET)
        self.assertEqual(compressed_fd.readline(), reference_fd.readline())
        # Cap the reference at the content length to match
        # _CompressedFile.tell() semantics.
        capped_reference = min(reference_fd.tell(), content_len)
        self.assertEqual(compressed_fd.tell(), capped_reference)
def test_seek_cur(self):
  """Relative (SEEK_CUR) seeking in compressed streams tracks a BytesIO
  reference over the same uncompressed content."""
  compression_types = [
      CompressionTypes.BZIP2, CompressionTypes.DEFLATE, CompressionTypes.GZIP
  ]
  for compression_type in compression_types:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference_fd = BytesIO(self.content)
      # Test out of bound and inbound seeking in both directions.
      # Note: BytesIO's seek() reports out of bound positions (if we seek
      # beyond the file), therefore we need to cap it to max_position (to
      # make it consistent with the old StringIO behavior).
      content_len = len(self.content)
      seek_positions = (-1, 0, 1, content_len // 2, content_len // 2,
                        -1 * content_len // 2)
      for seek_position in seek_positions:
        compressed_fd.seek(seek_position, os.SEEK_CUR)
        reference_fd.seek(seek_position, os.SEEK_CUR)
        self.assertEqual(compressed_fd.readline(), reference_fd.readline())
        capped_position = min(reference_fd.tell(), content_len)
        # Re-align the reference stream with the capped position so the next
        # relative seek starts from the same point as the compressed stream.
        reference_fd.seek(capped_position, os.SEEK_SET)
        self.assertEqual(compressed_fd.tell(), capped_position)
def test_read_and_seek_back_to_beginning(self):
  """Reading a line, rewinding to offset 0, and reading again must yield
  an identical line."""
  for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      stream = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      first_line = stream.readline()
      # Rewind to the start of the uncompressed content.
      stream.seek(0, os.SEEK_SET)
      replayed_line = stream.readline()
      self.assertEqual(first_line, replayed_line)
def test_read_from_end_returns_no_data(self):
  """read() at the uncompressed end of file must return an empty byte
  string."""
  for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      # Position the stream at the uncompressed end of file.
      compressed_fd.seek(0, os.SEEK_END)
      # The underlying file is opened in binary mode, so read() yields
      # bytes: the expected empty value must be b'', not the str '' the
      # original compared against (b'' != '' on Python 3).
      expected_data = b''
      uncompressed_data = compressed_fd.read(10)
      self.assertEqual(uncompressed_data, expected_data)
def test_seek_outside(self):
  """Seeks far past either end of the content must clamp tell() to the
  range [0, len(content)] for every whence mode."""
  for compression_type in [
      CompressionTypes.BZIP2, CompressionTypes.DEFLATE, CompressionTypes.GZIP
  ]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      stream = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      for whence in (os.SEEK_CUR, os.SEEK_SET, os.SEEK_END):
        # Far before the start: the position clamps to 0.
        stream.seek(-1 * len(self.content) - 10, whence)
        self.assertEqual(stream.tell(), 0)
        # Far past the end: the position clamps to the content length.
        stream.seek(len(self.content) + 20, whence)
        self.assertEqual(stream.tell(), len(self.content))
def _add_compression(stream, path, mime_type, compression_type):
  """Wrap ``stream`` in a CompressedFile when ``path``/``compression_type``
  indicate compressed data; otherwise return ``stream`` unchanged.

  Args:
    stream: raw file-like object to (possibly) wrap.
    path: file path, used only to auto-detect the compression type.
    mime_type: ignored except for a warning on non-default values.
    compression_type: a CompressionTypes value, possibly AUTO.
  """
  if mime_type != 'application/octet-stream':
    logging.warning('Mime types are not supported. Got non-default mime_type:'
                    ' %s', mime_type)
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(path)
  if compression_type != CompressionTypes.UNCOMPRESSED:
    # Pass the resolved type explicitly: the original dropped it, leaving
    # CompressedFile to fall back on its default, which can mismatch the
    # type detected from the path (cf. _path_open, which passes it).
    return CompressedFile(stream, compression_type=compression_type)
  return stream
def _path_open(self, path, mode, mime_type='application/octet-stream',
               compression_type=CompressionTypes.AUTO):
  """Open ``path`` on S3 in ``mode``, wrapping the raw stream in a
  CompressedFile when the resolved compression type requires it."""
  compression_type = FileSystem._get_compression_type(path, compression_type)
  mime_type = CompressionTypes.mime_type(compression_type, mime_type)
  stream = s3io.S3IO().open(path, mode, mime_type=mime_type)
  if compression_type != CompressionTypes.UNCOMPRESSED:
    return CompressedFile(stream, compression_type=compression_type)
  return stream
def _open_hdfs(self, path, mode, mime_type, compression_type):
  """Open ``path`` on HDFS in ``mode``, wrapping the handle in a
  CompressedFile when the resolved compression type requires it.

  Args:
    path: HDFS path to open.
    mode: file mode passed to the HDFS client.
    mime_type: ignored except for a warning on non-default values.
    compression_type: a CompressionTypes value, possibly AUTO.
  """
  if mime_type != 'application/octet-stream':
    logging.warning(
        'Mime types are not supported. Got non-default mime_type:'
        ' %s', mime_type)
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(path)
  res = self._hdfs_client.open(path, mode)
  if compression_type != CompressionTypes.UNCOMPRESSED:
    # Pass the resolved type explicitly: the original dropped it, leaving
    # CompressedFile to fall back on its default, which can mismatch the
    # type detected from the path (cf. _path_open, which passes it).
    res = CompressedFile(res, compression_type=compression_type)
  return res
def test_seek_cur(self):
  """Relative (SEEK_CUR) seeking in compressed streams tracks a BytesIO
  reference over the same uncompressed content.

  Previously used StringIO and `/` division; on Python 3 `/` yields floats
  (invalid seek offsets) and io.StringIO rejects nonzero SEEK_CUR offsets,
  so this uses BytesIO, `//`, and caps the reference position.
  """
  for compression_type in [
      CompressionTypes.BZIP2, CompressionTypes.GZIP
  ]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference_fd = BytesIO(self.content)
      # Test out of bound, inbound seeking in both directions.
      for seek_position in (-1, 0, 1, len(self.content) // 2,
                            len(self.content) // 2,
                            -1 * len(self.content) // 2):
        compressed_fd.seek(seek_position, os.SEEK_CUR)
        reference_fd.seek(seek_position, os.SEEK_CUR)
        uncompressed_line = compressed_fd.readline()
        expected_line = reference_fd.readline()
        self.assertEqual(uncompressed_line, expected_line)
        # BytesIO reports out-of-bound positions; cap to the content length
        # and re-align so the next relative seek starts from the same spot.
        reference_position = min(reference_fd.tell(), len(self.content))
        reference_fd.seek(reference_position, os.SEEK_SET)
        self.assertEqual(compressed_fd.tell(), reference_position)
def test_seek_set(self):
  """Absolute seeking in compressed streams matches a BytesIO reference
  over the same uncompressed content."""
  for compression_type in [
      CompressionTypes.BZIP2, CompressionTypes.DEFLATE, CompressionTypes.GZIP
  ]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      stream = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference = BytesIO(self.content)
      # Note: BytesIO's tell() reports out of bound positions (if we seek
      # beyond the file), therefore we need to cap it to max_position.
      # _CompressedFile.tell() always stays within the bounds of the
      # uncompressed content.
      # Negative seek position argument is not supported for BytesIO with
      # whence set to SEEK_SET.
      max_position = len(self.content)
      for target in (0, 1, max_position - 1, max_position, max_position + 1):
        stream.seek(target, os.SEEK_SET)
        reference.seek(target, os.SEEK_SET)
        self.assertEqual(stream.readline(), reference.readline())
        self.assertEqual(stream.tell(), min(reference.tell(), max_position))
def test_seek_cur(self):
  """Relative (SEEK_CUR) seeking in compressed streams tracks a BytesIO
  reference over the same uncompressed content."""
  for compression_type in [
      CompressionTypes.BZIP2, CompressionTypes.DEFLATE, CompressionTypes.GZIP
  ]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      stream = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference = BytesIO(self.content)
      # Test out of bound and inbound seeking in both directions.
      # Note: BytesIO's seek() reports out of bound positions (if we seek
      # beyond the file), therefore we need to cap it to max_position (to
      # make it consistent with the old StringIO behavior).
      max_position = len(self.content)
      offsets = (-1, 0, 1, max_position // 2, max_position // 2,
                 -1 * max_position // 2)
      for offset in offsets:
        stream.seek(offset, os.SEEK_CUR)
        reference.seek(offset, os.SEEK_CUR)
        self.assertEqual(stream.readline(), reference.readline())
        capped = min(reference.tell(), max_position)
        # Re-align the reference stream with the capped position so the next
        # relative seek starts from the same point as the compressed stream.
        reference.seek(capped, os.SEEK_SET)
        self.assertEqual(stream.tell(), capped)
def test_seek_set(self):
  """Absolute (SEEK_SET) seeks in a compressed stream must mirror the same
  seeks into a BytesIO over the uncompressed content.

  Previously used StringIO and a negative seek position, which relied on
  Python 2 cStringIO clamping behavior; io.BytesIO raises ValueError on a
  negative SEEK_SET offset, so that case is dropped and the reference
  position is capped instead.
  """
  for compression_type in [
      CompressionTypes.BZIP2, CompressionTypes.GZIP
  ]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference_fd = BytesIO(self.content)
      # Note: BytesIO's tell() reports out of bound positions (if we seek
      # beyond the file), therefore we need to cap it to max_position.
      # _CompressedFile.tell() always stays within the bounds of the
      # uncompressed content.
      for seek_position in (0, 1, len(self.content) - 1, len(self.content),
                            len(self.content) + 1):
        compressed_fd.seek(seek_position, os.SEEK_SET)
        reference_fd.seek(seek_position, os.SEEK_SET)
        uncompressed_line = compressed_fd.readline()
        reference_line = reference_fd.readline()
        self.assertEqual(uncompressed_line, reference_line)
        uncompressed_position = compressed_fd.tell()
        reference_position = min(reference_fd.tell(), len(self.content))
        self.assertEqual(uncompressed_position, reference_position)
def test_seek_cur(self):
  """Relative (SEEK_CUR) seeking in compressed streams tracks a BytesIO
  reference over the same uncompressed content.

  Previously used StringIO and `/` division; on Python 3 `/` yields floats
  (invalid seek offsets) and io.StringIO rejects nonzero SEEK_CUR offsets,
  so this uses BytesIO, `//`, and caps the reference position.
  """
  for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      compressed_fd = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      reference_fd = BytesIO(self.content)
      # Test out of bound, inbound seeking in both directions.
      for seek_position in (-1, 0, 1, len(self.content) // 2,
                            len(self.content) // 2,
                            -1 * len(self.content) // 2):
        compressed_fd.seek(seek_position, os.SEEK_CUR)
        reference_fd.seek(seek_position, os.SEEK_CUR)
        uncompressed_line = compressed_fd.readline()
        expected_line = reference_fd.readline()
        self.assertEqual(uncompressed_line, expected_line)
        # BytesIO reports out-of-bound positions; cap to the content length
        # and re-align so the next relative seek starts from the same spot.
        reference_position = min(reference_fd.tell(), len(self.content))
        reference_fd.seek(reference_position, os.SEEK_SET)
        self.assertEqual(compressed_fd.tell(), reference_position)
def test_tell(self):
  """tell() must report the cumulative count of uncompressed bytes, both
  while writing and while reading back."""
  expected_lines = [b'line%d\n' % i for i in range(10)]
  tmpfile = self._create_temp_file()
  # Write phase: tell() advances by the uncompressed length of each write.
  with open(tmpfile, 'wb') as f:
    writeable = CompressedFile(f)
    offset = 0
    for data in expected_lines:
      writeable.write(data)
      offset += len(data)
      self.assertEqual(offset, writeable.tell())
  # Read phase: tell() advances by the length of each line read, and is
  # also checked once more on the empty read at end of file.
  with open(tmpfile, 'rb') as f:
    readable = CompressedFile(f)
    offset = 0
    while True:
      data = readable.readline()
      offset += len(data)
      self.assertEqual(offset, readable.tell())
      if not data:
        break
def test_seek_outside(self):
  """Seeks far past either end of the content must clamp tell() to the
  range [0, len(content)] for every whence mode."""
  for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
    file_name = self._create_compressed_file(compression_type, self.content)
    with open(file_name, 'rb') as f:
      stream = CompressedFile(
          f, compression_type, read_size=self.read_block_size)
      for whence in (os.SEEK_CUR, os.SEEK_SET, os.SEEK_END):
        # Far before the start: the position clamps to 0.
        stream.seek(-1 * len(self.content) - 10, whence)
        self.assertEqual(stream.tell(), 0)
        # Far past the end: the position clamps to the content length.
        stream.seek(len(self.content) + 20, whence)
        self.assertEqual(stream.tell(), len(self.content))
def test_seekable_enabled_on_read(self):
  """A CompressedFile over a readable stream must report seekable True."""
  # Use a context manager so the file handle is closed instead of leaked
  # (the original never closed it).
  with open(self._create_temp_file(), 'r') as f:
    readable = CompressedFile(f)
    self.assertTrue(readable.seekable)
def test_seekable(self):
  """Both readable and writeable CompressedFile objects report seekable
  False here.

  NOTE(review): other versions of this test in the file assert seekable is
  True for readable streams — confirm which CompressedFile revision this
  expectation targets before relying on it.
  """
  # Use context managers so the file handles are closed instead of leaked
  # (the original never closed either handle).
  with open(self._create_temp_file(), 'r') as f:
    readable = CompressedFile(f)
    self.assertFalse(readable.seekable)
  with open(self._create_temp_file(), 'w') as f:
    writeable = CompressedFile(f)
    self.assertFalse(writeable.seekable)
def test_concatenated_compressed_file(self):
  """Read lines spanning member boundaries of a concatenated compressed file.

  The test apache_beam.io.textio_test.test_read_gzip_concat does not
  encounter the problem in the Beam 2.13 and earlier code base because the
  test data is too small: the data is smaller than read_size, so it goes
  through logic in the code that avoids the problem. So, this test sets
  read_size smaller and test data bigger, in order to encounter the
  problem. It would be difficult to test in the textio_test module,
  because you'd need very large test data since the default read_size is
  16MiB, and the ReadFromText interface does not allow modifying read_size.
  """
  import random
  import threading

  num_test_lines = 10
  timeout = 30
  read_size = 64 << 10  # Set much smaller than the generated line size.
  # Single-byte values in the printable-ish range 32..95. bytes([i])
  # replaces six.int2byte, removing the third-party `six` dependency.
  byte_table = tuple(bytes([i]) for i in range(32, 96))

  def generate_random_line():
    # 4096 samples of 64 bytes each, terminated by a newline.
    byte_list = list(
        b for _ in range(4096) for b in random.sample(byte_table, 64))
    byte_list.append(b'\n')
    return b''.join(byte_list)

  def create_test_file(compression_type, lines):
    # Compress each line into its own file, then concatenate the raw
    # compressed bytes into one multi-member file.
    filenames = list()
    file_name = self._create_temp_file()
    if compression_type == CompressionTypes.BZIP2:
      compress_factory = bz2.BZ2File
    elif compression_type == CompressionTypes.GZIP:
      compress_factory = gzip.open
    else:
      assert False, "Invalid compression type: %s" % compression_type
    for line in lines:
      filenames.append(self._create_temp_file())
      with compress_factory(filenames[-1], 'wb') as f:
        f.write(line)
    with open(file_name, 'wb') as o:
      for name in filenames:
        with open(name, 'rb') as i:
          o.write(i.read())
    return file_name

  # A real concatenated gzip file once triggered an endless loop in the
  # beam filesystem module; the timer traps such a loop. This unit test
  # instead hits a different error in the Beam 2.13 and earlier
  # implementation, so the handler is a safety net rather than a
  # strict necessity.
  def timeout_handler():
    raise IOError('Exiting due to likely infinite loop logic in code.')

  timer = threading.Timer(timeout, timeout_handler)
  try:
    test_lines = tuple(generate_random_line() for _ in range(num_test_lines))
    for compression_type in [CompressionTypes.BZIP2, CompressionTypes.GZIP]:
      file_name = create_test_file(compression_type, test_lines)
      timer.start()
      with open(file_name, 'rb') as f:
        data = CompressedFile(f, compression_type, read_size=read_size)
        for written_line in test_lines:
          read_line = data.readline()
          self.assertEqual(written_line, read_line)
      timer.cancel()
      # Starting a new timer for the next iteration/test.
      timer = threading.Timer(timeout, timeout_handler)
  finally:
    timer.cancel()
def test_seekable_disabled_on_write(self):
  """A CompressedFile over a write-mode stream must report seekable
  False."""
  # Use a context manager so the file handle is closed instead of leaked
  # (the original never closed it).
  with open(self._create_temp_file(), 'w') as f:
    writeable = CompressedFile(f)
    self.assertFalse(writeable.seekable)
def test_seekable_enabled_on_read(self):
  """A CompressedFile over a binary read-mode stream must report seekable
  True."""
  with open(self._create_temp_file(), 'rb') as handle:
    self.assertTrue(CompressedFile(handle).seekable)
def test_seekable_disabled_on_append(self):
  """A CompressedFile over an append-mode stream must report seekable
  False."""
  with open(self._create_temp_file(), 'ab') as handle:
    self.assertFalse(CompressedFile(handle).seekable)