def test_match_multiples(self, mock_gcsio):
  """Matching a directory prefix should glob it and return every entry."""
  # Route GcsIO construction to a mock so no real GCS calls are made.
  mock_client = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: mock_client
  mock_client.size_of_files_in_glob.return_value = {
      'gs://bucket/file1': 1,
      'gs://bucket/file2': 2,
  }
  expected = {
      FileMetadata('gs://bucket/file1', 1),
      FileMetadata('gs://bucket/file2', 2),
  }

  file_system = gcsfilesystem.GCSFileSystem()
  match_result = file_system.match(['gs://bucket/'])[0]

  self.assertEqual(set(match_result.metadata_list), expected)
  # The trailing '/' must have been expanded into a '*' glob.
  mock_client.size_of_files_in_glob.assert_called_once_with(
      'gs://bucket/*', None)
def _match(pattern, limit):
  """Find all matching paths to the pattern provided."""
  # Apply the limit before stat-ing files so we only touch what we return.
  matched_paths = glob.glob(pattern)[:limit]
  return MatchResult(
      pattern,
      [FileMetadata(path, os.path.getsize(path)) for path in matched_paths])
def _list(self, url):
  """Yield a ``FileMetadata`` for each entry directly under ``url``.

  Raises:
    ``BeamIOError``: if the listing fails.
  """
  try:
    path = self._parse_url(url)
    # status=True makes the client return (name, status_dict) pairs.
    for name, status in self._hdfs_client.list(path, status=True):
      yield FileMetadata(
          _HDFS_PREFIX + self._join(path, name),
          status[_FILE_STATUS_LENGTH])
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError('List operation failed', {url: e})
def test_match_multiple_patterns(self, mock_gcsio):
  """Each pattern in a batched match should yield its own MatchResult."""
  mock_client = mock.MagicMock()
  gcsfilesystem.gcsio.GcsIO = lambda: mock_client
  # One glob response per pattern, consumed in call order.
  mock_client.size_of_files_in_glob.side_effect = [
      {'gs://bucket/file1': 1},
      {'gs://bucket/file2': 2},
  ]
  expected = [
      [FileMetadata('gs://bucket/file1', 1)],
      [FileMetadata('gs://bucket/file2', 2)],
  ]

  file_system = gcsfilesystem.GCSFileSystem()
  result = file_system.match(['gs://bucket/file1*', 'gs://bucket/file2*'])

  self.assertEqual([mr.metadata_list for mr in result], expected)
def test_match_multiples(self, unused_mock_blobstorageio):
  """A prefix match should list with metadata and return every blob."""
  mock_client = mock.MagicMock()
  blobstoragefilesystem.blobstorageio.BlobStorageIO = lambda: mock_client
  mock_client.list_prefix.return_value = {
      'azfs://storageaccount/container/file1': (1, 99999.0),
      'azfs://storageaccount/container/file2': (2, 88888.0),
  }
  expected = {
      FileMetadata('azfs://storageaccount/container/file1', 1, 99999.0),
      FileMetadata('azfs://storageaccount/container/file2', 2, 88888.0),
  }

  match_result = self.fs.match(['azfs://storageaccount/container/'])[0]

  self.assertEqual(set(match_result.metadata_list), expected)
  mock_client.list_prefix.assert_called_once_with(
      'azfs://storageaccount/container/', with_metadata=True)
def _match(path_pattern, limit):
  """Find all matching paths to the pattern provided."""
  # detail=True returns one info dict per entry; truncate before building
  # metadata so at most `limit` entries are processed.
  listing = self._hdfs_client.ls(path_pattern, detail=True)[:limit]
  return MatchResult(
      path_pattern,
      [FileMetadata(info['name'], info['size']) for info in listing])
def test_match_multiples(self, unused_mock_arg):
  """Matching a bucket prefix should return metadata for every key."""
  mock_client = mock.MagicMock()
  s3filesystem.s3io.S3IO = lambda: mock_client
  mock_client.list_prefix.return_value = {
      's3://bucket/file1': 1,
      's3://bucket/file2': 2,
  }
  expected = {
      FileMetadata('s3://bucket/file1', 1),
      FileMetadata('s3://bucket/file2', 2),
  }

  match_result = self.fs.match(['s3://bucket/'])[0]

  self.assertEqual(set(match_result.metadata_list), expected)
  mock_client.list_prefix.assert_called_once_with('s3://bucket/')
def test_match_multiples_limit(self, mock_gcsio):
  """A match with a limit should cap the number of returned entries."""
  mock_client = mock.MagicMock()
  limit = 1
  gcsfilesystem.gcsio.GcsIO = lambda: mock_client
  mock_client.list_prefix.return_value = {'gs://bucket/file1': 1}
  expected = {FileMetadata('gs://bucket/file1', 1)}

  match_result = self.fs.match(['gs://bucket/'], [limit])[0]

  self.assertEqual(set(match_result.metadata_list), expected)
  self.assertEqual(len(match_result.metadata_list), limit)
  mock_client.list_prefix.assert_called_once_with('gs://bucket/')
def test_match_multiples_limit(self, unused_mock_blobstorageio):
  """A match with a limit should cap the number of returned entries."""
  mock_client = mock.MagicMock()
  limit = 1
  blobstoragefilesystem.blobstorageio.BlobStorageIO = lambda: mock_client
  mock_client.list_prefix.return_value = {
      'azfs://storageaccount/container/file1': 1
  }
  expected = {FileMetadata('azfs://storageaccount/container/file1', 1)}

  match_result = self.fs.match(['azfs://storageaccount/container/'],
                               [limit])[0]

  self.assertEqual(set(match_result.metadata_list), expected)
  self.assertEqual(len(match_result.metadata_list), limit)
  mock_client.list_prefix.assert_called_once_with(
      'azfs://storageaccount/container/')
def test_match_multiples_limit(self, unused_mock_arg):
  """A match with a limit should cap the number of returned entries."""
  mock_client = mock.MagicMock()
  limit = 1
  s3filesystem.s3io.S3IO = lambda options: mock_client  # type: ignore[misc]
  mock_client.list_prefix.return_value = {'s3://bucket/file1': 1}
  expected = {FileMetadata('s3://bucket/file1', 1)}

  match_result = self.fs.match(['s3://bucket/'], [limit])[0]

  self.assertEqual(set(match_result.metadata_list), expected)
  self.assertEqual(len(match_result.metadata_list), limit)
  mock_client.list_prefix.assert_called_once_with('s3://bucket/')
def _match(pattern, limit):
  """Find all matching paths to the pattern provided.

  Args:
    pattern: glob-style GCS path pattern; a trailing '/' is treated as a
      request for everything under that prefix.
    limit: maximum number of results, or None for no limit.

  Returns:
    A ``MatchResult`` holding one ``FileMetadata`` per matched object.
  """
  if pattern.endswith('/'):
    pattern += '*'
  file_sizes = gcsio.GcsIO().size_of_files_in_glob(pattern, limit)
  # Bug fix: dict.iteritems() is Python-2-only and raises AttributeError
  # on Python 3; items() yields the same (path, size) pairs.
  metadata_list = [
      FileMetadata(path, size) for path, size in file_sizes.items()
  ]
  return MatchResult(pattern, metadata_list)
def _match(path_pattern, limit):
  """Find all matching paths to the pattern provided."""
  status = self._hdfs_client.status(path_pattern, strict=False)
  if status and status[_FILE_STATUS_TYPE] == _FILE_STATUS_TYPE_FILE:
    # The pattern names a single file; wrap it like a listing entry.
    entries = [(status[_FILE_STATUS_PATH_SUFFIX], status)][:limit]
  else:
    entries = self._hdfs_client.list(path_pattern, status=True)[:limit]
  metadata_list = [
      FileMetadata(info[_FILE_STATUS_NAME], info[_FILE_STATUS_SIZE])
      for _, info in entries
  ]
  return MatchResult(path_pattern, metadata_list)
def test_match_multiple_patterns(self, unused_mock_blobstorageio):
  """Each pattern in a batched match should yield its own MatchResult."""
  mock_client = mock.MagicMock()
  blobstoragefilesystem.blobstorageio.BlobStorageIO = lambda: mock_client
  # One listing response per pattern, consumed in call order.
  mock_client.list_prefix.side_effect = [
      {'azfs://storageaccount/container/file1': (1, 99999.0)},
      {'azfs://storageaccount/container/file2': (2, 88888.0)},
  ]
  expected = [
      [FileMetadata('azfs://storageaccount/container/file1', 1, 99999.0)],
      [FileMetadata('azfs://storageaccount/container/file2', 2, 88888.0)],
  ]

  result = self.fs.match([
      'azfs://storageaccount/container/file1*',
      'azfs://storageaccount/container/file2*'
  ])

  self.assertEqual([mr.metadata_list for mr in result], expected)
def _match(path_pattern, limit):
  """Find all matching paths to the pattern provided."""
  status = self._hdfs_client.status(path_pattern, strict=False)
  if status and status[_FILE_STATUS_TYPE] == _FILE_STATUS_TYPE_FILE:
    # The pattern names a single file directly.
    file_statuses = [(path_pattern, status)][:limit]
  else:
    # Listing entries are relative names; join them onto the pattern path.
    file_statuses = [
        (self._join(path_pattern, name), info) for name, info in
        self._hdfs_client.list(path_pattern, status=True)[:limit]
    ]
  metadata_list = [
      FileMetadata(_HDFS_PREFIX + path, info[_FILE_STATUS_LENGTH])
      for path, info in file_statuses
  ]
  return MatchResult(path_pattern, metadata_list)
def match(self, patterns, limits=None):
  """Return one MatchResult per pattern from the in-memory test context.

  NOTE(review): the pattern is recorded in the result but never applied as
  a filter -- every known file (up to the corresponding limit) is returned
  for each pattern. Confirm this is the intended test-double behavior.
  """
  test_context = get_current_test_context()
  file_content_map = test_context.file_content_map
  all_files = list(file_content_map.keys())
  if limits is None:
    limits = [None] * len(patterns)
  return [
      MatchResult(
          pattern,
          [
              FileMetadata(path, len(file_content_map[path]))
              for path in all_files[:limit]
          ]) for pattern, limit in zip(patterns, limits)
  ]
def _list(self, dir_or_prefix):
  """List files in a location.

  Listing is non-recursive (for filesystems that support directories).

  Args:
    dir_or_prefix: (string) A directory or location prefix (for filesystems
      that don't have directories).

  Returns:
    Generator of ``FileMetadata`` objects.

  Raises:
    ``BeamIOError``: if listing fails, but not if no files were found.
  """
  try:
    listing = blobstorageio.BlobStorageIO().list_prefix(
        dir_or_prefix, with_metadata=True)
    for path, (size, updated) in listing.items():
      yield FileMetadata(path, size, updated)
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError("List operation failed", {dir_or_prefix: e})
def metadata(self, path):
  """Fetch metadata fields of a file on the FileSystem.

  Args:
    path: string path of a file.

  Returns:
    :class:`~apache_beam.io.filesystem.FileMetadata`.

  Raises:
    ``BeamIOError``: if path isn't a file or doesn't exist.
  """
  try:
    status = s3io.S3IO(options=self._options)._status(path)
    return FileMetadata(path, status['size'], status['last_updated'])
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError("Metadata operation failed", {path: e})
def metadata(self, url):
  """Fetch metadata fields of a file on the FileSystem.

  Args:
    url: string url of a file.

  Returns:
    :class:`~apache_beam.io.filesystem.FileMetadata`.

  Raises:
    ``BeamIOError``: if url doesn't exist.
  """
  _, path = self._parse_url(url)
  status = self._hdfs_client.status(path, strict=False)
  if status is None:
    raise BeamIOError('File not found: %s' % url)
  # The status timestamp is divided by 1000, converting milliseconds
  # to seconds.
  return FileMetadata(
      url,
      status[_FILE_STATUS_LENGTH],
      status[_FILE_STATUS_UPDATED] / 1000.0)
def test_match_single(self, unused_mock_blobstorageio):
  """Matching an exact blob path should return a single FileMetadata."""
  mock_client = mock.MagicMock()
  blobstoragefilesystem.blobstorageio.BlobStorageIO = lambda: mock_client
  mock_client.exists.return_value = True
  mock_client._status.return_value = {'size': 1, 'last_updated': 99999.0}
  expected = [
      FileMetadata('azfs://storageaccount/container/file1', 1, 99999.0)
  ]

  match_result = self.fs.match(['azfs://storageaccount/container/file1'])[0]

  self.assertEqual(match_result.metadata_list, expected)
  mock_client._status.assert_called_once_with(
      'azfs://storageaccount/container/file1')
def _list(self, dir_or_prefix):
  """List files in a location.

  Listing is non-recursive, for filesystems that support directories.

  Args:
    dir_or_prefix: (string) A directory or location prefix (for filesystems
      that don't have directories).

  Returns:
    Generator of ``FileMetadata`` objects.

  Raises:
    ``BeamIOError``: if listing fails, but not if no files were found.
  """
  try:
    # Use dict.items() directly instead of the Python 2 compatibility
    # helper iteritems(); behavior is identical on Python 3.
    for path, size in s3io.S3IO().list_prefix(dir_or_prefix).items():
      yield FileMetadata(path, size)
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError("List operation failed", {dir_or_prefix: e})
def _list(self, dir_or_prefix):
  """List files in a location.

  Listing is non-recursive, for filesystems that support directories.

  Args:
    dir_or_prefix: (string) A directory or location prefix (for filesystems
      that don't have directories).

  Returns:
    Generator of ``FileMetadata`` objects.

  Raises:
    ``BeamIOError`` if listing fails, but not if no files were found.
  """
  # A missing directory yields nothing rather than raising.
  if not self.exists(dir_or_prefix):
    return
  try:
    for entry in os.listdir(dir_or_prefix):
      full_path = self.join(dir_or_prefix, entry)
      yield FileMetadata(full_path, os.path.getsize(full_path))
  except Exception as e:  # pylint: disable=broad-except
    raise BeamIOError("List operation failed", {dir_or_prefix: e})
def _list(self, dir_or_prefix):
  """Yield FileMetadata for each known file whose path starts with the prefix."""
  for path, size in self._files.items():
    if not path.startswith(dir_or_prefix):
      continue
    yield FileMetadata(path, size)