예제 #1
0
 def test_match_multiples(self, mock_gcsio):
     """Matching a bucket prefix should list every object under it."""
     gcsio_mock = mock.MagicMock()
     gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
     gcsio_mock.size_of_files_in_glob.return_value = {
         'gs://bucket/file1': 1,
         'gs://bucket/file2': 2
     }
     file_system = gcsfilesystem.GCSFileSystem()
     match_result = file_system.match(['gs://bucket/'])[0]
     # Listing order is unspecified, so compare as sets.
     self.assertEqual(
         set(match_result.metadata_list),
         {FileMetadata('gs://bucket/file1', 1),
          FileMetadata('gs://bucket/file2', 2)})
     # The trailing '/' is expanded to a '*' glob with no limit.
     gcsio_mock.size_of_files_in_glob.assert_called_once_with(
         'gs://bucket/*', None)
예제 #2
0
   def _match(pattern, limit):
       """Resolve ``pattern`` against the local filesystem.

       Returns a ``MatchResult`` containing at most ``limit`` entries,
       each carrying the on-disk size of the matched file.
       """
       matched_paths = glob.glob(pattern)[:limit]
       return MatchResult(
           pattern,
           [FileMetadata(p, os.path.getsize(p)) for p in matched_paths])
예제 #3
0
 def _list(self, url):
     """Yield a ``FileMetadata`` for each entry directly under ``url``.

     Raises:
       ``BeamIOError``: if the underlying HDFS listing fails.
     """
     try:
         base_path = self._parse_url(url)
         for name, status in self._hdfs_client.list(base_path, status=True):
             full_path = _HDFS_PREFIX + self._join(base_path, name)
             yield FileMetadata(full_path, status[_FILE_STATUS_LENGTH])
     except Exception as e:  # pylint: disable=broad-except
         raise BeamIOError('List operation failed', {url: e})
예제 #4
0
 def test_match_multiple_patterns(self, mock_gcsio):
   """Each pattern should produce its own MatchResult, in request order."""
   gcsio_mock = mock.MagicMock()
   gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
   # One size_of_files_in_glob reply per pattern, consumed in order.
   gcsio_mock.size_of_files_in_glob.side_effect = [
       {'gs://bucket/file1': 1},
       {'gs://bucket/file2': 2},
   ]
   file_system = gcsfilesystem.GCSFileSystem()
   result = file_system.match(['gs://bucket/file1*', 'gs://bucket/file2*'])
   self.assertEqual(
       [mr.metadata_list for mr in result],
       [[FileMetadata('gs://bucket/file1', 1)],
        [FileMetadata('gs://bucket/file2', 2)]])
예제 #5
0
    def test_match_multiples(self, unused_mock_blobstorageio):
        """Matching a container prefix lists every blob below it."""
        blobstorageio_mock = mock.MagicMock()
        blobstoragefilesystem.blobstorageio.BlobStorageIO = \
            lambda: blobstorageio_mock
        # list_prefix maps path -> (size, last_updated).
        blobstorageio_mock.list_prefix.return_value = {
            'azfs://storageaccount/container/file1': (1, 99999.0),
            'azfs://storageaccount/container/file2': (2, 88888.0)
        }
        match_result = self.fs.match(['azfs://storageaccount/container/'])[0]

        # Listing order is unspecified, so compare as sets.
        self.assertEqual(
            set(match_result.metadata_list),
            {FileMetadata(
                'azfs://storageaccount/container/file1', 1, 99999.0),
             FileMetadata(
                 'azfs://storageaccount/container/file2', 2, 88888.0)})
        blobstorageio_mock.list_prefix.assert_called_once_with(
            'azfs://storageaccount/container/', with_metadata=True)
예제 #6
0
 def _match(path_pattern, limit):
     """Find all matching paths to the pattern provided."""
     # ls(detail=True) returns dicts exposing at least 'name' and 'size'.
     listing = self._hdfs_client.ls(path_pattern, detail=True)
     return MatchResult(
         path_pattern,
         [FileMetadata(info['name'], info['size'])
          for info in listing[:limit]])
예제 #7
0
  def test_match_multiples(self, unused_mock_arg):
    """A bucket prefix match returns metadata for every key under it."""
    s3io_mock = mock.MagicMock()
    s3filesystem.s3io.S3IO = lambda: s3io_mock
    s3io_mock.list_prefix.return_value = {
        's3://bucket/file1': 1,
        's3://bucket/file2': 2
    }
    match_result = self.fs.match(['s3://bucket/'])[0]

    # Listing order is unspecified, so compare as sets.
    self.assertEqual(
        set(match_result.metadata_list),
        {FileMetadata('s3://bucket/file1', 1),
         FileMetadata('s3://bucket/file2', 2)})
    s3io_mock.list_prefix.assert_called_once_with('s3://bucket/')
예제 #8
0
 def test_match_multiples_limit(self, mock_gcsio):
     """A limit of 1 caps the number of results returned by match()."""
     gcsio_mock = mock.MagicMock()
     limit = 1
     gcsfilesystem.gcsio.GcsIO = lambda: gcsio_mock
     gcsio_mock.list_prefix.return_value = {'gs://bucket/file1': 1}
     match_result = self.fs.match(['gs://bucket/'], [limit])[0]
     expected = {FileMetadata('gs://bucket/file1', 1)}
     self.assertEqual(set(match_result.metadata_list), expected)
     self.assertEqual(len(match_result.metadata_list), limit)
     gcsio_mock.list_prefix.assert_called_once_with('gs://bucket/')
예제 #9
0
 def test_match_multiples_limit(self, unused_mock_blobstorageio):
   """A limit of 1 caps the number of results returned by match()."""
   blobstorageio_mock = mock.MagicMock()
   limit = 1
   blobstoragefilesystem.blobstorageio.BlobStorageIO = \
       lambda: blobstorageio_mock
   blobstorageio_mock.list_prefix.return_value = {
       'azfs://storageaccount/container/file1': 1
   }
   match_result = self.fs.match(
       ['azfs://storageaccount/container/'], [limit])[0]
   expected = {FileMetadata('azfs://storageaccount/container/file1', 1)}
   self.assertEqual(set(match_result.metadata_list), expected)
   self.assertEqual(len(match_result.metadata_list), limit)
   blobstorageio_mock.list_prefix.assert_called_once_with(
       'azfs://storageaccount/container/')
예제 #10
0
 def test_match_multiples_limit(self, unused_mock_arg):
     """A limit of 1 caps the number of results returned by match()."""
     s3io_mock = mock.MagicMock()
     limit = 1
     s3filesystem.s3io.S3IO = lambda options: s3io_mock  # type: ignore[misc]
     s3io_mock.list_prefix.return_value = {'s3://bucket/file1': 1}
     match_result = self.fs.match(['s3://bucket/'], [limit])[0]
     expected = {FileMetadata('s3://bucket/file1', 1)}
     self.assertEqual(set(match_result.metadata_list), expected)
     self.assertEqual(len(match_result.metadata_list), limit)
     s3io_mock.list_prefix.assert_called_once_with('s3://bucket/')
예제 #11
0
   def _match(pattern, limit):
       """Find all matching paths to the pattern provided.

       A pattern ending in '/' is treated as a directory prefix and
       expanded to match everything under it.

       Returns:
         ``MatchResult`` with at most ``limit`` ``FileMetadata`` entries.
       """
       if pattern.endswith('/'):
           pattern += '*'
       file_sizes = gcsio.GcsIO().size_of_files_in_glob(pattern, limit)
       metadata_list = [
           FileMetadata(path, size)
           # dict.iteritems() was removed in Python 3; items() is correct
           # on both Python 2 and 3 and matches the sibling filesystems.
           for path, size in file_sizes.items()
       ]
       return MatchResult(pattern, metadata_list)
예제 #12
0
 def _match(path_pattern, limit):
   """Find all matching paths to the pattern provided.

   Returns a ``MatchResult`` with at most ``limit`` ``FileMetadata``
   entries for the file(s) named by ``path_pattern``.
   """
   # strict=False makes status() return None for a missing path instead
   # of raising.
   fs = self._hdfs_client.status(path_pattern, strict=False)
   if fs and fs[_FILE_STATUS_TYPE] == _FILE_STATUS_TYPE_FILE:
     # The pattern names a single file: wrap its status in the same
     # (path_suffix, status) pair shape that list() produces below.
     file_statuses = [(fs[_FILE_STATUS_PATH_SUFFIX], fs)][:limit]
   else:
     # Otherwise treat the pattern as a directory and list its children.
     file_statuses = self._hdfs_client.list(path_pattern,
                                            status=True)[:limit]
   metadata_list = [FileMetadata(file_status[1][_FILE_STATUS_NAME],
                                 file_status[1][_FILE_STATUS_SIZE])
                    for file_status in file_statuses]
   return MatchResult(path_pattern, metadata_list)
예제 #13
0
 def test_match_multiple_patterns(self, unused_mock_blobstorageio):
     """Each pattern yields its own MatchResult, in request order."""
     blobstorageio_mock = mock.MagicMock()
     blobstoragefilesystem.blobstorageio.BlobStorageIO = \
         lambda: blobstorageio_mock
     # One list_prefix reply per pattern, consumed in order.
     blobstorageio_mock.list_prefix.side_effect = [
         {'azfs://storageaccount/container/file1': (1, 99999.0)},
         {'azfs://storageaccount/container/file2': (2, 88888.0)},
     ]
     result = self.fs.match([
         'azfs://storageaccount/container/file1*',
         'azfs://storageaccount/container/file2*'
     ])
     self.assertEqual(
         [mr.metadata_list for mr in result],
         [[FileMetadata(
             'azfs://storageaccount/container/file1', 1, 99999.0)],
          [FileMetadata(
              'azfs://storageaccount/container/file2', 2, 88888.0)]])
예제 #14
0
 def _match(path_pattern, limit):
   """Find all matching paths to the pattern provided.

   Returns a ``MatchResult`` holding at most ``limit`` ``FileMetadata``
   entries, each path prefixed with ``_HDFS_PREFIX``.
   """
   # strict=False makes status() return None for a missing path instead
   # of raising.
   fs = self._hdfs_client.status(path_pattern, strict=False)
   if fs and fs[_FILE_STATUS_TYPE] == _FILE_STATUS_TYPE_FILE:
     # The pattern names a single file; reuse its status directly.
     file_statuses = [(path_pattern, fs)][:limit]
   else:
     # Pattern is a directory (or missing): list children and join each
     # child name back onto the pattern to rebuild the full path.
     file_statuses = [(self._join(path_pattern, fs[0]), fs[1])
                      for fs in self._hdfs_client.list(path_pattern,
                                                       status=True)[:limit]]
   metadata_list = [
       FileMetadata(_HDFS_PREFIX + file_status[0],
                    file_status[1][_FILE_STATUS_LENGTH])
       for file_status in file_statuses]
   return MatchResult(path_pattern, metadata_list)
예제 #15
0
 def match(self, patterns, limits=None):
     """Fake ``match`` backed by the current test context's file map.

     NOTE(review): ``pattern`` is never used to filter ``all_files`` —
     every pattern receives the same (limit-truncated) listing of all
     known files. Presumably adequate for the tests using this fake;
     confirm before reusing it anywhere real pattern matching matters.

     Args:
       patterns: list of pattern strings (echoed back in the results).
       limits: optional per-pattern result caps; ``None`` means no limit.

     Returns:
       One ``MatchResult`` per pattern, in order.
     """
     test_context = get_current_test_context()
     file_content_map = test_context.file_content_map
     all_files = list(file_content_map.keys())
     if limits is None:
         # One "no limit" entry per pattern keeps zip() below aligned.
         limits = [None] * len(patterns)
     results = []
     for pattern, limit in zip(patterns, limits):
         files = all_files[:limit]
         metadata = [
             # File size is the length of the stored content.
             FileMetadata(f, len(file_content_map[f]))
             for f in files
         ]
         results.append(MatchResult(pattern, metadata))
     return results
예제 #16
0
 def _list(self, dir_or_prefix):
     """List files in a location.

     Listing is non-recursive (for filesystems that support directories).

     Args:
       dir_or_prefix: (string) A directory or location prefix (for
         filesystems that don't have directories).

     Returns:
       Generator of ``FileMetadata`` objects.

     Raises:
       ``BeamIOError``: if listing fails, but not if no files were found.
     """
     try:
         listing = blobstorageio.BlobStorageIO().list_prefix(
             dir_or_prefix, with_metadata=True)
         # Each value is a (size, last_updated) pair.
         for path, (size, updated) in listing.items():
             yield FileMetadata(path, size, updated)
     except Exception as e:  # pylint: disable=broad-except
         raise BeamIOError("List operation failed", {dir_or_prefix: e})
예제 #17
0
    def metadata(self, path):
        """Fetch metadata fields of a file on the FileSystem.

        Args:
          path: string path of a file.

        Returns:
          :class:`~apache_beam.io.filesystem.FileMetadata`.

        Raises:
          ``BeamIOError``: if path isn't a file or doesn't exist.
        """
        try:
            status = s3io.S3IO(options=self._options)._status(path)
            return FileMetadata(
                path, status['size'], status['last_updated'])
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("Metadata operation failed", {path: e})
예제 #18
0
  def metadata(self, url):
    """Fetch metadata fields of a file on the FileSystem.

    Args:
      url: string url of a file.

    Returns:
      :class:`~apache_beam.io.filesystem.FileMetadata`.

    Raises:
      ``BeamIOError``: if url doesn't exist.
    """
    _, path = self._parse_url(url)
    # strict=False: a missing path yields None rather than an exception.
    status = self._hdfs_client.status(path, strict=False)
    if status is None:
      raise BeamIOError('File not found: %s' % url)
    # The division by 1000 converts the reported timestamp to seconds —
    # presumably it arrives in milliseconds; confirm against the client.
    last_updated = status[_FILE_STATUS_UPDATED] / 1000.0
    return FileMetadata(url, status[_FILE_STATUS_LENGTH], last_updated)
예제 #19
0
    def test_match_single(self, unused_mock_blobstorageio):
        """Matching one exact blob path returns a single FileMetadata."""
        blobstorageio_mock = mock.MagicMock()
        blobstoragefilesystem.blobstorageio.BlobStorageIO = \
            lambda: blobstorageio_mock
        blobstorageio_mock.exists.return_value = True
        blobstorageio_mock._status.return_value = {
            'size': 1,
            'last_updated': 99999.0
        }
        match_result = self.fs.match(
            ['azfs://storageaccount/container/file1'])[0]

        self.assertEqual(
            match_result.metadata_list,
            [FileMetadata(
                'azfs://storageaccount/container/file1', 1, 99999.0)])
        blobstorageio_mock._status.assert_called_once_with(
            'azfs://storageaccount/container/file1')
예제 #20
0
  def _list(self, dir_or_prefix):
    """List files in a location.

    Listing is non-recursive, for filesystems that support directories.

    Args:
      dir_or_prefix: (string) A directory or location prefix (for filesystems
        that don't have directories).

    Returns:
      Generator of ``FileMetadata`` objects.

    Raises:
      ``BeamIOError``: if listing fails, but not if no files were found.
    """
    try:
      # dict.items() replaces the Python-2-era iteritems() compatibility
      # helper, matching the other filesystem implementations here.
      for path, size in s3io.S3IO().list_prefix(dir_or_prefix).items():
        yield FileMetadata(path, size)
    except Exception as e:  # pylint: disable=broad-except
      raise BeamIOError("List operation failed", {dir_or_prefix: e})
예제 #21
0
    def _list(self, dir_or_prefix):
        """List files in a location.

        Listing is non-recursive, for filesystems that support directories.

        Args:
          dir_or_prefix: (string) A directory or location prefix (for
            filesystems that don't have directories).

        Returns:
          Generator of ``FileMetadata`` objects.

        Raises:
          ``BeamIOError`` if listing fails, but not if no files were found.
        """
        # A missing directory counts as "no files found", not an error.
        if not self.exists(dir_or_prefix):
            return

        try:
            for entry in os.listdir(dir_or_prefix):
                full_path = self.join(dir_or_prefix, entry)
                yield FileMetadata(full_path, os.path.getsize(full_path))
        except Exception as e:  # pylint: disable=broad-except
            raise BeamIOError("List operation failed", {dir_or_prefix: e})
예제 #22
0
 def _list(self, dir_or_prefix):
     """Yield FileMetadata for every known file under ``dir_or_prefix``."""
     matching = (p for p in self._files if p.startswith(dir_or_prefix))
     for file_path in matching:
         yield FileMetadata(file_path, self._files[file_path])