Example #1
def _ensure_fs_and_paths(path_or_paths, filesystem=None):
    # Validate and convert the path-likes and filesystem.
    # Returns filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileSystem, FileType, FileSelector

    if isinstance(path_or_paths, list):
        paths_or_selector = [_stringify_path(path) for path in path_or_paths]
        if filesystem is None:
            # infer from first path
            filesystem, _ = FileSystem.from_uri(paths_or_selector[0])
    else:
        path = _stringify_path(path_or_paths)
        if filesystem is None:
            filesystem, path = FileSystem.from_uri(path)

        stats = filesystem.get_target_stats([path])[0]
        if stats.type == FileType.Directory:
            # for directory, pass a selector
            paths_or_selector = FileSelector(path, recursive=True)
        elif stats.type == FileType.File:
            # for a single file path, pass it as a list
            paths_or_selector = [path]
        else:
            raise FileNotFoundError(path)

    return filesystem, paths_or_selector
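The helper above is internal to an older pyarrow.dataset version. A minimal sketch of the same file-vs-directory dispatch against the current public pyarrow.fs API (describe_source is an illustrative name, not part of pyarrow):

from pyarrow import fs

def describe_source(uri):
    # Infer the filesystem and the filesystem-internal path from a URI
    # (or an absolute local path).
    filesystem, path = fs.FileSystem.from_uri(uri)
    info = filesystem.get_file_info([path])[0]
    if info.type == fs.FileType.Directory:
        # A directory is handed off as a recursive selector.
        return filesystem, fs.FileSelector(path, recursive=True)
    elif info.type == fs.FileType.File:
        # A single file is wrapped in a one-element list.
        return filesystem, [path]
    raise FileNotFoundError(uri)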
Example #2
File: fs.py Project: tallamjr/arrow
def _resolve_filesystem_and_path(
    path, filesystem=None, allow_legacy_filesystem=False
):
    """
    Return filesystem/path from path which could be an URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'."
            )
        return filesystem, path

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem
        )
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(path)
        elif not isinstance(path, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        if not allow_legacy_filesystem:
            path = filesystem.normalize_path(path)
        return filesystem, path

    path = _stringify_path(path)

    # if filesystem is not given, try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    filesystem = LocalFileSystem()
    try:
        file_info = filesystem.get_file_info(path)
    except ValueError:  # ValueError means path is likely an URI
        file_info = None
        exists_locally = False
    else:
        exists_locally = (file_info.type != FileType.NotFound)

    # if the file or directory doesn't exist locally, then assume that
    # the path is an URI describing the file system as well
    if not exists_locally:
        try:
            filesystem, path = FileSystem.from_uri(path)
        except ValueError as e:
            # neither an URI nor a locally existing path, so assume that
            # local path was given and propagate a nicer file not found error
            # instead of a more confusing scheme parsing error
            if "empty scheme" not in str(e):
                raise
    else:
        path = filesystem.normalize_path(path)

    return filesystem, path
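A hedged sketch of the same inference order (local existence check first, then URI parsing) written only against the public pyarrow.fs API; infer_filesystem is an illustrative name and the nicer FileNotFoundError handling above is omitted:

from pyarrow import fs

def infer_filesystem(path):
    filesystem = fs.LocalFileSystem()
    try:
        # As noted above, get_file_info() raises ValueError when the string
        # looks like a URI rather than a local path.
        info = filesystem.get_file_info(path)
        exists_locally = info.type != fs.FileType.NotFound
    except ValueError:
        exists_locally = False
    if not exists_locally:
        # Not an existing local path, so let the URI decide the filesystem,
        # e.g. "s3://bucket/key.parquet" or "file:///tmp/key.parquet".
        filesystem, path = fs.FileSystem.from_uri(path)
    return filesystem, path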
Example #3
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    if filesystem is not None:
        filesystem = _ensure_filesystem(filesystem)
        if isinstance(filesystem, LocalFileSystem):
            path = _stringify_path(where)
        elif not isinstance(where, str):
            raise TypeError(
                "Expected string path; path-like objects are only allowed "
                "with a local filesystem"
            )
        else:
            path = where
        return filesystem, path

    path = _stringify_path(where)

    parsed_uri = urllib.parse.urlparse(path)
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs._connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem._get_instance()
        fs_path = path

    return fs, fs_path
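The host/port handling above is easy to verify in isolation with the standard library; the URI below is a placeholder and the scheme-prefixing of the host is omitted:

from urllib.parse import urlparse

parsed = urlparse("hdfs://namenode:8020/data/myfile.parquet")
netloc_split = parsed.netloc.split(':')        # ['namenode', '8020']
host = netloc_split[0] or 'default'            # an empty netloc falls back to 'default'
port = 0
if len(netloc_split) == 2 and netloc_split[1].isnumeric():
    port = int(netloc_split[1])
print(host, port, parsed.path)                 # namenode 8020 /data/myfile.parquet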
Example #4
    def __init__(self, dirpath, open_file_func=None, filesystem=None,
                 pathsep='/', partition_scheme='hive', metadata_nthreads=1):
        filesystem, dirpath = _get_filesystem_and_path(filesystem, dirpath)
        self.filesystem = filesystem
        self.open_file_func = open_file_func
        self.pathsep = pathsep
        self.dirpath = _stringify_path(dirpath)
        self.partition_scheme = partition_scheme
        self.partitions = ParquetPartitions()
        self.pieces = []
        self._metadata_nthreads = metadata_nthreads
        self._thread_pool = futures.ThreadPoolExecutor(
            max_workers=metadata_nthreads)

        self.common_metadata_path = None
        self.metadata_path = None

        self._visit_level(0, self.dirpath, [])

        # Due to concurrency, pieces will potentially be out of order if the
        # dataset is partitioned, so we sort them to yield stable results
        self.pieces.sort(key=lambda piece: piece.path)

        if self.common_metadata_path is None:
            # _common_metadata is a subset of _metadata
            self.common_metadata_path = self.metadata_path

        self._thread_pool.shutdown()
Example #5
def resolve_filesystem_and_path(where, filesystem=None):
    """
    return filesystem from path which could be an HDFS URI
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs, parsed_uri.path
Example #6
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None,
                    partitioning=None, partition_base_dir=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path
        Path pointing to a single-file Parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.
    partitioning : Partitioning, PartitioningFactory, str, list of str
        The partitioning scheme specified with the ``partitioning()``
        function. A flavor string can be used as a shortcut, and with a list
        of field names a DictionaryPartitioning will be inferred.
    partition_base_dir : str, optional
        For the purposes of applying the partitioning, paths will be
        stripped of the partition_base_dir. Files not matching the
        partition_base_dir prefix will be skipped for partitioning discovery.
        The ignored files will still be part of the Dataset, but will not
        have partition information.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem, _ensure_filesystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem = _ensure_filesystem(filesystem)

    metadata_path = filesystem.normalize_path(_stringify_path(metadata_path))
    options = ParquetFactoryOptions(
        partition_base_dir=partition_base_dir,
        partitioning=_ensure_partitioning(partitioning)
    )

    factory = ParquetDatasetFactory(
        metadata_path, filesystem, format, options=options)
    return factory.finish(schema)
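For context, a hedged end-to-end sketch of the public API that this example backs; the paths are placeholders and a recent pyarrow with the dataset module is assumed:

import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds

table = pa.table({"year": [2019, 2020], "n": [1, 2]})
metadata_collector = []
pq.write_to_dataset(table, "/tmp/data", metadata_collector=metadata_collector)
# write_metadata() produces the `_metadata` sidecar that parquet_dataset() reads.
pq.write_metadata(table.schema, "/tmp/data/_metadata",
                  metadata_collector=metadata_collector)
dataset = ds.parquet_dataset("/tmp/data/_metadata")
print(dataset.to_table().num_rows)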
Example #7
    def disk_usage(self, path):
        """
        Compute bytes used by all contents under indicated path in file tree

        Parameters
        ----------
        path : string
            Can be a file path or directory

        Returns
        -------
        usage : int
        """
        path = _stringify_path(path)
        path_info = self.stat(path)
        if path_info['kind'] == 'file':
            return path_info['size']

        total = 0
        for root, directories, files in self.walk(path):
            for child_path in files:
                abspath = self._path_join(root, child_path)
                total += self.stat(abspath)['size']

        return total
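The same accounting can be sketched for a plain local directory with the standard library alone (local_disk_usage is an illustrative name):

import os

def local_disk_usage(path):
    # Mirrors the logic above: a file reports its own size, a directory
    # reports the sum of the sizes of every file beneath it.
    if os.path.isfile(path):
        return os.path.getsize(path)
    total = 0
    for root, _dirs, files in os.walk(path):
        for name in files:
            total += os.path.getsize(os.path.join(root, name))
    return total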
Example #8
    def walk(self, path):
        """
        Directory tree generator, like os.walk

        Generator version of what is in gcsfs, which yields a flattened list of
        files
        """
        from gcsfs.core import norm_path
        path = norm_path(_stringify_path(path))
        directories = set()
        files = set()

        for obj in self.fs.ls(path, detail=True):
            # each info name must be at least [path]/part, but here
            # we also check for names like [path]/part/
            obj_path = obj['name']
            if obj_path.strip('/') == path:
                continue
            if obj['type'] == 'directory':
                directories.add(obj_path)
            elif obj['type'] == 'file':
                files.add(obj_path)

        rel_files = sorted(
            [posixpath.split(f)[1] for f in files if f not in directories])
        rel_directories = sorted(
            [posixpath.split(x[:-1])[1] for x in directories])

        yield path, rel_directories, rel_files

        for directory in directories:
            for tup in self.walk(directory):
                yield tup
Example #9
    def walk(self, path, refresh=False):
        """
        Directory tree generator, like os.walk

        Generator version of what is in s3fs, which yields a flattened list of
        files
        """
        path = _sanitize_s3(_stringify_path(path))
        directories = set()
        files = set()

        for key in list(self.fs._ls(path, refresh=refresh)):
            path = key['Key']
            if key['StorageClass'] == 'DIRECTORY':
                directories.add(path)
            elif key['StorageClass'] == 'BUCKET':
                pass
            else:
                files.add(path)

        # s3fs creates duplicate 'DIRECTORY' entries
        files = sorted([posixpath.split(f)[1] for f in files
                        if f not in directories])
        directories = sorted([posixpath.split(x)[1]
                              for x in directories])

        yield path, directories, files

        for directory in directories:
            for tup in self.walk(directory, refresh=refresh):
                yield tup
Example #10
def write_table(table,
                where,
                row_group_size=None,
                version='1.0',
                use_dictionary=True,
                compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None,
                **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where,
                table.schema,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
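A hedged usage sketch of the wrapper above; on failure it removes the partially written file when `where` is a path rather than an open file object (the output path is a placeholder):

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})
# 'chunk_size' is accepted as a legacy alias for row_group_size (popped above).
pq.write_table(table, "/tmp/example.parquet",
               compression="snappy", row_group_size=2)
print(pq.read_table("/tmp/example.parquet").num_rows)   # 3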
Example #11
    def __init__(self,
                 dirpath,
                 filesystem=None,
                 pathsep='/',
                 partition_scheme='hive',
                 metadata_nthreads=1):
        self.filesystem = filesystem or _get_fs_from_path(dirpath)
        self.pathsep = pathsep
        self.dirpath = _stringify_path(dirpath)
        self.partition_scheme = partition_scheme
        self.partitions = ParquetPartitions()
        self.pieces = []
        self._metadata_nthreads = metadata_nthreads
        self._thread_pool = futures.ThreadPoolExecutor(
            max_workers=metadata_nthreads)

        self.common_metadata_path = None
        self.metadata_path = None

        self._visit_level(0, self.dirpath, [])

        # Due to concurrency, pieces will potentially be out of order if the
        # dataset is partitioned, so we sort them to yield stable results
        self.pieces.sort(key=lambda piece: piece.path)

        if self.common_metadata_path is None:
            # _common_metadata is a subset of _metadata
            self.common_metadata_path = self.metadata_path

        self._thread_pool.shutdown()
Example #12
 def isfile(self, path):
     path = _sanitize_s3(_stringify_path(path))
     try:
         contents = self.fs.ls(path)
         return len(contents) == 1 and contents[0] == path
     except OSError:
         return False
Example #14
    def walk(self, path, refresh=False):
        """
        Directory tree generator, like os.walk

        Generator version of what is in s3fs, which yields a flattened list of
        files
        """
        path = _sanitize_s3(_stringify_path(path))
        directories = set()
        files = set()

        for key in list(self.fs._ls(path, refresh=refresh)):
            path = key['Key']
            if key['StorageClass'] == 'DIRECTORY':
                directories.add(path)
            elif key['StorageClass'] == 'BUCKET':
                pass
            else:
                files.add(path)

        # s3fs creates duplicate 'DIRECTORY' entries
        files = sorted(
            [posixpath.split(f)[1] for f in files if f not in directories])
        directories = sorted([posixpath.split(x)[1] for x in directories])

        yield path, directories, files

        for directory in directories:
            for tup in self.walk(directory, refresh=refresh):
                yield tup
Example #16
def write_table(table, where, row_group_size=None, version='1.0',
                use_dictionary=True, compression='snappy',
                use_deprecated_int96_timestamps=None,
                coerce_timestamps=None,
                allow_truncated_timestamps=False,
                flavor=None, filesystem=None, **kwargs):
    row_group_size = kwargs.pop('chunk_size', row_group_size)
    use_int96 = use_deprecated_int96_timestamps
    try:
        with ParquetWriter(
                where, table.schema,
                filesystem=filesystem,
                version=version,
                flavor=flavor,
                use_dictionary=use_dictionary,
                coerce_timestamps=coerce_timestamps,
                allow_truncated_timestamps=allow_truncated_timestamps,
                compression=compression,
                use_deprecated_int96_timestamps=use_int96,
                **kwargs) as writer:
            writer.write_table(table, row_group_size=row_group_size)
    except Exception:
        if _is_path_like(where):
            try:
                os.remove(_stringify_path(where))
            except os.error:
                pass
        raise
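The extra `filesystem` argument in this variant lets the same call target a non-local filesystem; a hedged sketch, assuming a pyarrow build with S3 support (bucket name and region are placeholders):

import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import fs

table = pa.table({"id": [1, 2, 3]})
s3 = fs.S3FileSystem(region="us-east-1")
# The path is resolved against the given filesystem, not the local disk.
pq.write_table(table, "my-bucket/data/example.parquet", filesystem=s3)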
Example #17
    def walk(self, path):
        """
        Directory tree generator, like os.walk

        Generator version of what is in gcsfs, which yields a flattened list of
        files
        """
        from gcsfs.core import norm_path
        path = norm_path(_stringify_path(path))
        directories = set()
        files = set()

        for key in self.fs.ls(path, detail=True):
            # each info name must be at least [path]/part, but here
            # we also check for names like [path]/part/
            path = key['name']
            if key['storageClass'] == 'DIRECTORY':
                if path.endswith('/'):
                    directories.add(path[:-1])
                else:
                    directories.add(path)
            elif key['storageClass'] == 'BUCKET':
                pass
            else:
                files.add(path)

        files = sorted(
            [posixpath.split(f)[1] for f in files if f not in directories])
        directories = sorted([posixpath.split(x)[1] for x in directories])

        yield path, directories, files

        for directory in directories:
            for tup in self.walk(directory):
                yield tup
Example #18
 def isfile(self, path):
     from gcsfs.core import norm_path
     path = norm_path(_stringify_path(path))
     try:
         contents = self.fs.ls(path)
         return len(contents) == 1 and contents[0] == path
     except OSError:
         return False
Example #19
 def __init__(self,
              path,
              open_file_func=partial(open, mode='rb'),
              row_group=None,
              partition_keys=None):
     self.path = _stringify_path(path)
     self.open_file_func = open_file_func
     self.row_group = row_group
     self.partition_keys = partition_keys or []
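The `partial(open, mode='rb')` default simply pre-binds the file mode; a tiny standard-library illustration (the file name is a placeholder):

from functools import partial

open_rb = partial(open, mode='rb')
with open_rb("example.parquet") as f:
    # Parquet files begin (and end) with the 4-byte magic marker b"PAR1".
    print(f.read(4) == b"PAR1")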
Example #20
def _parse_uri(path):
    path = _stringify_path(path)
    parsed_uri = urlparse(path)
    if parsed_uri.scheme in _URI_STRIP_SCHEMES:
        return parsed_uri.path
    else:
        # ARROW-4073: On Windows returning the path with the scheme
        # stripped removes the drive letter, if any
        return path
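The ARROW-4073 caveat in the comment is easy to reproduce with urlparse directly: a bare Windows path parses its drive letter as the URI scheme, so returning only `.path` would drop it.

from urllib.parse import urlparse

print(urlparse("hdfs://host:8020/data/file.parquet").path)   # /data/file.parquet
print(urlparse("C:/data/file.parquet"))
# ParseResult(scheme='c', netloc='', path='/data/file.parquet', ...)
# The drive letter lands in `scheme`, which is why unrecognized schemes
# return the original path unchanged.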
Example #22
 def isdir(self, path):
     path = _sanitize_s3(_stringify_path(path))
     try:
         contents = self.fs.ls(path)
         if len(contents) == 1 and contents[0] == path:
             return False
         else:
             return True
     except OSError:
         return False
Example #24
 def isdir(self, path):
     from gcsfs.core import norm_path
     path = norm_path(_stringify_path(path))
     try:
         contents = self.fs.ls(path)
         if len(contents) == 1 and contents[0] == path:
             return False
         else:
             return True
     except OSError:
         return False
Example #25
def _ensure_fs_and_paths(path, filesystem=None):
    # Return filesystem and list of string paths or FileSelector
    from pyarrow.fs import FileType, FileSelector

    filesystem, path = _ensure_fs(filesystem, _stringify_path(path))
    infos = filesystem.get_file_info([path])[0]
    if infos.type == FileType.Directory:
        # for directory, pass a selector
        paths_or_selector = FileSelector(path, recursive=True)
    elif infos.type == FileType.File:
        # for a single file path, pass it as a list
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
Example #26
def parquet_dataset(metadata_path, schema=None, filesystem=None, format=None):
    """
    Create a FileSystemDataset from a `_metadata` file created via
    `pyarrow.parquet.write_metadata`.

    Parameters
    ----------
    metadata_path : path
        Path pointing to a single-file Parquet metadata file
    schema : Schema, optional
        Optionally provide the Schema for the Dataset, in which case it will
        not be inferred from the source.
    filesystem : FileSystem or URI string, default None
        If a single path is given as source and filesystem is None, then the
        filesystem will be inferred from the path.
        If an URI string is passed, then a filesystem object is constructed
        using the URI's optional path component as a directory prefix. See the
        examples below.
        Note that the URIs on Windows must follow 'file:///C:...' or
        'file:/C:...' patterns.
    format : ParquetFileFormat
        An instance of a ParquetFileFormat if special options need to be
        passed.

    Returns
    -------
    FileSystemDataset
    """
    from pyarrow.fs import LocalFileSystem

    if format is None:
        format = ParquetFileFormat()
    elif not isinstance(format, ParquetFileFormat):
        raise ValueError("format argument must be a ParquetFileFormat")

    if filesystem is None:
        filesystem = LocalFileSystem()
    else:
        filesystem, _ = _ensure_filesystem(filesystem)

    metadata_path = _normalize_path(filesystem, _stringify_path(metadata_path))

    factory = ParquetDatasetFactory(metadata_path, filesystem, format)
    return factory.finish(schema)
Example #27
def _resolve_filesystem_and_path(path,
                                 filesystem=None,
                                 allow_legacy_filesystem=False):
    """
    Return filesystem/path from path which could be an URI or a plain
    filesystem path.
    """
    if not _is_path_like(path):
        if filesystem is not None:
            raise ValueError(
                "'filesystem' passed but the specified path is file-like, so"
                " there is nothing to open with 'filesystem'.")
        return filesystem, path

    path = _stringify_path(path)

    if filesystem is not None:
        filesystem = _ensure_filesystem(
            filesystem, allow_legacy_filesystem=allow_legacy_filesystem)
        return filesystem, path
    else:
        return FileSystem.from_uri(path)
Example #28
def _get_fs_from_path(path):
    """
    return filesystem from path which could be an HDFS URI
    """
    # input can be hdfs URI such as hdfs://host:port/myfile.parquet
    path = _stringify_path(path)
    # if _has_pathlib and isinstance(path, pathlib.Path):
    #     path = str(path)
    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
    else:
        fs = LocalFileSystem.get_instance()

    return fs
Example #29
def resolve_filesystem_and_path(where, filesystem=None):
    """
    Return filesystem from path which could be an HDFS URI, a local URI,
    or a plain filesystem path.
    """
    if not _is_path_like(where):
        if filesystem is not None:
            raise ValueError("filesystem passed but where is file-like, so"
                             " there is nothing to open with filesystem.")
        return filesystem, where

    path = _stringify_path(where)

    if filesystem is not None:
        return _ensure_filesystem(filesystem), path

    parsed_uri = urlparse(path)
    if parsed_uri.scheme == 'hdfs':
        # Input is hdfs URI such as hdfs://host:port/myfile.parquet
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = pa.hdfs.connect(host=host, port=port)
        fs_path = parsed_uri.path
    elif parsed_uri.scheme == 'file':
        # Input is local URI such as file:///home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = parsed_uri.path
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = LocalFileSystem.get_instance()
        fs_path = where

    return fs, fs_path
Example #30
 def open(self, path, mode='rb'):
     """
     Open file for reading or writing
     """
     path = _stringify_path(path)
     return self.fs.open(path, mode=mode)
Example #31
 def ls(self, path, detail=False):
     path = _stringify_path(path)
     return self.fs.ls(path, detail=detail)
Example #32
 def walk(self, path):
     """
     Directory tree generator, like os.walk
     """
     path = _stringify_path(path)
     return self.fs.walk(path)
Example #33
 def isfile(self, path):
     path = _stringify_path(path)
     return os.path.isfile(path)
Example #37
 def mkdir(self, path, create_parents=True):
     path = _stringify_path(path)
     if create_parents:
         return self.fs.mkdirs(path)
     else:
         return self.fs.mkdir(path)
Example #38
 def exists(self, path):
     path = _stringify_path(path)
     return self.fs.exists(path)
Example #39
 def delete(self, path, recursive=False):
     path = _stringify_path(path)
     return self.fs.rm(path, recursive=recursive)
Example #40
 def exists(self, path):
     path = _stringify_path(path)
     return os.path.exists(path)
Example #41
 def __init__(self, path, open_file_func=partial(open, mode='rb'),
              row_group=None, partition_keys=None):
     self.path = _stringify_path(path)
     self.open_file_func = open_file_func
     self.row_group = row_group
     self.partition_keys = partition_keys or []
Example #42
def _ensure_single_source(path, filesystem=None):
    """
    Treat path as either a recursively traversable directory or a single file.

    Parameters
    ----------
    path : path-like
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str or fs.Selector)
        File system object and either a single item list pointing to a file or
        an fs.Selector object pointing to a directory.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    FileNotFoundError
        If the referenced file or directory doesn't exist.
    """
    from pyarrow.fs import FileSystem, LocalFileSystem, FileType, FileSelector

    path = _stringify_path(path)

    # if filesystem is not given try to automatically determine one
    # first check if the file exists as a local (relative) file path
    # if not then try to parse the path as an URI
    file_info = None
    if filesystem is None:
        filesystem = LocalFileSystem()
        try:
            file_info = filesystem.get_file_info([path])[0]
        except OSError:
            file_info = None
            exists_locally = False
        else:
            exists_locally = (file_info.type != FileType.NotFound)

        # if the file or directory doesn't exist locally, then assume that
        # the path is an URI describing the file system as well
        if not exists_locally:
            try:
                filesystem, path = FileSystem.from_uri(path)
            except ValueError as e:
                # ARROW-8213: neither an URI nor a locally existing path,
                # so assume that local path was given and propagate a nicer
                # file not found error instead of a more confusing scheme
                # parsing error
                if "empty scheme" not in str(e):
                    raise
            else:
                # unset file_info to query it again from the new filesystem
                file_info = None

    # construct a filesystem if it is a valid URI
    filesystem, _ = _ensure_fs(filesystem)

    # ensure that the path is normalized before passing to dataset discovery
    path = filesystem.normalize_path(path)

    # retrieve the file descriptor
    if file_info is None:
        file_info = filesystem.get_file_info([path])[0]

    # depending on the path type either return with a recursive
    # directory selector or as a list containing a single file
    if file_info.type == FileType.Directory:
        paths_or_selector = FileSelector(path, recursive=True)
    elif file_info.type == FileType.File:
        paths_or_selector = [path]
    else:
        raise FileNotFoundError(path)

    return filesystem, paths_or_selector
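Whatever this resolution returns is eventually handed to dataset discovery; a short sketch of how the recursive-selector case is consumed through the public pyarrow.fs API (the directory path is a placeholder):

from pyarrow import fs

local = fs.LocalFileSystem()
selector = fs.FileSelector("/tmp/dataset_dir", recursive=True)
for info in local.get_file_info(selector):
    # Each FileInfo carries the path, its type (File/Directory/...) and size.
    print(info.path, info.type, info.size)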
Example #43
 def mkdir(self, path, create_parents=True):
     path = _stringify_path(path)
     if create_parents:
         os.makedirs(path)
     else:
         os.mkdir(path)
Example #45
def _ensure_multiple_sources(paths, filesystem=None):
    """
    Treat a list of paths as files belonging to a single file system

    If the file system is local, this also validates that all paths
    reference existing *files*; otherwise (for example on a remote
    filesystem) non-file paths will be silently skipped.

    Parameters
    ----------
    paths : list of path-like
        Note that URIs are not allowed.
    filesystem : FileSystem or str, optional
        If an URI is passed, then its path component will act as a prefix for
        the file paths.

    Returns
    -------
    (FileSystem, list of str)
        File system object and a list of normalized paths.

    Raises
    ------
    TypeError
        If the passed filesystem has wrong type.
    IOError
        If the file system is local and a referenced path is not available or
        not a file.
    """
    from pyarrow.fs import (LocalFileSystem, SubTreeFileSystem,
                            _MockFileSystem, FileType, _ensure_filesystem)

    if filesystem is None:
        # fall back to local file system as the default
        filesystem = LocalFileSystem()
    else:
        # construct a filesystem if it is a valid URI
        filesystem = _ensure_filesystem(filesystem)

    is_local = (isinstance(filesystem, (LocalFileSystem, _MockFileSystem))
                or (isinstance(filesystem, SubTreeFileSystem)
                    and isinstance(filesystem.base_fs, LocalFileSystem)))

    # allow normalizing irregular paths such as Windows local paths
    paths = [filesystem.normalize_path(_stringify_path(p)) for p in paths]

    # validate that all of the paths are pointing to existing *files*
    # possible improvement is to group the file_infos by type and raise for
    # multiple paths per error category
    if is_local:
        for info in filesystem.get_file_info(paths):
            file_type = info.type
            if file_type == FileType.File:
                continue
            elif file_type == FileType.NotFound:
                raise FileNotFoundError(info.path)
            elif file_type == FileType.Directory:
                raise IsADirectoryError(
                    'Path {} points to a directory, but only file paths are '
                    'supported. To construct a nested or union dataset pass '
                    'a list of dataset objects instead.'.format(info.path))
            else:
                raise IOError(
                    'Path {} exists but its type is unknown (could be a '
                    'special file such as a Unix socket or character device, '
                    'or Windows NUL / CON / ...)'.format(info.path))

    return filesystem, paths
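The inline comment above hints at grouping the file infos by type before raising; a hedged sketch of that improvement (group_paths_by_type is an illustrative name):

from collections import defaultdict
from pyarrow import fs

def group_paths_by_type(filesystem, paths):
    # Bucket the resolved FileInfo objects so a single error could report
    # every offending path of a given kind at once.
    grouped = defaultdict(list)
    for info in filesystem.get_file_info(paths):
        grouped[info.type].append(info.path)
    return grouped

# e.g. group_paths_by_type(fs.LocalFileSystem(), ["/tmp/a.parquet", "/tmp"])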
Example #46
 def isdir(self, path):
     path = _stringify_path(path)
     return os.path.isdir(path)
Example #49
 def ls(self, path):
     path = _stringify_path(path)
     return sorted(pjoin(path, x) for x in os.listdir(path))