Example 1
    def file_exists(cls, file_path: str) -> bool:
        """
        Check if `file_path` exists.

        Parameters
        ----------
        file_path : str
            String representing a path.

        Returns
        -------
        bool
            True if the path exists, False otherwise.
        """
        if isinstance(file_path, str):
            match = S3_ADDRESS_REGEX.search(file_path)
            if match is not None:
                if file_path[0] == "S":
                    file_path = "{}{}".format("s", file_path[1:])
                S3FS = import_optional_dependency(
                    "s3fs", "Module s3fs is required to read S3FS files."
                )
                from botocore.exceptions import NoCredentialsError

                s3fs = S3FS.S3FileSystem(anon=False)
                exists = False
                try:
                    exists = len(s3fs.glob(file_path)) > 0
                except NoCredentialsError:
                    pass
                s3fs = S3FS.S3FileSystem(anon=True)
                return exists or len(s3fs.glob(file_path)) > 0
        return len(glob.glob(file_path)) > 0
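The credentialed-then-anonymous probe is the core of this method: try S3 with the caller's AWS credentials first, then fall back to anonymous access for public buckets. A minimal standalone sketch of the same pattern, assuming `s3fs` and `botocore` are installed; the helper name and bucket path are illustrative, not part of the original:

    import s3fs
    from botocore.exceptions import NoCredentialsError

    def s3_glob_exists(path: str) -> bool:
        # First probe with the caller's AWS credentials ...
        try:
            if len(s3fs.S3FileSystem(anon=False).glob(path)) > 0:
                return True
        except NoCredentialsError:
            pass
        # ... then retry anonymously, which covers public buckets.
        return len(s3fs.S3FileSystem(anon=True).glob(path)) > 0

    print(s3_glob_exists("s3://bucket/prefix/*.csv"))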
Example 2
    def file_exists(cls, file_path):
        """
        Check if `file_path` exists.

        Parameters
        ----------
        file_path : str
            String that represents the path to the file (paths to S3 buckets
            are also acceptable).

        Returns
        -------
        bool
            Whether the file exists or not.
        """
        if isinstance(file_path, str):
            match = S3_ADDRESS_REGEX.search(file_path)
            if match is not None:
                if file_path[0] == "S":
                    file_path = "{}{}".format("s", file_path[1:])
                S3FS = import_optional_dependency(
                    "s3fs", "Module s3fs is required to read S3FS files.")
                from botocore.exceptions import NoCredentialsError

                s3fs = S3FS.S3FileSystem(anon=False)
                exists = False
                try:
                    exists = s3fs.exists(file_path)
                except NoCredentialsError:
                    pass
                s3fs = S3FS.S3FileSystem(anon=True)
                return exists or s3fs.exists(file_path)
        return os.path.exists(file_path)
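Compared with Example 1, this variant probes a single key with `s3fs.exists()` instead of globbing a pattern, and local paths go through `os.path.exists` rather than `glob.glob`. A hedged usage sketch; the host class name `FileDispatcher` is an assumption, since the excerpt does not show the enclosing class:

    # `FileDispatcher` is a hypothetical name for the class hosting file_exists.
    print(FileDispatcher.file_exists("/tmp/data.csv"))         # local check via os.path.exists
    print(FileDispatcher.file_exists("S3://bucket/data.csv"))  # "S3://" is normalized to "s3://"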
Example 3
    def _read(cls, path, columns=None, **kwargs):
        """
        Read data from the file path, returning a query compiler.

        Parameters
        ----------
        path : str or file-like object
            The filepath of the feather file.
        columns : array-like, optional
            Columns to read from file. If not provided, all columns are read.
        **kwargs : dict
            `read_feather` function kwargs.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.

        Notes
        -----
        Only the `PyArrow` engine and local files are supported for now;
        multi-threading is disabled by default. PyArrow feather is used;
        please refer to the documentation here:
        https://arrow.apache.org/docs/python/api.html#feather-format
        """
        path = cls.get_path(path)
        if columns is None:
            import_optional_dependency(
                "pyarrow", "pyarrow is required to read feather files.")
            from pyarrow.feather import read_feather

            with OpenFile(
                    path,
                    **(kwargs.get("storage_options", None) or {}),
            ) as file:
                df = read_feather(file)
            # pyarrow.feather.read_feather doesn't support columns as pandas.Index
            columns = list(df.columns)
        return cls.build_query_compiler(path, columns, use_threads=False)
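When `columns` is None, the method materializes the whole feather file through pyarrow just to learn the column names, converting them to a plain list because, as the inline comment notes, `read_feather` does not accept a `pandas.Index`. A minimal sketch of that discovery step, assuming `pandas` and `pyarrow` are installed; the file name is illustrative:

    import pandas as pd
    from pyarrow.feather import read_feather

    # Write a tiny feather file, then recover its column names the same
    # way _read does when `columns` is None.
    pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]}).to_feather("example.feather")
    columns = list(read_feather("example.feather").columns)
    print(columns)  # ['a', 'b'] -- a plain list, not a pandas.Index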
Example 4
    def get_path(cls, file_path: str) -> list:
        """
        Return the path of the file(s).

        Parameters
        ----------
        file_path : str
            String representing a path.

        Returns
        -------
        list
            List of strings of absolute file paths.
        """
        if S3_ADDRESS_REGEX.search(file_path):
            # S3FS does not allow capital S in s3 addresses.
            if file_path[0] == "S":
                file_path = "{}{}".format("s", file_path[1:])

            S3FS = import_optional_dependency(
                "s3fs", "Module s3fs is required to read S3FS files.")
            from botocore.exceptions import NoCredentialsError

            def get_file_path(fs_handle) -> List[str]:
                file_paths = fs_handle.glob(file_path)
                s3_addresses = [
                    "{}{}".format("s3://", path) for path in file_paths
                ]
                return s3_addresses

            s3fs = S3FS.S3FileSystem(anon=False)
            try:
                return get_file_path(s3fs)
            except NoCredentialsError:
                pass
            s3fs = S3FS.S3FileSystem(anon=True)
            return get_file_path(s3fs)
        else:
            relative_paths = glob.glob(file_path)
            abs_paths = [os.path.abspath(path) for path in relative_paths]
            return abs_paths
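Note the re-prefixing in the S3 branch: `s3fs.glob()` returns bucket-relative paths with the `s3://` scheme stripped, so it is added back before returning. The local branch is plain glob expansion plus absolutization, sketched in isolation below (the pattern is illustrative):

    import glob
    import os

    # Local branch of get_path: expand a glob pattern and return
    # an absolute path for every match.
    matches = glob.glob("data/*.csv")
    abs_paths = [os.path.abspath(p) for p in matches]
    print(abs_paths)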
Example 5
    def _read(cls, path, engine, columns, **kwargs):
        """
        Load a parquet object from the file path, returning a query compiler.

        Parameters
        ----------
        path : str, path object or file-like object
            The filepath of the parquet file in the local filesystem or on HDFS.
        engine : str
            Parquet library to use (only 'PyArrow' is supported for now).
        columns : list
            If not None, only these columns will be read from the file.
        **kwargs : dict
            Keyword arguments.

        Returns
        -------
        BaseQueryCompiler
            A new Query Compiler.

        Notes
        -----
        The ParquetFile API is used. Please refer to the documentation here:
        https://arrow.apache.org/docs/python/parquet.html
        """
        import_optional_dependency(
            "pyarrow",
            "pyarrow is required to read parquet files.",
        )
        from pyarrow.parquet import ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if isinstance(path, str) and os.path.isdir(path):
            partitioned_columns = set()
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a DFS search, to walk
            # through the different columns that the data is partitioned on.
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(
                    path, engine=engine, columns=columns, **kwargs
                )

        if not columns:
            import fsspec.core
            from pandas.io.common import is_fsspec_url

            fs, path_ = (
                fsspec.core.url_to_fs(path, **(kwargs.get("storage_options") or {}))
                if is_fsspec_url(path)
                else (None, path)
            )

            dataset = ParquetDataset(path_, filesystem=fs, use_legacy_dataset=False)
            column_names = dataset.schema.names

            if dataset.schema.pandas_metadata is not None:
                index_columns = dataset.schema.pandas_metadata.get("index_columns", [])
                column_names = [c for c in column_names if c not in index_columns]
            columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
        return cls.build_query_compiler(path, columns, **kwargs)
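When `columns` is not supplied, the method opens the dataset once to read its schema, drops any index columns that pandas recorded in the schema metadata, and filters the rest through `PQ_INDEX_REGEX`. A minimal sketch of the schema-based discovery, assuming a pyarrow version that still accepts `use_legacy_dataset` (recent releases have removed it); the file name is illustrative:

    from pyarrow.parquet import ParquetDataset

    dataset = ParquetDataset("data.parquet", use_legacy_dataset=False)
    names = dataset.schema.names
    # pandas records its index columns in the schema metadata; drop them
    # so only real data columns remain.
    meta = dataset.schema.pandas_metadata
    if meta is not None:
        names = [c for c in names if c not in meta.get("index_columns", [])]
    print(names)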