Example 1
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Ray only support pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            original_path = path
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Partitioned Columns in Parquet")
                return cls.single_worker_read(original_path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
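
A minimal, self-contained sketch of the directory layout the os.walk() loop above is built to detect (the "out_dir" path and the toy frame are illustrative, not part of the original code): writing a dataset partitioned on "year" creates one "year=<value>" directory per partition level, so dir_names[0].split("=")[0] recovers the partition column name.

import os

import pandas
import pyarrow as pa
import pyarrow.parquet as pq

# Write a toy Hive-partitioned dataset: out_dir/year=2020/..., out_dir/year=2021/...
df = pandas.DataFrame({"year": [2020, 2020, 2021], "value": [1.0, 2.0, 3.0]})
pq.write_to_dataset(
    pa.Table.from_pandas(df), root_path="out_dir", partition_cols=["year"]
)

# Same detection logic as the walk above: directory names encode the
# partition column as "<column>=<value>".
partitioned_columns = set()
for root, dir_names, files in os.walk("out_dir"):
    if dir_names:
        partitioned_columns.add(dir_names[0].split("=")[0])
    if files and files[0][0] != ".":
        break
print(partitioned_columns)  # {'year'}
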
Example 2
    def _read(cls, path, engine, columns, **kwargs):
        """
        Load a parquet object from the file path, returning a query compiler.

        Parameters
        ----------
        path : str, path object or file-like object
            The filepath of the parquet file in local filesystem or hdfs.
        engine : str
            Parquet library to use (only 'PyArrow' is supported for now).
        columns : list
            If not None, only these columns will be read from the file.
        **kwargs : dict
            Keyword arguments.

        Returns
        -------
        BaseQueryCompiler
            A new Query Compiler.

        Notes
        -----
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if isinstance(path, str) and os.path.isdir(path):
            partitioned_columns = set()
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas("Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(
                    path, engine=engine, columns=columns, **kwargs
                )

        if not columns:
            import fsspec.core
            from pandas.io.common import is_fsspec_url

            fs, path_ = (
                fsspec.core.url_to_fs(path, **(kwargs.get("storage_options") or {}))
                if is_fsspec_url(path)
                else (None, path)
            )

            dataset = ParquetDataset(path_, filesystem=fs, use_legacy_dataset=False)
            column_names = dataset.schema.names

            if dataset.schema.pandas_metadata is not None:
                index_columns = dataset.schema.pandas_metadata.get("index_columns", [])
                column_names = [c for c in column_names if c not in index_columns]
            columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
        return cls.build_query_compiler(path, columns, **kwargs)
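
For reference, a self-contained sketch of the column-discovery branch above against a hypothetical local file ("example.parquet" and the toy frame are assumptions); it assumes a recent pyarrow where ParquetDataset exposes an Arrow schema with pandas_metadata, so the use_legacy_dataset flag is omitted.

import pandas
from pyarrow.parquet import ParquetDataset

# Write a small file whose index is stored as a regular column plus
# "index_columns" metadata, then rediscover the data columns the same way
# _read() does when `columns` is None.
pandas.DataFrame(
    {"a": [1, 2], "b": [3.0, 4.0]},
    index=pandas.Index(["x", "y"], name="key"),
).to_parquet("example.parquet", engine="pyarrow")

dataset = ParquetDataset("example.parquet")
column_names = dataset.schema.names
pandas_metadata = dataset.schema.pandas_metadata
if pandas_metadata is not None:
    index_columns = pandas_metadata.get("index_columns", [])
    column_names = [c for c in column_names if c not in index_columns]
print(column_names)  # ['a', 'b']
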
Example 3
    def _read(cls, path, engine, columns, **kwargs):
        """
        Load a parquet object from the file path, returning a query compiler.

        Parameters
        ----------
        path : str, path object or file-like object
            The filepath of the parquet file in local filesystem or hdfs.
        engine : str
            Parquet library to use (only 'PyArrow' is supported for now).
        columns : list
            If not None, only these columns will be read from the file.
        **kwargs : dict
            Keyword arguments.

        Returns
        -------
        BaseQueryCompiler
            A new Query Compiler.

        Notes
        -----
        ParquetFile API is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if isinstance(path, str) and os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False
        if not columns:
            import s3fs

            if directory:
                # Read the dataset metadata to get the remaining columns
                pd = ParquetDataset(path)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            elif isinstance(path, str) and path.startswith("hdfs://"):
                import fsspec.core

                fs, path = fsspec.core.url_to_fs(path)
                pd = ParquetDataset(path, filesystem=fs)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            elif isinstance(path,
                            s3fs.S3File) or (isinstance(path, str)
                                             and path.startswith("s3://")):
                from botocore.exceptions import NoCredentialsError

                if isinstance(path, s3fs.S3File):
                    bucket_path = path.url().split(".s3.amazonaws.com")
                    path = "s3://" + bucket_path[0].split(
                        "://")[1] + bucket_path[1]
                try:
                    fs = s3fs.S3FileSystem()
                    pd = ParquetDataset(path, filesystem=fs)
                except NoCredentialsError:
                    fs = s3fs.S3FileSystem(anon=True)
                    pd = ParquetDataset(path, filesystem=fs)
                meta = pd.metadata
                column_names = pd.schema.to_arrow_schema().names
            else:
                meta = ParquetFile(path).metadata
                column_names = meta.schema.to_arrow_schema().names

            if meta is not None and meta.metadata is not None:
                pandas_metadata = meta.metadata.get(b"pandas", None)
                if pandas_metadata is not None:
                    import json

                    # This is how we convert the metadata from pyarrow to a python
                    # dictionary, from which we then get the index columns.
                    # We use these to filter out from the columns in the metadata since
                    # the pyarrow storage has no concept of row labels/index.
                    # This ensures that our metadata lines up with the partitions without
                    # extra communication steps once we have done all the remote
                    # computation.
                    index_columns = json.loads(
                        pandas_metadata.decode("utf8")).get(
                            "index_columns", [])
                    column_names = [
                        c for c in column_names if c not in index_columns
                    ]
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
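
The comment block above describes decoding the b"pandas" key of the Parquet key-value metadata; below is a self-contained sketch of just that step against a hypothetical local file ("example.parquet" is an assumed name).

import json

import pandas
from pyarrow.parquet import ParquetFile

# pyarrow stores the pandas metadata as a JSON blob under the b"pandas" key
# of the file's key-value metadata; json.loads turns it into the dictionary
# that "index_columns" is read from.
pandas.DataFrame(
    {"a": [1, 2]}, index=pandas.Index([10, 20], name="idx")
).to_parquet("example.parquet", engine="pyarrow")

meta = ParquetFile("example.parquet").metadata
pandas_metadata = meta.metadata.get(b"pandas", None)
if pandas_metadata is not None:
    index_columns = json.loads(pandas_metadata.decode("utf8")).get("index_columns", [])
    print(index_columns)  # ['idx']
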
Example 4
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a Modin DataFrame.
           Modin only supports pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            engine: Modin only supports pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a dfs search, to walk
            # through the different columns that the data is partitioned on
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Metadata files, git files, .DS_Store
                    if files[0][0] == ".":
                        continue
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Mixed Partitioning Columns in Parquet")
                return cls.single_worker_read(path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False
        if not columns:
            if directory:
                # Read the dataset metadata to get the remaining columns
                pd = ParquetDataset(path)
                meta = pd.metadata
                column_names = pd.schema.names
            else:
                meta = ParquetFile(path).metadata
                column_names = meta.schema.names
            if meta is not None and meta.metadata is not None:
                pandas_metadata = meta.metadata.get(b"pandas", None)
                if pandas_metadata is not None:
                    import json

                    # This is how we convert the metadata from pyarrow to a python
                    # dictionary, from which we then get the index columns.
                    # We use these to filter out from the columns in the metadata since
                    # the pyarrow storage has no concept of row labels/index.
                    # This ensures that our metadata lines up with the partitions without
                    # extra communication steps once we have done all the remote
                    # computation.
                    index_columns = json.loads(
                        pandas_metadata.decode("utf8")).get("index_columns", [])
                    column_names = [
                        c for c in column_names if c not in index_columns
                    ]
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
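
All four examples finish by filtering out pyarrow's auto-generated index columns with PQ_INDEX_REGEX from modin.pandas.io. The pattern below is an assumed stand-in matching the "__index_level_N__" naming convention, used only so the filtering step can be run on its own.

import re

# Assumed stand-in for modin.pandas.io.PQ_INDEX_REGEX: pyarrow names the
# serialized columns of an unnamed pandas index "__index_level_N__".
PQ_INDEX_REGEX = re.compile(r"__index_level_\d+__")

column_names = ["a", "b", "__index_level_0__"]
columns = [name for name in column_names if not PQ_INDEX_REGEX.match(name)]
print(columns)  # ['a', 'b']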