Beispiel #1
0
    def read(self, columns=None, nthreads=1):
        """
        Read multiple Parquet files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        nthreads : int, default 1
            Number of columns to read in parallel. Requires that the underlying
            file source is threadsafe

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        open_file = self._get_open_file_func()

        tables = []
        for piece in self.pieces:
            table = piece.read(columns=columns, nthreads=nthreads,
                               partitions=self.partitions,
                               open_file_func=open_file)
            tables.append(table)

        all_data = lib.concat_tables(tables)
        return all_data
Beispiel #2
0
    def read(self, columns=None, nthreads=1):
        """
        Read multiple Parquet files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        nthreads : int, default 1
            Number of columns to read in parallel. Requires that the underlying
            file source is threadsafe

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        open_file = self._get_open_file_func()

        tables = []
        for piece in self.pieces:
            table = piece.read(columns=columns,
                               nthreads=nthreads,
                               partitions=self.partitions,
                               open_file_func=open_file)
            tables.append(table)

        all_data = lib.concat_tables(tables)
        return all_data
Beispiel #3
0
    def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
        """
        Read multiple Parquet files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        nthreads : int, default 1
            Number of columns to read in parallel. Requires that the underlying
            file source is threadsafe
        use_pandas_metadata : bool, default False
            Passed through to each dataset piece

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        open_file = self._get_open_file_func()

        tables = []
        for piece in self.pieces:
            table = piece.read(columns=columns,
                               nthreads=nthreads,
                               partitions=self.partitions,
                               open_file_func=open_file,
                               use_pandas_metadata=use_pandas_metadata)
            tables.append(table)

        all_data = lib.concat_tables(tables)

        if use_pandas_metadata:
            # We need to ensure that this metadata is set in the Table's schema
            # so that Table.to_pandas will construct pandas.DataFrame with the
            # right index
            common_metadata = self._get_common_pandas_metadata()
            current_metadata = all_data.schema.metadata or {}

            if common_metadata and b'pandas' not in current_metadata:
                all_data = all_data.replace_schema_metadata(
                    {b'pandas': common_metadata})

        return all_data
Beispiel #4
0
    def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
        """
        Read multiple Parquet files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        nthreads : int, default 1
            Number of columns to read in parallel. Requires that the underlying
            file source is threadsafe
        use_pandas_metadata : bool, default False
            Passed through to each dataset piece

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        open_file = self._get_open_file_func()

        tables = []
        for piece in self.pieces:
            table = piece.read(columns=columns, nthreads=nthreads,
                               partitions=self.partitions,
                               open_file_func=open_file,
                               use_pandas_metadata=use_pandas_metadata)
            tables.append(table)

        all_data = lib.concat_tables(tables)

        if use_pandas_metadata:
            # We need to ensure that this metadata is set in the Table's schema
            # so that Table.to_pandas will construct pandas.DataFrame with the
            # right index
            common_metadata = self._get_common_pandas_metadata()
            current_metadata = all_data.schema.metadata or {}

            if common_metadata and b'pandas' not in current_metadata:
                all_data = all_data.replace_schema_metadata({
                    b'pandas': common_metadata})

        return all_data
Beispiel #5
0
    def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
        """
        Read multiple Parquet files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        use_threads : boolean, default True
            Perform multi-threaded column reads
        use_pandas_metadata : bool, default False
            Passed through to each dataset piece

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        tables = []
        for piece in self.pieces:
            table = piece.read(columns=columns,
                               use_threads=use_threads,
                               partitions=self.partitions,
                               use_pandas_metadata=use_pandas_metadata)
            tables.append(table)

        all_data = lib.concat_tables(tables)

        if use_pandas_metadata:
            # We need to ensure that this metadata is set in the Table's schema
            # so that Table.to_pandas will construct pandas.DataFrame with the
            # right index
            common_metadata = self._get_common_pandas_metadata()
            current_metadata = all_data.schema.metadata or {}

            if common_metadata and b'pandas' not in current_metadata:
                all_data = all_data.replace_schema_metadata(
                    {b'pandas': common_metadata})

        return all_data
Beispiel #6
0
    def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
        """
        Read multiple Parquet files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file
        use_threads : boolean, default True
            Perform multi-threaded column reads
        use_pandas_metadata : bool, default False
            Passed through to each dataset piece

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        tables = []
        for piece in self.pieces:
            table = piece.read(columns=columns, use_threads=use_threads,
                               partitions=self.partitions,
                               use_pandas_metadata=use_pandas_metadata)
            tables.append(table)

        all_data = lib.concat_tables(tables)

        if use_pandas_metadata:
            # We need to ensure that this metadata is set in the Table's schema
            # so that Table.to_pandas will construct pandas.DataFrame with the
            # right index
            common_metadata = self._get_common_pandas_metadata()
            current_metadata = all_data.schema.metadata or {}

            if common_metadata and b'pandas' not in current_metadata:
                all_data = all_data.replace_schema_metadata({
                    b'pandas': common_metadata})

        return all_data
    def read_table(self, columns=None):
        """
        Read multiple feather files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        _fil = FeatherReader(self.paths[0]).read_table(columns=columns)
        self._tables = [_fil]
        self.schema = _fil.schema

        for fil in self.paths[1:]:
            fil_table = FeatherReader(fil).read_table(columns=columns)
            if self.validate_schema:
                self.validate_schemas(fil, fil_table)
            self._tables.append(fil_table)
        return concat_tables(self._tables)
Beispiel #8
0
    def read_table(self, columns=None):
        """
        Read multiple feather files as a single pyarrow.Table

        Parameters
        ----------
        columns : List[str]
            Names of columns to read from the file

        Returns
        -------
        pyarrow.Table
            Content of the file as a table (of columns)
        """
        _fil = FeatherReader(self.paths[0]).read_table(columns=columns)
        self._tables = [_fil]
        self.schema = _fil.schema

        for fil in self.paths[1:]:
            fil_table = FeatherReader(fil).read_table(columns=columns)
            if self.validate_schema:
                self.validate_schemas(fil, fil_table)
            self._tables.append(fil_table)
        return concat_tables(self._tables)