def read(self, columns=None, nthreads=1):
    """
    Read multiple Parquet files as a single pyarrow.Table

    Parameters
    ----------
    columns : List[str]
        Names of columns to read from the file
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the
        underlying file source is threadsafe

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    open_file = self._get_open_file_func()

    tables = []
    for piece in self.pieces:
        table = piece.read(columns=columns, nthreads=nthreads,
                           partitions=self.partitions,
                           open_file_func=open_file)
        tables.append(table)

    all_data = lib.concat_tables(tables)
    return all_data
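# A minimal usage sketch (not part of the source above): it assumes this
# method is pyarrow.parquet.ParquetDataset.read and that 'dataset_dir/'
# already contains one or more Parquet files; the directory and column
# names are hypothetical.
#
# import pyarrow.parquet as pq
#
# dataset = pq.ParquetDataset('dataset_dir/')
# table = dataset.read(columns=['user_id', 'ts'], nthreads=4)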
def read(self, columns=None, nthreads=1, use_pandas_metadata=False):
    """
    Read multiple Parquet files as a single pyarrow.Table

    Parameters
    ----------
    columns : List[str]
        Names of columns to read from the file
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the
        underlying file source is threadsafe
    use_pandas_metadata : bool, default False
        Passed through to each dataset piece

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    open_file = self._get_open_file_func()

    tables = []
    for piece in self.pieces:
        table = piece.read(columns=columns, nthreads=nthreads,
                           partitions=self.partitions,
                           open_file_func=open_file,
                           use_pandas_metadata=use_pandas_metadata)
        tables.append(table)

    all_data = lib.concat_tables(tables)

    if use_pandas_metadata:
        # We need to ensure that this metadata is set in the Table's
        # schema so that Table.to_pandas will construct
        # pandas.DataFrame with the right index
        common_metadata = self._get_common_pandas_metadata()
        current_metadata = all_data.schema.metadata or {}

        if common_metadata and b'pandas' not in current_metadata:
            all_data = all_data.replace_schema_metadata(
                {b'pandas': common_metadata})

    return all_data
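# A sketch of the pandas-metadata round trip this version enables: a
# DataFrame written with its index produces b'pandas' schema metadata,
# and reading with use_pandas_metadata=True carries it onto the
# concatenated Table so to_pandas() can rebuild the index. File and
# column names here are hypothetical.
#
# import pandas as pd
# import pyarrow as pa
# import pyarrow.parquet as pq
#
# df = pd.DataFrame({'value': [1.0, 2.0]},
#                   index=pd.Index(['a', 'b'], name='key'))
# pq.write_table(pa.Table.from_pandas(df), 'example.parquet')
#
# table = pq.ParquetDataset('example.parquet').read(use_pandas_metadata=True)
# restored = table.to_pandas()  # the 'key' index is reconstructed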
def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
    """
    Read multiple Parquet files as a single pyarrow.Table

    Parameters
    ----------
    columns : List[str]
        Names of columns to read from the file
    use_threads : bool, default True
        Perform multi-threaded column reads
    use_pandas_metadata : bool, default False
        Passed through to each dataset piece

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    tables = []
    for piece in self.pieces:
        table = piece.read(columns=columns, use_threads=use_threads,
                           partitions=self.partitions,
                           use_pandas_metadata=use_pandas_metadata)
        tables.append(table)

    all_data = lib.concat_tables(tables)

    if use_pandas_metadata:
        # We need to ensure that this metadata is set in the Table's
        # schema so that Table.to_pandas will construct
        # pandas.DataFrame with the right index
        common_metadata = self._get_common_pandas_metadata()
        current_metadata = all_data.schema.metadata or {}

        if common_metadata and b'pandas' not in current_metadata:
            all_data = all_data.replace_schema_metadata(
                {b'pandas': common_metadata})

    return all_data
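# In this signature, per-column parallelism is toggled with the boolean
# use_threads rather than an explicit nthreads count, and the open-file
# callback is no longer threaded through. The earlier hypothetical call
# would now be written as:
#
# table = dataset.read(columns=['user_id', 'ts'], use_threads=True)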
def read_table(self, columns=None):
    """
    Read multiple feather files as a single pyarrow.Table

    Parameters
    ----------
    columns : List[str]
        Names of columns to read from the file

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    _fil = FeatherReader(self.paths[0]).read_table(columns=columns)
    self._tables = [_fil]
    self.schema = _fil.schema

    for fil in self.paths[1:]:
        fil_table = FeatherReader(fil).read_table(columns=columns)
        if self.validate_schema:
            self.validate_schemas(fil, fil_table)
        self._tables.append(fil_table)
    return concat_tables(self._tables)
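# A minimal usage sketch, assuming this method belongs to
# pyarrow.feather.FeatherDataset and that the listed files exist; the
# file and column names are hypothetical. Schema validation compares
# each subsequent file against the first file's schema before
# concatenating.
#
# import pyarrow.feather as feather
#
# dataset = feather.FeatherDataset(['part-0.feather', 'part-1.feather'],
#                                  validate_schema=True)
# table = dataset.read_table(columns=['user_id', 'ts'])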