Example #1
def read_pandas(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format, reconstructing the index values if
    available.

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of the Parquet dataset. If a string is passed, it can be a
        single file name. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is thread-safe.
    metadata : FileMetaData
        Existing file metadata, if it was computed separately.

    Returns
    -------
    pyarrow.Table
        Content of the file as a Table of Columns, including DataFrame indexes
        as Columns.
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            raise NotImplementedError(
                'Reading a directory of Parquet files with DataFrame index '
                'metadata is not yet supported'
            )

    pf = ParquetFile(source, metadata=metadata)
    return pf.read_pandas(columns=columns, nthreads=nthreads)
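A minimal usage sketch for the function above; the file path and column names
are placeholders, and the round trip back to pandas uses Table.to_pandas():

import pyarrow.parquet as pq

table = pq.read_pandas('example.parquet', columns=['col1', 'col2'])
df = table.to_pandas()  # index columns stored by pandas are restored here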
Example #2
def _generate_partition_directories(base_dir, partition_spec, df):
    # partition_spec : list of lists, e.g. [['foo', [0, 1, 2]],
    #                                       ['bar', ['a', 'b', 'c']]]
    # df : pandas.DataFrame; the rows matching each partition's keys are
    #      written to that partition as a pyarrow.Table
    DEPTH = len(partition_spec)
    fs = LocalFilesystem.get_instance()

    def _visit_level(base_dir, level, part_keys):
        name, values = partition_spec[level]
        for value in values:
            this_part_keys = part_keys + [(name, value)]

            level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
            fs.mkdir(level_dir)

            if level == DEPTH - 1:
                # Generate example data
                file_path = pjoin(level_dir, 'data.parq')

                filtered_df = _filter_partition(df, this_part_keys)
                part_table = pa.Table.from_pandas(filtered_df)
                _write_table(part_table, file_path)
            else:
                _visit_level(level_dir, level + 1, this_part_keys)

    _visit_level(base_dir, 0, [])
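A sketch of how this test helper might be invoked, assuming _filter_partition
and _write_table from the same test module are in scope; the DataFrame,
partition_spec, and base directory below are illustrative:

import pandas as pd

df = pd.DataFrame({'foo': [0, 0, 1, 1, 2, 2],
                   'bar': ['a', 'b', 'a', 'b', 'a', 'b'],
                   'value': range(6)})
partition_spec = [['foo', [0, 1, 2]], ['bar', ['a', 'b']]]
# Produces foo=<v>/bar=<v>/data.parq directories under the base directory
_generate_partition_directories('/tmp/partitioned_dataset', partition_spec, df)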
Example #3
def read_table(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of the Parquet dataset. If a string is passed, it can be a
        single file name or directory name. For passing Python file objects
        or byte buffers, see pyarrow.io.PythonFileInterface or
        pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is thread-safe.
    metadata : FileMetaData
        Existing file metadata, if it was computed separately.

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns,
                                   metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads)
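Typical single-file usage, with the nthreads parameter from the signature
above; the path and column name are placeholders:

import pyarrow.parquet as pq

table = pq.read_table('example.parquet', columns=['col1'], nthreads=4)
df = table.to_pandas()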
Example #4
def read_table(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of the Parquet dataset. If a string is passed, it can be a
        single file name or directory name. For passing Python file objects
        or byte buffers, see pyarrow.io.PythonFileInterface or
        pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is thread-safe.
    metadata : FileMetaData
        Existing file metadata, if it was computed separately.

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    from pyarrow.filesystem import LocalFilesystem

    if isinstance(source, six.string_types):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns, metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads)
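When source is a directory, this variant dispatches to
LocalFilesystem.read_parquet, which assembles every Parquet piece under the
directory into a single Table; the path and column name below are placeholders:

table = read_table('/data/events', columns=['user_id'])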
Example #5
def _generate_partition_directories(base_dir, partition_spec, df):
    # partition_spec : list of lists, e.g. [['foo', [0, 1, 2]],
    #                                       ['bar', ['a', 'b', 'c']]]
    # df : pandas.DataFrame; the rows matching each partition's keys are
    #      written to that partition as a pyarrow.Table
    DEPTH = len(partition_spec)
    fs = LocalFilesystem.get_instance()

    def _visit_level(base_dir, level, part_keys):
        name, values = partition_spec[level]
        for value in values:
            this_part_keys = part_keys + [(name, value)]

            level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
            fs.mkdir(level_dir)

            if level == DEPTH - 1:
                # Generate example data
                file_path = pjoin(level_dir, 'data.parq')

                filtered_df = _filter_partition(df, this_part_keys)
                part_table = pa.Table.from_pandas(filtered_df)
                _write_table(part_table, file_path)
            else:
                _visit_level(level_dir, level + 1, this_part_keys)

    _visit_level(base_dir, 0, [])
Example #6
def read_pandas(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format, reconstructing the index values if
    available.

    Parameters
    ----------
    source : str or pyarrow.io.NativeFile
        Location of the Parquet dataset. If a string is passed, it can be a
        single file name. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns : list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is thread-safe.
    metadata : FileMetaData
        Existing file metadata, if it was computed separately.

    Returns
    -------
    pyarrow.Table
        Content of the file as a Table of Columns, including DataFrame indexes
        as Columns.
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            raise NotImplementedError(
                'Reading a directory of Parquet files with DataFrame index '
                'metadata is not yet supported')

    pf = ParquetFile(source, metadata=metadata)
    return pf.read_pandas(columns=columns, nthreads=nthreads)
Example #7
    def __init__(self,
                 path_or_paths,
                 filesystem=None,
                 schema=None,
                 metadata=None,
                 split_row_groups=False,
                 validate_schema=True):
        if filesystem is None:
            self.fs = LocalFilesystem.get_instance()
        else:
            self.fs = filesystem

        self.paths = path_or_paths

        (self.pieces, self.partitions,
         self.metadata_path) = _make_manifest(path_or_paths, self.fs)

        self.metadata = metadata
        self.schema = schema

        self.split_row_groups = split_row_groups

        if split_row_groups:
            raise NotImplementedError("split_row_groups not yet implemented")

        if validate_schema:
            self.validate_schemas()
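This constructor appears to be ParquetDataset.__init__; a hedged usage sketch
over a partitioned directory (the path and column name are placeholders):

import pyarrow.parquet as pq

dataset = pq.ParquetDataset('/data/dataset')
table = dataset.read(columns=['col1'])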
Example #8
    def __init__(self, dirpath, filesystem=None, pathsep='/',
                 partition_scheme='hive'):
        self.filesystem = filesystem or LocalFilesystem.get_instance()
        self.pathsep = pathsep
        self.dirpath = dirpath
        self.partition_scheme = partition_scheme
        self.partitions = ParquetPartitions()
        self.pieces = []

        self.common_metadata_path = None
        self.metadata_path = None

        self._visit_level(0, self.dirpath, [])
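For orientation, a sketch of the 'hive' partition scheme this constructor
walks with _visit_level: partition keys are encoded as key=value directory
names (the directory and key names below are illustrative):

# dataset/
#   year=2017/
#     month=1/
#       data.parq
#     month=2/
#       data.parq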
Example #9
    def __init__(self, dirpath, filesystem=None, pathsep='/',
                 partition_scheme='hive'):
        self.filesystem = filesystem or LocalFilesystem.get_instance()
        self.pathsep = pathsep
        self.dirpath = dirpath
        self.partition_scheme = partition_scheme
        self.partitions = ParquetPartitions()
        self.pieces = []

        self.common_metadata_path = None
        self.metadata_path = None

        self._visit_level(0, self.dirpath, [])
Example #10
    def __init__(self, path_or_paths, filesystem=None, schema=None,
                 metadata=None, split_row_groups=False, validate_schema=True):
        if filesystem is None:
            self.fs = LocalFilesystem.get_instance()
        else:
            self.fs = filesystem

        self.paths = path_or_paths

        (self.pieces, self.partitions,
         self.metadata_path) = _make_manifest(path_or_paths, self.fs)

        self.metadata = metadata
        self.schema = schema

        self.split_row_groups = split_row_groups

        if split_row_groups:
            raise NotImplementedError("split_row_groups not yet implemented")

        if validate_schema:
            self.validate_schemas()
Example #11
                           ArrowNotImplementedError, ArrowTypeError)

from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem
from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface, Buffer,
                        BufferReader, InMemoryOutputStream, MemoryMappedFile,
                        memory_map, frombuffer, read_tensor, write_tensor,
                        create_memory_map, get_record_batch_size,
                        get_tensor_size)

from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter

from pyarrow.memory import MemoryPool, total_allocated_bytes

from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType, BooleanValue,
                            Int8Value, Int16Value, Int32Value, Int64Value,
                            UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                            FloatValue, DoubleValue, ListValue, BinaryValue,
                            StringValue, FixedSizeBinaryValue)

import pyarrow.schema as _schema

from pyarrow.schema import (null, bool_, int8, int16, int32, int64, uint8,
                            uint16, uint32, uint64, timestamp, date32, date64,
                            float16, float32, float64, binary, string, decimal,
                            list_, struct, dictionary, field, DataType,
                            FixedSizeBinaryType, Field, Schema, schema)

from pyarrow.table import Column, RecordBatch, Table, concat_tables

localfs = LocalFilesystem.get_instance()
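The module-level localfs singleton defined here can be used wherever an
explicit filesystem is expected; a small sketch using only methods seen in
the examples above (the dataset path is a placeholder):

import pyarrow as pa

if pa.localfs.isdir('/data/dataset'):
    table = pa.localfs.read_parquet('/data/dataset')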
Example #12
    pyarrow.set_memory_pool
    """
    from pyarrow._jemalloc import default_pool
    return default_pool()


from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem

from pyarrow.ipc import (RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter,
                         open_stream,
                         open_file,
                         serialize_pandas, deserialize_pandas)


localfs = LocalFilesystem.get_instance()


# ----------------------------------------------------------------------
# 0.4.0 deprecations

import warnings

def _deprecate_class(old_name, new_name, klass, next_version='0.5.0'):
    msg = ('pyarrow.{0} has been renamed to '
           '{1} and will be removed in {2}'
           .format(old_name, new_name, next_version))
    def deprecated_factory(*args, **kwargs):
        warnings.warn(msg, FutureWarning)
        return klass(*args, **kwargs)
    return deprecated_factory
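A sketch of how this shim might be applied to the 0.4.0 IPC renames imported
above (the actual assignments live in pyarrow's __init__; shown here only for
illustration):

FileReader = _deprecate_class('FileReader', 'RecordBatchFileReader',
                              RecordBatchFileReader)
StreamReader = _deprecate_class('StreamReader', 'RecordBatchStreamReader',
                                RecordBatchStreamReader)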