Code example #1
File: parquet.py  Project: holdenk/arrow
def read_table(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source: str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a single
        file name or directory name. For passing Python file objects or byte
        buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns: list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns,
                                   metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads)
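For context, a minimal usage sketch of the function above; the file path and column names are illustrative, and it assumes the module is importable as pyarrow.parquet:

import pyarrow.parquet as pq

# Read only two columns from a local file, using two threads.
# nthreads > 1 requires a threadsafe file source, per the docstring above.
table = pq.read_table('example.parquet', columns=['a', 'b'], nthreads=2)
print(table.num_rows, table.num_columns)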
Code example #2
File: parquet.py  Project: holdenk/arrow
def read_pandas(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format, reconstructing the index values if
    available.

    Parameters
    ----------
    source: str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a single
        file name. For passing Python file objects or byte buffers,
        see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns: list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a Table of Columns, including DataFrame indexes
        as Columns.
    """
    if is_string(source):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            raise NotImplementedError(
                'Reading a directory of Parquet files with DataFrame index '
                'metadata is not yet supported'
            )

    pf = ParquetFile(source, metadata=metadata)
    return pf.read_pandas(columns=columns, nthreads=nthreads)
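A hedged round-trip sketch for read_pandas; the path is illustrative, and write_table is assumed to be exported by the same pyarrow.parquet module:

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'x': [1, 2, 3]},
                  index=pd.Index(['a', 'b', 'c'], name='key'))
pq.write_table(pa.Table.from_pandas(df), 'example.parquet')  # assumed writer

table = pq.read_pandas('example.parquet')  # index stored as extra column(s)
restored = table.to_pandas()               # index rebuilt from pandas metadata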
Code example #3
def read_table(source, columns=None, nthreads=1, metadata=None):
    """
    Read a Table from Parquet format

    Parameters
    ----------
    source: str or pyarrow.io.NativeFile
        Location of Parquet dataset. If a string is passed, it can be a single
        file name or directory name. For passing Python file objects or byte
        buffers, see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader.
    columns: list
        If not None, only these columns will be read from the file.
    nthreads : int, default 1
        Number of columns to read in parallel. Requires that the underlying
        file source is threadsafe
    metadata : FileMetaData
        If separately computed

    Returns
    -------
    pyarrow.Table
        Content of the file as a table (of columns)
    """
    from pyarrow.filesystem import LocalFilesystem

    if isinstance(source, six.string_types):
        fs = LocalFilesystem.get_instance()
        if fs.isdir(source):
            return fs.read_parquet(source, columns=columns, metadata=metadata)

    pf = ParquetFile(source, metadata=metadata)
    return pf.read(columns=columns, nthreads=nthreads)
Code example #4
File: test_parquet.py  Project: sudheeshkatkam/arrow
def _generate_partition_directories(base_dir, partition_spec, df):
    # partition_spec : list of lists, e.g. [['foo', [0, 1, 2]],
    #                                       ['bar', ['a', 'b', 'c']]]
    # part_table : a pyarrow.Table to write to each partition
    DEPTH = len(partition_spec)
    fs = LocalFilesystem.get_instance()

    def _visit_level(base_dir, level, part_keys):
        name, values = partition_spec[level]
        for value in values:
            this_part_keys = part_keys + [(name, value)]

            level_dir = pjoin(base_dir, '{0}={1}'.format(name, value))
            fs.mkdir(level_dir)

            if level == DEPTH - 1:
                # Generate example data
                file_path = pjoin(level_dir, 'data.parq')

                filtered_df = _filter_partition(df, this_part_keys)
                part_table = pa.Table.from_pandas(filtered_df)
                _write_table(part_table, file_path)
            else:
                _visit_level(level_dir, level + 1, this_part_keys)

    _visit_level(base_dir, 0, [])
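To make the recursion concrete, a sketch of one possible call (base_dir and df are assumed to already exist in the test), with the directory layout it produces shown as comments:

partition_spec = [['foo', [0, 1]], ['bar', ['a', 'b']]]
_generate_partition_directories(base_dir, partition_spec, df)

# Resulting hive-style layout, one data.parq per innermost partition:
#   base_dir/foo=0/bar=a/data.parq
#   base_dir/foo=0/bar=b/data.parq
#   base_dir/foo=1/bar=a/data.parq
#   base_dir/foo=1/bar=b/data.parq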
Code example #5
    def __init__(self,
                 path_or_paths,
                 filesystem=None,
                 schema=None,
                 metadata=None,
                 split_row_groups=False,
                 validate_schema=True):
        if filesystem is None:
            self.fs = LocalFilesystem.get_instance()
        else:
            self.fs = filesystem

        self.paths = path_or_paths

        (self.pieces, self.partitions,
         self.metadata_path) = _make_manifest(path_or_paths, self.fs)

        self.metadata = metadata
        self.schema = schema

        self.split_row_groups = split_row_groups

        if split_row_groups:
            raise NotImplementedError("split_row_groups not yet implemented")

        if validate_schema:
            self.validate_schemas()
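A minimal usage sketch for this constructor; the path is illustrative and the read() call is an assumption (the __init__ above only builds the piece manifest and validates schemas):

import pyarrow.parquet as pq

# Defaults to LocalFilesystem.get_instance() when no filesystem is passed.
dataset = pq.ParquetDataset('/data/partitioned_table')
table = dataset.read(columns=['foo', 'bar'])  # read() is assumed here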
Code example #6
File: parquet.py  Project: holdenk/arrow
    def __init__(self, dirpath, filesystem=None, pathsep='/',
                 partition_scheme='hive'):
        self.filesystem = filesystem or LocalFilesystem.get_instance()
        self.pathsep = pathsep
        self.dirpath = dirpath
        self.partition_scheme = partition_scheme
        self.partitions = ParquetPartitions()
        self.pieces = []

        self.common_metadata_path = None
        self.metadata_path = None

        self._visit_level(0, self.dirpath, [])
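A hedged sketch of inspecting the object after construction, assuming this is the ParquetManifest class from parquet.py and relying only on the attributes the __init__ above populates:

# dirpath is illustrative; _visit_level fills pieces and partitions.
manifest = ParquetManifest('/data/partitioned_table')
for piece in manifest.pieces:      # one entry per discovered Parquet file
    print(piece)
print(manifest.partitions)         # ParquetPartitions built from key=value dirs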
Code example #7
                           ArrowNotImplementedError, ArrowTypeError)

from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem
from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface, Buffer,
                        BufferReader, InMemoryOutputStream, MemoryMappedFile,
                        memory_map, frombuffer, read_tensor, write_tensor,
                        create_memory_map, get_record_batch_size,
                        get_tensor_size)

from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter

from pyarrow.memory import MemoryPool, total_allocated_bytes

from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType, BooleanValue,
                            Int8Value, Int16Value, Int32Value, Int64Value,
                            UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                            FloatValue, DoubleValue, ListValue, BinaryValue,
                            StringValue, FixedSizeBinaryValue)

import pyarrow.schema as _schema

from pyarrow.schema import (null, bool_, int8, int16, int32, int64, uint8,
                            uint16, uint32, uint64, timestamp, date32, date64,
                            float16, float32, float64, binary, string, decimal,
                            list_, struct, dictionary, field, DataType,
                            FixedSizeBinaryType, Field, Schema, schema)

from pyarrow.table import Column, RecordBatch, Table, concat_tables

localfs = LocalFilesystem.get_instance()
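The module-level localfs singleton can be used directly; a hedged sketch relying only on the isdir and read_parquet methods seen in the read_table example earlier (the path is illustrative):

import pyarrow as pa

# pa.localfs is the shared LocalFilesystem instance exported above.
if pa.localfs.isdir('/data/parquet_dir'):
    table = pa.localfs.read_parquet('/data/parquet_dir')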
Code example #8
File: __init__.py  Project: hdfeos/arrow
    pyarrow.set_memory_pool
    """
    from pyarrow._jemalloc import default_pool
    return default_pool()


from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem

from pyarrow.ipc import (RecordBatchFileReader, RecordBatchFileWriter,
                         RecordBatchStreamReader, RecordBatchStreamWriter,
                         open_stream,
                         open_file,
                         serialize_pandas, deserialize_pandas)


localfs = LocalFilesystem.get_instance()


# ----------------------------------------------------------------------
# 0.4.0 deprecations

import warnings

def _deprecate_class(old_name, new_name, klass, next_version='0.5.0'):
    msg = ('pyarrow.{0} has been renamed to '
           '{1}, will be removed in {2}'
           .format(old_name, new_name, next_version))
    def deprecated_factory(*args, **kwargs):
        warnings.warn(msg, FutureWarning)
        return klass(*args, **kwargs)
    return deprecated_factory
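A plausible way the factory is applied later in the same __init__.py; the specific renames are assumptions inferred from the 0.4.0 deprecation comment and the RecordBatch* imports above:

# Hypothetical aliases; calling the old name emits a FutureWarning.
FileReader = _deprecate_class('FileReader', 'RecordBatchFileReader',
                              RecordBatchFileReader)
StreamReader = _deprecate_class('StreamReader', 'RecordBatchStreamReader',
                                RecordBatchStreamReader)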
Code example #9
File: __init__.py  Project: kiril-me/arrow
                           NumericArray, IntegerArray, FloatingPointArray,
                           BooleanArray, Int8Array, UInt8Array, Int16Array,
                           UInt16Array, Int32Array, UInt32Array, Int64Array,
                           UInt64Array, ListArray, StringArray,
                           DictionaryArray)

from pyarrow.error import ArrowException

from pyarrow.filesystem import Filesystem, HdfsClient, LocalFilesystem
from pyarrow.io import (HdfsFile, NativeFile, PythonFileInterface, Buffer,
                        InMemoryOutputStream, BufferReader)

from pyarrow.ipc import FileReader, FileWriter, StreamReader, StreamWriter

from pyarrow.memory import MemoryPool, total_allocated_bytes

from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType, BooleanValue,
                            Int8Value, Int16Value, Int32Value, Int64Value,
                            UInt8Value, UInt16Value, UInt32Value, UInt64Value,
                            FloatValue, DoubleValue, ListValue, BinaryValue,
                            StringValue)

from pyarrow.schema import (null, bool_, int8, int16, int32, int64, uint8,
                            uint16, uint32, uint64, timestamp, date, float_,
                            double, binary, string, list_, struct, dictionary,
                            field, DataType, Field, Schema, schema)

from pyarrow.table import Column, RecordBatch, Table, concat_tables

localfs = LocalFilesystem()