Example #1
0
File: feather.py Project: rok/arrow
    def write(self, df):
        if isinstance(df, _pandas_api.pd.SparseDataFrame):
            df = df.to_dense()

        if not df.columns.is_unique:
            raise ValueError("cannot serialize duplicate column names")

        # TODO(wesm): Remove this length check, see ARROW-1732
        if len(df.columns) > 0:
            table = Table.from_pandas(df, preserve_index=False)
            for i, name in enumerate(table.schema.names):
                col = table[i]
                check_chunked_overflow(col)
                self.writer.write_array(name, col.data.chunk(0))

        self.writer.close()
Example #2
0
    def write(self, df):
        if isinstance(df, _pandas_api.pd.SparseDataFrame):
            df = df.to_dense()

        if not df.columns.is_unique:
            raise ValueError("cannot serialize duplicate column names")

        # TODO(wesm): Remove this length check, see ARROW-1732
        if len(df.columns) > 0:
            table = Table.from_pandas(df, preserve_index=False)
            for i, name in enumerate(table.schema.names):
                col = table[i]
                check_chunked_overflow(col)
                self.writer.write_array(name, col.chunk(0))

        self.writer.close()
Example #3
0
    def read(self, columns=None, nthreads=1):
        if columns is not None:
            column_set = set(columns)
        else:
            column_set = None

        columns = []
        names = []
        for i in range(self.num_columns):
            name = self.get_column_name(i)
            if column_set is None or name in column_set:
                col = self.get_column(i)
                columns.append(col)
                names.append(name)

        table = Table.from_arrays(columns, names=names)
        return table.to_pandas(nthreads=nthreads)
Example #4
0
def read_table(source, columns=None, memory_map=True):
    """
    Read a pyarrow.Table from Feather format

    Parameters
    ----------
    source : str file path, or file-like object
    columns : sequence, optional
        Only read a specific set of columns. If not provided, all columns are
        read.
    memory_map : boolean, default True
        Use memory mapping when opening file on disk

    Returns
    -------
    table : pyarrow.Table
    """
    reader = ext.FeatherReader()
    reader.open(source, use_memory_map=memory_map)

    if columns is None:
        return reader.read()

    column_types = [type(column) for column in columns]
    if all(map(lambda t: t == int, column_types)):
        table = reader.read_indices(columns)
    elif all(map(lambda t: t == str, column_types)):
        table = reader.read_names(columns)
    else:
        column_type_names = [t.__name__ for t in column_types]
        raise TypeError("Columns must be indices or names. "
                        "Got columns {} of types {}"
                        .format(columns, column_type_names))

    # Feather v1 already respects the column selection
    if reader.version < 3:
        return table
    # Feather v2 reads with sorted / deduplicated selection
    elif sorted(set(columns)) == columns:
        return table
    else:
        # follow exact order / selection of names
        new_fields = [table.schema.field(c) for c in columns]
        new_schema = schema(new_fields, metadata=table.schema.metadata)
        new_columns = [table.column(c) for c in columns]
        return Table.from_arrays(new_columns, schema=new_schema)
Example #5
0
    def read_table(self, columns=None):
        if columns is not None:
            column_set = set(columns)
        else:
            column_set = None

        columns = []
        names = []
        for i in range(self.num_columns):
            name = self.get_column_name(i)
            if column_set is None or name in column_set:
                col = self.get_column(i)
                columns.append(col)
                names.append(name)

        table = Table.from_arrays(columns, names=names)
        return table
Example #6
0
def write_feather(df, dest, compression=None, compression_level=None,
                  chunksize=None, version=2):
    """
    Write a pandas.DataFrame to Feather format.

    Parameters
    ----------
    df : pandas.DataFrame or pyarrow.Table
        Data to write out as Feather format.
    dest : str
        Local destination path.
    compression : string, default None
        Can be one of {"zstd", "lz4", "uncompressed"}. The default of None uses
        LZ4 for V2 files if it is available, otherwise uncompressed.
    compression_level : int, default None
        Use a compression level particular to the chosen compressor. If None
        use the default compression level
    chunksize : int, default None
        For V2 files, the internal maximum size of Arrow RecordBatch chunks
        when writing the Arrow IPC file format. None means use the default,
        which is currently 64K
    version : int, default 2
        Feather file version. Version 2 is the current. Version 1 is the more
        limited legacy format
    """
    if _pandas_api.have_pandas:
        _check_pandas_version()
        if (_pandas_api.has_sparse and
                isinstance(df, _pandas_api.pd.SparseDataFrame)):
            df = df.to_dense()

    if _pandas_api.is_data_frame(df):
        table = Table.from_pandas(df, preserve_index=False)

        if version == 1:
            # Version 1 does not chunking
            for i, name in enumerate(table.schema.names):
                col = table[i]
                check_chunked_overflow(name, col)
    else:
        table = df

    if version == 1:
        if len(table.column_names) > len(set(table.column_names)):
            raise ValueError("cannot serialize duplicate column names")

        if compression is not None:
            raise ValueError("Feather V1 files do not support compression "
                             "option")

        if chunksize is not None:
            raise ValueError("Feather V1 files do not support chunksize "
                             "option")
    else:
        if compression is None and Codec.is_available('lz4_frame'):
            compression = 'lz4'
        elif (compression is not None and
              compression not in _FEATHER_SUPPORTED_CODECS):
            raise ValueError('compression="{}" not supported, must be '
                             'one of {}'.format(compression,
                                                _FEATHER_SUPPORTED_CODECS))

    try:
        ext.write_feather(table, dest, compression=compression,
                          compression_level=compression_level,
                          chunksize=chunksize, version=version)
    except Exception:
        if isinstance(dest, str):
            try:
                os.remove(dest)
            except os.error:
                pass
        raise