def write(self, df):
    if isinstance(df, _pandas_api.pd.SparseDataFrame):
        df = df.to_dense()

    if not df.columns.is_unique:
        raise ValueError("cannot serialize duplicate column names")

    # TODO(wesm): Remove this length check, see ARROW-1732
    if len(df.columns) > 0:
        table = Table.from_pandas(df, preserve_index=False)
        for i, name in enumerate(table.schema.names):
            col = table[i]
            check_chunked_overflow(col)
            self.writer.write_array(name, col.chunk(0))

    self.writer.close()
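# A minimal sketch of what the per-column loop above operates on, using only
# the public pyarrow API; the DataFrame contents here are made up for
# illustration.
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
table = pa.Table.from_pandas(df, preserve_index=False)

for i, name in enumerate(table.schema.names):
    col = table[i]              # a pyarrow.ChunkedArray
    assert col.num_chunks == 1  # the condition check_chunked_overflow enforces
    chunk = col.chunk(0)        # the contiguous Array handed to the writer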
def read(self, columns=None, nthreads=1):
    if columns is not None:
        column_set = set(columns)
    else:
        column_set = None

    columns = []
    names = []
    for i in range(self.num_columns):
        name = self.get_column_name(i)
        if column_set is None or name in column_set:
            col = self.get_column(i)
            columns.append(col)
            names.append(name)

    table = Table.from_arrays(columns, names=names)
    return table.to_pandas(nthreads=nthreads)
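# A self-contained sketch of the selection logic above, built on the public
# pyarrow API; the arrays, names, and selection are hypothetical.
import pyarrow as pa

arrays = [pa.array([1, 2]), pa.array([3.0, 4.0]), pa.array(['u', 'v'])]
names = ['x', 'y', 'z']
column_set = {'z', 'x'}

kept_arrays = []
kept_names = []
for name, arr in zip(names, arrays):
    if name in column_set:
        kept_arrays.append(arr)
        kept_names.append(name)

table = pa.Table.from_arrays(kept_arrays, names=kept_names)
df = table.to_pandas()  # columns arrive in file order: 'x', then 'z'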
def read_table(source, columns=None, memory_map=True):
    """
    Read a pyarrow.Table from Feather format

    Parameters
    ----------
    source : str file path, or file-like object
    columns : sequence, optional
        Only read a specific set of columns. If not provided,
        all columns are read.
    memory_map : boolean, default True
        Use memory mapping when opening file on disk

    Returns
    -------
    table : pyarrow.Table
    """
    reader = ext.FeatherReader()
    reader.open(source, use_memory_map=memory_map)

    if columns is None:
        return reader.read()

    column_types = [type(column) for column in columns]
    if all(map(lambda t: t == int, column_types)):
        table = reader.read_indices(columns)
    elif all(map(lambda t: t == str, column_types)):
        table = reader.read_names(columns)
    else:
        column_type_names = [t.__name__ for t in column_types]
        raise TypeError("Columns must be indices or names. "
                        "Got columns {} of types {}"
                        .format(columns, column_type_names))

    # Feather v1 already respects the column selection
    if reader.version < 3:
        return table
    # Feather v2 reads with sorted / deduplicated selection
    elif sorted(set(columns)) == columns:
        return table
    else:
        # follow exact order / selection of names
        new_fields = [table.schema.field(c) for c in columns]
        new_schema = schema(new_fields, metadata=table.schema.metadata)
        new_columns = [table.column(c) for c in columns]
        return Table.from_arrays(new_columns, schema=new_schema)
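# Example call; 'data.feather' is a hypothetical path. For a V2 file the
# final branch above restores the caller's exact column order, even though
# the reader itself returns a sorted, deduplicated selection.
import pyarrow.feather as feather

table = feather.read_table('data.feather', columns=['b', 'a'])
print(table.column_names)  # ['b', 'a'], matching the requested order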
def read_table(self, columns=None):
    if columns is not None:
        column_set = set(columns)
    else:
        column_set = None

    columns = []
    names = []
    for i in range(self.num_columns):
        name = self.get_column_name(i)
        if column_set is None or name in column_set:
            col = self.get_column(i)
            columns.append(col)
            names.append(name)

    table = Table.from_arrays(columns, names=names)
    return table
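# Note on the method above: it walks the file's columns in storage order,
# so a selection like columns=['b', 'a'] comes back in file order ('a',
# then 'b'), not request order; e.g. (reader object hypothetical):
#
#   table = legacy_reader.read_table(columns=['b', 'a'])
#   table.column_names  # ['a', 'b'] -- file order
#
# The final branch of the module-level read_table exists precisely to
# restore the caller's order for V2 files.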
def write_feather(df, dest, compression=None, compression_level=None,
                  chunksize=None, version=2):
    """
    Write a pandas.DataFrame to Feather format.

    Parameters
    ----------
    df : pandas.DataFrame or pyarrow.Table
        Data to write out as Feather format.
    dest : str
        Local destination path.
    compression : string, default None
        Can be one of {"zstd", "lz4", "uncompressed"}. The default of None
        uses LZ4 for V2 files if it is available, otherwise uncompressed.
    compression_level : int, default None
        Use a compression level particular to the chosen compressor. If None,
        use the default compression level.
    chunksize : int, default None
        For V2 files, the internal maximum size of Arrow RecordBatch chunks
        when writing the Arrow IPC file format. None means use the default,
        which is currently 64K.
    version : int, default 2
        Feather file version. Version 2 is the current. Version 1 is the
        more limited legacy format.
    """
    if _pandas_api.have_pandas:
        _check_pandas_version()
        if (_pandas_api.has_sparse and
                isinstance(df, _pandas_api.pd.SparseDataFrame)):
            df = df.to_dense()

    if _pandas_api.is_data_frame(df):
        table = Table.from_pandas(df, preserve_index=False)

        if version == 1:
            # Version 1 does not support chunking; verify each column is a
            # single chunk
            for i, name in enumerate(table.schema.names):
                col = table[i]
                check_chunked_overflow(name, col)
    else:
        table = df

    if version == 1:
        if len(table.column_names) > len(set(table.column_names)):
            raise ValueError("cannot serialize duplicate column names")

        if compression is not None:
            raise ValueError("Feather V1 files do not support compression "
                             "option")

        if chunksize is not None:
            raise ValueError("Feather V1 files do not support chunksize "
                             "option")
    else:
        if compression is None and Codec.is_available('lz4_frame'):
            compression = 'lz4'
        elif (compression is not None and
              compression not in _FEATHER_SUPPORTED_CODECS):
            raise ValueError('compression="{}" not supported, must be '
                             'one of {}'.format(compression,
                                                _FEATHER_SUPPORTED_CODECS))

    try:
        ext.write_feather(table, dest, compression=compression,
                          compression_level=compression_level,
                          chunksize=chunksize, version=version)
    except Exception:
        # Best effort: remove a partially written file before re-raising
        if isinstance(dest, str):
            try:
                os.remove(dest)
            except os.error:
                pass
        raise
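# Example calls; the paths and the frame are hypothetical.
import pandas as pd
import pyarrow.feather as feather

df = pd.DataFrame({'a': [1, 2, 3]})

# V2 (default): compression may be 'zstd', 'lz4', or 'uncompressed';
# leaving it as None picks LZ4 when the codec is available.
feather.write_feather(df, 'data.feather', compression='zstd')

# V1 (legacy): no compression or chunksize options are accepted.
feather.write_feather(df, 'data_v1.feather', version=1)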