def _get_null_counts(self, path, columns=None):
    """Return the null count of each column in the feather file at *path*.

    If *columns* is given, only columns whose name appears in it are
    included; otherwise every column contributes a count.
    """
    reader = FeatherReader(path)
    all_columns = (reader.get_column(idx) for idx in range(reader.num_columns))
    return [
        column.null_count
        for column in all_columns
        if columns is None or column.name in columns
    ]
def test_num_columns_attr(self):
    """num_columns must reflect how many columns were written."""
    frames = [
        pd.DataFrame({}),
        pd.DataFrame({'foo': [1, 2, 3, 4, 5]}),
        pd.DataFrame({'foo': [1, 2, 3, 4, 5], 'bar': [1, 2, 3, 4, 5]}),
    ]
    for expected_ncols, frame in enumerate(frames):
        path = random_path()
        self.test_files.append(path)
        write_feather(frame, path)
        reader = FeatherReader(path)
        assert reader.num_columns == expected_ncols
def test_file_not_exist(self):
    """Opening a path that does not exist must raise ArrowIOError."""
    self.assertRaises(pa.ArrowIOError, FeatherReader, 'test_invalid_file')
def test_file_not_exist(self):
    """Opening a path that does not exist must raise ArrowIOError."""
    missing_path = 'test_invalid_file'
    with pytest.raises(pa.ArrowIOError):
        FeatherReader(missing_path)
def load(self, gs: Type[GeneSignature]) -> pd.DataFrame:
    """Load only the index column plus the columns for the signature's genes."""
    wanted_columns = (INDEX_NAME, ) + gs.genes
    frame = FeatherReader(self._fname).read_pandas(columns=wanted_columns)
    return frame.set_index(INDEX_NAME)
def load_full(self) -> pd.DataFrame:
    """Load the entire database as a DataFrame indexed by ``INDEX_NAME``.

    Returns:
        The full contents of the feather file, with the ``INDEX_NAME``
        column promoted to the index.
    """
    df = FeatherReader(self._fname).read_pandas()
    # set_index(..., inplace=True) avoids copying the whole dataframe,
    # which roughly halves load time when the file is already in the
    # filesystem cache (matches the sibling in-place implementation).
    df.set_index(INDEX_NAME, inplace=True)
    return df
def genes(self) -> Tuple[str]:
    """Return the names of all gene columns, excluding the index column.

    Returns:
        A tuple with one name per gene column, in file order.
    """
    # noinspection PyTypeChecker
    reader = FeatherReader(self._fname)
    # Fetch each column name exactly once; the original asked the reader
    # twice per index (once to filter, once to yield).
    names = (reader.get_column_name(idx) for idx in range(self.total_genes))
    return tuple(name for name in names if name != INDEX_NAME)
def total_genes(self) -> int:
    """Number of gene columns: the total column count minus the index column."""
    reader = FeatherReader(self._fname)
    return reader.num_columns - 1
def read_feather(cls, path, columns=None, use_threads=True):
    """Read a pandas.DataFrame from Feather format.

    Ray DataFrame only supports pyarrow engine for now.

    Args:
        path: The filepath of the feather file. We only support local
            files for now. multi threading is set to True by default.
        columns: not supported by pandas api, but can be passed here to
            read only specific columns.
        use_threads: Whether or not to use threads when reading.

    Notes:
        pyarrow feather is used. Please refer to the documentation here
        https://arrow.apache.org/docs/python/api.html#feather-format
    """
    # No remote task registered: fall back to the default (non-Ray) reader.
    if cls.read_feather_remote_task is None:
        return super(RayIO, cls).read_feather(
            path, columns=columns, use_threads=use_threads
        )

    if columns is None:
        # Discover the full column list from the file's metadata so the
        # frame can be partitioned by columns below.
        from pyarrow.feather import FeatherReader

        fr = FeatherReader(path)
        columns = [fr.get_column_name(i) for i in range(fr.num_columns)]

    from modin.pandas import DEFAULT_NPARTITIONS

    num_partitions = DEFAULT_NPARTITIONS
    num_splits = min(len(columns), num_partitions)
    # Each item in this list will be a list of column names of the original df
    # (column_splits is the ceiling of len(columns) / num_partitions).
    column_splits = (
        len(columns) // num_partitions
        if len(columns) % num_partitions == 0
        else len(columns) // num_partitions + 1
    )
    col_partitions = [
        columns[i : i + column_splits]
        for i in range(0, len(columns), column_splits)
    ]
    # One remote read per column partition; each task returns num_splits
    # block objects plus one extra value (used below as the index length).
    # Transposing groups the i-th return of every task into one row.
    blk_partitions = np.array(
        [
            cls.read_feather_remote_task._remote(
                args=(path, cols, num_splits), num_return_vals=num_splits + 1
            )
            for cols in col_partitions
        ]
    ).T
    # All rows except the last are data blocks; wrap them as frame partitions.
    remote_partitions = np.array(
        [
            [cls.frame_partition_cls(obj) for obj in row]
            for row in blk_partitions[:-1]
        ]
    )
    # The last row holds the per-task extra values; the first one gives the
    # row count used to build a RangeIndex for the whole frame.
    index_len = ray.get(blk_partitions[-1][0])
    index = pandas.RangeIndex(index_len)
    new_query_compiler = cls.query_compiler_cls(
        cls.frame_cls(remote_partitions, index, columns)
    )
    return new_query_compiler
def load_full(self) -> pd.DataFrame:
    """Load the complete database as a DataFrame indexed by ``INDEX_NAME``."""
    frame = FeatherReader(self._fname).read_pandas()
    # Replacing the index in place avoids copying the whole dataframe, which
    # makes loading a database twice as fast when the file is already in the
    # filesystem cache.
    frame.set_index(INDEX_NAME, inplace=True)
    return frame
def total_genes(self) -> int:
    """Count the gene columns.

    One column is subtracted because it holds the index with the names of
    the features rather than gene data.
    """
    n_columns = FeatherReader(self._fname).num_columns
    return n_columns - 1