Beispiel #1
0
    def _get_null_counts(self, path, columns=None):
        """Collect the null count of each selected column of a Feather file.

        Args:
            path: Location of the Feather file handed to ``FeatherReader``.
            columns: Optional container of column names to keep. When it is
                ``None`` every column of the file is included.

        Returns:
            A list with the ``null_count`` of every kept column, in the order
            the columns appear in the file.
        """
        feather = FeatherReader(path)
        every_column = (
            feather.get_column(position)
            for position in range(feather.num_columns)
        )
        return [
            column.null_count
            for column in every_column
            if columns is None or column.name in columns
        ]
Beispiel #2
0
    def test_num_columns_attr(self):
        """Check ``FeatherReader.num_columns`` for frames of 0, 1 and 2 columns."""
        # The position of each frame in this list equals its column count.
        frames = [
            pd.DataFrame({}),
            pd.DataFrame({'foo': [1, 2, 3, 4, 5]}),
            pd.DataFrame({'foo': [1, 2, 3, 4, 5], 'bar': [1, 2, 3, 4, 5]}),
        ]
        for expected_columns, frame in enumerate(frames):
            target = random_path()
            # Register the file on the test case before writing it.
            self.test_files.append(target)
            write_feather(frame, target)

            assert FeatherReader(target).num_columns == expected_columns
Beispiel #3
0
 def test_file_not_exist(self):
     """Opening a path that does not exist must raise ``pa.ArrowIOError``."""
     missing_path = 'test_invalid_file'
     with self.assertRaises(pa.ArrowIOError):
         FeatherReader(missing_path)
Beispiel #4
0
 def test_file_not_exist(self):
     """Opening a path that does not exist must raise ``pa.ArrowIOError``."""
     missing_path = 'test_invalid_file'
     with pytest.raises(pa.ArrowIOError):
         FeatherReader(missing_path)
Beispiel #5
0
 def load(self, gs: Type[GeneSignature]) -> pd.DataFrame:
     """Load only the columns for the genes of a signature.

     The ``INDEX_NAME`` column is read together with ``gs.genes`` and then
     becomes the index of the returned frame.

     :param gs: Signature whose ``genes`` tuple selects the columns to read.
     :return: The selected columns as a dataframe indexed by ``INDEX_NAME``.
     """
     reader = FeatherReader(self._fname)
     wanted_columns = (INDEX_NAME, ) + gs.genes
     frame = reader.read_pandas(columns=wanted_columns)
     return frame.set_index(INDEX_NAME)
Beispiel #6
0
 def load_full(self) -> pd.DataFrame:
     """Load every column of the Feather file, indexed by ``INDEX_NAME``.

     :return: A new dataframe whose index is the ``INDEX_NAME`` column.
     """
     reader = FeatherReader(self._fname)
     frame = reader.read_pandas()
     return frame.set_index(INDEX_NAME)
Beispiel #7
0
 def genes(self) -> Tuple[str]:
     """Return the names of all gene columns stored in the Feather file.

     Every column of the file is inspected and only the one named
     ``INDEX_NAME`` is left out. The scan deliberately covers
     ``reader.num_columns`` columns rather than ``self.total_genes``
     (which is one less): stopping one column early while also filtering
     out the index column would silently drop the last gene whenever the
     index column is not the final column of the file.

     :return: Column names in file order, without ``INDEX_NAME``.
     """
     # noinspection PyTypeChecker
     reader = FeatherReader(self._fname)
     # Look each name up once; the filter below works on the cached value.
     column_names = (
         reader.get_column_name(idx) for idx in range(reader.num_columns)
     )
     return tuple(name for name in column_names if name != INDEX_NAME)
Beispiel #8
0
 def total_genes(self) -> int:
     """Return the number of columns in the Feather file minus one."""
     column_count = FeatherReader(self._fname).num_columns
     return column_count - 1
Beispiel #9
0
Datei: io.py Projekt: xrmx/modin
    def read_feather(cls, path, columns=None, use_threads=True):
        """Read a pandas.DataFrame from Feather format.
           Ray DataFrame only supports pyarrow engine for now.

        Args:
            path: The filepath of the feather file.
                  We only support local files for now.
            columns: not supported by pandas api, but can be passed here to read only
                specific columns. When None, every column name found in the
                file is used.
            use_threads: Whether or not to use threads when reading;
                multi threading is set to True by default.
                NOTE(review): this value is only forwarded to the fallback
                implementation; the remote-task path below never references it.

        Returns:
            The object built by ``cls.query_compiler_cls`` around the frame of
            remote partitions, or the parent implementation's result when no
            remote task is configured.

        Notes:
            pyarrow feather is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/api.html#feather-format
        """
        # Without a remote read task, defer entirely to the parent implementation.
        if cls.read_feather_remote_task is None:
            return super(RayIO, cls).read_feather(
                path, columns=columns, use_threads=use_threads
            )

        # No explicit selection: enumerate every column name in the file.
        if columns is None:
            from pyarrow.feather import FeatherReader

            fr = FeatherReader(path)
            columns = [fr.get_column_name(i) for i in range(fr.num_columns)]

        from modin.pandas import DEFAULT_NPARTITIONS

        num_partitions = DEFAULT_NPARTITIONS
        num_splits = min(len(columns), num_partitions)
        # Number of columns per group: len(columns) / num_partitions, rounded up.
        # NOTE(review): for an empty `columns` this evaluates to 0, and the
        # range() below would then raise ValueError (zero step) - confirm
        # whether callers can pass an empty selection.
        column_splits = (
            len(columns) // num_partitions
            if len(columns) % num_partitions == 0
            else len(columns) // num_partitions + 1
        )
        # Each item in this list will be a list of column names of the original df
        col_partitions = [
            columns[i : i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        # One remote call per column group, each asked for num_splits + 1 return
        # values; the transpose turns the per-group results into columns of the
        # array.
        blk_partitions = np.array(
            [
                cls.read_feather_remote_task._remote(
                    args=(path, cols, num_splits), num_return_vals=num_splits + 1
                )
                for cols in col_partitions
            ]
        ).T
        # Every row except the last is wrapped into frame partitions.
        remote_partitions = np.array(
            [
                [cls.frame_partition_cls(obj) for obj in row]
                for row in blk_partitions[:-1]
            ]
        )
        # The first entry of the last row is fetched and used as the length of
        # the RangeIndex (presumably the row count reported by the remote task -
        # verify against the task's implementation).
        index_len = ray.get(blk_partitions[-1][0])
        index = pandas.RangeIndex(index_len)
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_cls(remote_partitions, index, columns)
        )
        return new_query_compiler
Beispiel #10
0
 def load_full(self) -> pd.DataFrame:
     """Load every column of the Feather file, indexed by ``INDEX_NAME``.

     :return: The dataframe read from the file with ``INDEX_NAME`` as index.
     """
     frame = FeatherReader(self._fname).read_pandas()
     # Swap the index in place instead of building a re-indexed copy of the
     # whole dataframe. According to the original author's note this makes
     # loading a database twice as fast when the database file is already in
     # the filesystem cache.
     frame.set_index(INDEX_NAME, inplace=True)
     return frame
Beispiel #11
0
 def total_genes(self) -> int:
     """Return the number of columns in the Feather file minus one."""
     column_count = FeatherReader(self._fname).num_columns
     # One column holds the index with the names of the features, so it is
     # left out of the count.
     return column_count - 1