Example 1
import os

import numpy as np
import pandas as pd

# NOTE: the parquet_thrift import path can vary between fastparquet versions.
from fastparquet import ParquetFile, parquet_thrift, write


def test_autocat(tempdir):
    tmp = str(tempdir)
    fn = os.path.join(tmp, "test.parq")
    data = pd.DataFrame(
        {'o': pd.Categorical(np.random.choice(['hello', 'world'], size=1000))})
    write(fn, data)
    pf = ParquetFile(fn)
    assert 'o' in pf.categories
    assert pf.categories['o'] == 2
    assert pf.dtypes['o'] == 'category'
    out = pf.to_pandas()
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={})
    assert str(out.dtypes['o']) != 'category'
    out = pf.to_pandas(categories=['o'])
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={'o': 2})
    assert out.dtypes['o'] == 'category'

    # regression test: category info stored in the file's key-value metadata
    # ("fastparquet.cats") should still be picked up after _set_attrs()
    pf.fmd.key_value_metadata = [
        parquet_thrift.KeyValue(key='fastparquet.cats', value='{"o": 2}')
    ]
    pf._set_attrs()
    assert 'o' in pf.categories
    assert pf.categories['o'] == 2
    assert pf.dtypes['o'] == 'category'
    out = pf.to_pandas()
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={})
    assert str(out.dtypes['o']) != 'category'
    out = pf.to_pandas(categories=['o'])
    assert out.dtypes['o'] == 'category'
    out = pf.to_pandas(categories={'o': 2})
    assert out.dtypes['o'] == 'category'
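
A minimal standalone sketch of the behaviour this test exercises: a pandas
Categorical column written with fastparquet comes back as category dtype by
default, and passing categories={} to to_pandas opts out. The temporary file
location below is illustrative only.

import os
import tempfile

import numpy as np
import pandas as pd
from fastparquet import ParquetFile, write

path = os.path.join(tempfile.mkdtemp(), "cats.parq")
df = pd.DataFrame(
    {"o": pd.Categorical(np.random.choice(["hello", "world"], size=1000))})
write(path, df)

pf = ParquetFile(path)
print(pf.categories)                                   # e.g. {'o': 2}
print(pf.to_pandas().dtypes["o"])                      # category (the default)
print(pf.to_pandas(categories={}).dtypes["o"])         # object: opted out
print(pf.to_pandas(categories={"o": 2}).dtypes["o"])   # category again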
Example 2
    def read_partition(cls, fs, piece, columns, index, categories=(),
                       **kwargs):

        null_index_name = False
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        if isinstance(piece, tuple):
            if isinstance(piece[0], str):
                # We have a path to read from
                assert parquet_file is None
                parquet_file = ParquetFile(piece[0],
                                           open_with=fs.open,
                                           sep=fs.sep,
                                           **kwargs.get("file", {}))
                rg_indices = piece[1] or list(
                    range(len(parquet_file.row_groups)))

                # `piece[1]` will contain row-group indices
                row_groups = [parquet_file.row_groups[rg] for rg in rg_indices]
            elif parquet_file:
                # `piece[1]` will contain actual row-group objects,
                # but they may be pickled
                row_groups = piece[0]
                if isinstance(row_groups, bytes):
                    row_groups = pickle.loads(row_groups)
                parquet_file.fmd.row_groups = row_groups
                # NOTE: May lose cats after `_set_attrs` call
                save_cats = parquet_file.cats
                parquet_file._set_attrs()
                parquet_file.cats = save_cats
            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            parquet_file._dtypes = lambda *args: parquet_file.dtypes  # ugly patch, could be fixed

            # Read necessary row-groups and concatenate
            dfs = []
            for row_group in row_groups:
                dfs.append(
                    parquet_file.read_row_group_file(
                        row_group,
                        columns,
                        categories,
                        index=index,
                        **kwargs.get("read", {}),
                    ))
            return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `piece` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(piece)}")
Example 3
    def read_partition(
        cls,
        fs,
        pieces,
        columns,
        index,
        categories=(),
        root_cats=None,
        root_file_scheme=None,
        base_path=None,
        **kwargs,
    ):

        null_index_name = False
        base_path = False if not root_cats else base_path
        if isinstance(index, list):
            if index == [None]:
                # Handling a None-labeled index...
                # The pandas metadata told us to read in an index
                # labeled `None`. If this corresponds to a `RangeIndex`,
                # fastparquet will need to use the pandas metadata to
                # construct the index. Otherwise, the index will correspond
                # to a column named "__index_level_0__".  We will need to
                # check the `ParquetFile` object for this column below.
                index = []
                null_index_name = True
            columns += index

        # Use global `parquet_file` object.  Need to reattach
        # the desired row_group
        parquet_file = kwargs.pop("parquet_file", None)

        # Always convert pieces to list
        if not isinstance(pieces, list):
            pieces = [pieces]

        sample = pieces[0]
        if isinstance(sample, tuple):
            if isinstance(sample[0], str):
                # We have paths to read from
                assert parquet_file is None

                row_groups = []
                rg_offset = 0
                parquet_file = ParquetFile(
                    [p[0] for p in pieces],
                    open_with=fs.open,
                    root=base_path or False,
                    **kwargs.get("file", {}),
                )
                for piece in pieces:
                    _pf = (parquet_file if len(pieces) == 1 else ParquetFile(
                        piece[0],
                        open_with=fs.open,
                        root=base_path or False,
                        **kwargs.get("file", {}),
                    ))
                    n_local_row_groups = len(_pf.row_groups)
                    local_rg_indices = piece[1] or list(
                        range(n_local_row_groups))
                    row_groups += [
                        parquet_file.row_groups[rg + rg_offset]
                        for rg in local_rg_indices
                    ]
                    rg_offset += n_local_row_groups
                update_parquet_file = len(row_groups) < len(
                    parquet_file.row_groups)

            elif parquet_file:

                row_groups = []
                for piece in pieces:
                    # `piece[1]` will contain actual row-group objects,
                    # but they may be pickled
                    rgs = piece[0]
                    if isinstance(rgs, bytes):
                        rgs = pickle.loads(rgs)
                    row_groups += rgs
                update_parquet_file = True

            else:
                raise ValueError("Neither path nor ParquetFile detected!")

            if update_parquet_file:
                with _FP_FILE_LOCK:
                    parquet_file.fmd.row_groups = row_groups
                    # NOTE: May lose cats after `_set_attrs` call
                    save_cats = parquet_file.cats
                    parquet_file._set_attrs()
                    parquet_file.cats = save_cats

            if null_index_name:
                if "__index_level_0__" in parquet_file.columns:
                    # See "Handling a None-labeled index" comment above
                    index = ["__index_level_0__"]
                    columns += index

            # Update hive-partitioning information if necessary
            parquet_file.cats = root_cats or {}
            if root_cats:
                parquet_file.file_scheme = root_file_scheme

            parquet_file._dtypes = lambda *args: parquet_file.dtypes  # ugly patch, could be fixed

            if set(columns).issubset(parquet_file.columns +
                                     list(parquet_file.cats.keys())):
                # Convert ParquetFile to pandas
                return parquet_file.to_pandas(
                    columns=columns,
                    categories=categories,
                    index=index,
                )
            else:
                # Read necessary row-groups and concatenate
                dfs = []
                for row_group in row_groups:
                    dfs.append(
                        parquet_file.read_row_group_file(
                            row_group,
                            columns,
                            categories,
                            index=index,
                            **kwargs.get("read", {}),
                        ))
                return concat(dfs, axis=0) if len(dfs) > 1 else dfs[0]

        else:
            # `sample` is NOT a tuple
            raise ValueError(f"Expected tuple, got {type(sample)}")