Example #1
0
    def _read_parquet_with_type_promotion_override(chunk, columns, parquet_engine):
        """
        Read a parquet chunk, working around fastparquet's float16 type promotion.

        When using fastparquet, boolean columns containing None values are promoted
        to float16 columns. This is inconsistent with Pandas and Pyarrow, which
        promote such columns to object columns, and it is problematic when users
        want to write the dataframe back to parquet because float16 (halffloat) is
        not a supported type in the parquet spec. This function detects such
        columns and overrides the promotion back to object/bool-with-None.

        :param chunk: path-like pointing at the parquet file to read
        :param columns: list of column names to read
        :param parquet_engine: engine name passed through to pd.read_parquet()
        :return: a pandas DataFrame with boolean columns restored to object dtype
        """
        if parquet_engine == 'fastparquet':
            from fastparquet import ParquetFile as _ParquetFile
            import fastparquet.thrift_structures as _ts

            # https://github.com/dask/fastparquet/issues/414#issuecomment-478983811
            df = _pd.read_parquet(chunk, columns=columns, engine=parquet_engine, index=False)
            df_column_types = df.dtypes
            pf = _ParquetFile(chunk)
            schema_column_dtypes = {elem.name: elem.type for elem in pf.schema.schema_elements}

            for idx in df_column_types[df_column_types == 'float16'].index.tolist():
                # A hacky way to get the string representations of the column types of a parquet schema
                # Reference:
                # https://github.com/dask/fastparquet/blob/f4ecc67f50e7bf98b2d0099c9589c615ea4b06aa/fastparquet/schema.py
                if _ts.parquet_thrift.Type._VALUES_TO_NAMES[schema_column_dtypes[idx]] == "BOOLEAN":
                    # pd.np was removed in pandas 2.0, so use float('nan') as the NaN
                    # key (pandas' replace() matches NaN keys specially). Assign the
                    # result back rather than calling replace(inplace=True) on a
                    # column selection, which may mutate a temporary copy.
                    df[idx] = df[idx].astype('object').replace(
                        {0: False, 1: True, float('nan'): None})
        else:
            df = _pd.read_parquet(chunk, columns=columns, engine=parquet_engine)

        return df
Example #2
0
    def _read(self, chunk: os.PathLike, columns: typing.List[str],
              **kwargs) -> pandas.DataFrame:
        """
        Read a parquet chunk with fastparquet, undoing its float16 type promotion.

        fastparquet promotes boolean columns containing None to float16, which is
        not a type the parquet spec supports on write-back; such columns are
        detected via the file's thrift schema and converted back to object dtype
        holding False/True/None.

        :param chunk: path to the parquet file to read
        :param columns: column names to read
        :param kwargs: accepted for interface compatibility; not forwarded
        :return: DataFrame with boolean columns restored to object dtype
        """
        from fastparquet import ParquetFile as _ParquetFile
        from fastparquet import thrift_structures as _ts

        # TODO Follow up to figure out if this is not needed anymore
        # https://github.com/dask/fastparquet/issues/414#issuecomment-478983811
        df = pandas.read_parquet(chunk,
                                 columns=columns,
                                 engine=self.PARQUET_ENGINE,
                                 index=False)
        df_column_types = df.dtypes
        pf = _ParquetFile(chunk)
        schema_column_dtypes = {
            elem.name: elem.type
            for elem in pf.schema.schema_elements
        }

        for idx in df_column_types[df_column_types ==
                                   "float16"].index.tolist():
            # A hacky way to get the string representations of the column types of a parquet schema
            # Reference:
            # https://github.com/dask/fastparquet/blob/f4ecc67f50e7bf98b2d0099c9589c615ea4b06aa/fastparquet/schema.py
            if _ts.parquet_thrift.Type._VALUES_TO_NAMES[
                    schema_column_dtypes[idx]] == "BOOLEAN":
                # pandas.np was removed in pandas 2.0, so use float("nan") as the
                # NaN key (replace() matches NaN keys specially). Assign back
                # instead of replace(inplace=True) on a column selection, which
                # may mutate a temporary copy.
                df[idx] = df[idx].astype("object").replace({
                    0: False,
                    1: True,
                    float("nan"): None
                })
        return df
Example #3
0
 def _read(self, chunk: os.PathLike, columns: typing.List[str],
           **kwargs) -> pandas.DataFrame:
     """Read a parquet chunk into a DataFrame using the configured engine.

     :param chunk: path to the parquet file to read
     :param columns: column names to read
     :param kwargs: extra keyword arguments forwarded to pandas.read_parquet()
     :return: the loaded DataFrame
     """
     read_options = dict(kwargs)
     read_options["columns"] = columns
     read_options["engine"] = self.PARQUET_ENGINE
     return pandas.read_parquet(chunk, **read_options)