Example #1
 def test_astype(self):
     pdf, psdf = self.pdf, self.psdf
     for col in self.numeric_df_cols:
         pser, psser = pdf[col], psdf[col]
         self.assert_eq(pser.astype(int), psser.astype(int))
         self.assert_eq(pser.astype(float), psser.astype(float))
         self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
         self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
         self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
         self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
         self.assert_eq(pser.astype(str), psser.astype(str))
         self.assert_eq(pser.astype(bool), psser.astype(bool))
         self.assert_eq(pser.astype("category"), psser.astype("category"))
         cat_type = CategoricalDtype(categories=[2, 1, 3])
         self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
     self.assertRaisesRegex(
         ValueError,
         "Cannot convert fractions with missing values to integer",
         lambda: self.float_withnan_psser.astype(int),
     )
     self.assertRaisesRegex(
         ValueError,
         "Cannot convert fractions with missing values to integer",
         lambda: self.float_withnan_psser.astype(np.int32),
     )
     self.assert_eq(self.float_withnan_pser.astype(str), self.float_withnan_psser.astype(str))
     self.assert_eq(self.float_withnan_pser.astype(bool), self.float_withnan_psser.astype(bool))
     self.assert_eq(
         self.float_withnan_pser.astype("category"), self.float_withnan_psser.astype("category")
     )
     if extension_object_dtypes_available and extension_float_dtypes_available:
         pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
         psser = ps.from_pandas(pser)
         self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
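A note on the ValueError branches above: NumPy-backed integer dtypes have no missing-value slot, so the cast from a float Series containing NaN has to fail. A minimal pandas-only sketch of the same constraint and the nullable workaround (not part of the test suite quoted here):

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan])
try:
    s.astype(int)  # NumPy int cannot represent NaN
except ValueError as err:
    print(err)  # "Cannot convert non-finite values (NA or inf) to integer"
print(s.astype("Int64"))  # nullable Int64 keeps the missing value as <NA>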
Example #2
    def test_astype(self):
        pdf, psdf = self.pdf, self.psdf
        for col in self.numeric_df_cols:
            pser, psser = pdf[col], psdf[col]

            for int_type in [int, np.int32, np.int16, np.int8]:
                if not pser.hasnans:
                    self.assert_eq(pser.astype(int_type),
                                   psser.astype(int_type))
                else:
                    self.assertRaisesRegex(
                        ValueError,
                        "Cannot convert %s with missing "
                        "values to integer" % psser._dtype_op.pretty_name,
                        lambda: psser.astype(int_type),
                    )

            # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
            if not pser.hasnans:
                self.assert_eq(pser.astype(bool), psser.astype(bool))

            self.assert_eq(pser.astype(float), psser.astype(float))
            self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
            self.assert_eq(pser.astype(str), psser.astype(str))
            self.assert_eq(pser.astype("category"), psser.astype("category"))
            cat_type = CategoricalDtype(categories=[2, 1, 3])
            self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
        if extension_object_dtypes_available and extension_float_dtypes_available:
            pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]),
                             dtype=pd.Float64Dtype())
            psser = ps.from_pandas(pser)
            self.assert_eq(pser.astype(pd.BooleanDtype()),
                           psser.astype(pd.BooleanDtype()))
Example #3
def test_boolean_conversion(s, expected_category, bool_map):
    assert _infer_bool_type(s) == expected_category

    if pd.isna(s[2]):
        expected = pd.Series([True, False, pd.NA], dtype=pd.BooleanDtype())
    else:
        expected = pd.Series([True, False, True], dtype=pd.BooleanDtype())
    actual = convert_to_bool_series(s, True, bool_map=bool_map)
    assert_series_equal(expected, actual)
Example #4
def test_meta():
    values = pd.array([True, False, None], dtype="boolean")
    ds = dd.from_pandas(pd.Series(values), 2)
    assert ds.dtype == pd.BooleanDtype()

    dd.utils.assert_eq(ds._meta_nonempty, pd.Series([True, pd.NA], dtype="boolean"))

    ddf = dd.from_pandas(pd.DataFrame({"A": values}), 2)
    assert ddf.dtypes["A"] == pd.BooleanDtype()

    dd.utils.assert_eq(
        ddf._meta_nonempty,
        pd.DataFrame({"A": pd.array([True, pd.NA], dtype="boolean")}),
    )
Example #5
    def test_decide_pandas_dtype(self):
        base_arr = np.arange(4)

        arr = base_arr.reshape(2, 2)
        res = decide_pandas_dtype(arr)
        self.assertEqual(res, object)

        class NotCoveredType:
            pass

        mapping = {
            object: object,
            bool: pd.BooleanDtype(),
            str: pd.StringDtype(),
            float: float,
            int: "Int64",
            np.int8: "Int8",
            np.uint16: "UInt16",
            NotCoveredType: object
        }

        for tin, tout in mapping.items():
            arr = base_arr.astype(tin)
            res = decide_pandas_dtype(arr)
            self.assertEqual(res, tout)
Example #6
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
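A mapper like this is typically plugged into pyarrow's Table.to_pandas through its types_mapper hook, which is called once per column type and falls back to the default conversion whenever the mapper returns None. A minimal usage sketch, assuming the function above is importable:

import pyarrow as pa

table = pa.table({"flag": pa.array([True, None, False]),
                  "name": pa.array(["a", None, "c"])})
df = table.to_pandas(types_mapper=pyarrow2pandas_extension)
# df["flag"].dtype is BooleanDtype() and df["name"].dtype is StringDtype();
# nulls arrive as pd.NA instead of forcing object/float columns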
Example #7
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (pd.CategoricalDtype(),
         pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"), None),
        (pd.DatetimeTZDtype(tz='UTC'),
         pd.Series(pd.date_range(start="20200101", end="20200301"),
                   dtype="datetime64[ns, utc]"), None),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"],
                                     dtype="string"), None),
        (pd.PeriodDtype(freq='D'),
         pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')), None),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(lambda s: s < 5,
                                        other=np.nan).astype("Sparse[float]"),
            {
                "nullable": True
            },
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        )
    ]
    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)
Example #8
    def test_null_values(self):
        data = np.array([["dadf", 10.9, -2, False], ["c", 100.9, 1, True],
                         [None, None, None, None]])

        df = pd.DataFrame({
            "col1": data[:, 0],
            "col2": data[:, 1],
            "col3": data[:, 2],
            "col4": data[:, 3]
        })
        inOp = dataframeToOperator(
            df,
            schemaStr='col1 string, col2 double, col3 long, col4 boolean',
            op_type='batch')
        res = inOp.collectToDataframe()
        print(res.dtypes)
        print(res)
        self.assertEqual(res.dtypes[0], pd.StringDtype())
        self.assertEqual(res.dtypes[1], np.float64)
        self.assertEqual(res.dtypes[2], pd.Int64Dtype())
        self.assertEqual(res.dtypes[3], pd.BooleanDtype())
        self.assertTrue(pd.isna(res["col1"][2]))
        self.assertTrue(pd.isna(res["col2"][2]))
        self.assertTrue(pd.isna(res["col3"][2]))
        self.assertTrue(pd.isna(res["col4"][2]))
Example #9
    def test_memory(self):
        from pyalink.alink.config import g_config
        g_config["collect_storage_type"] = "memory"
        schema = "f_string string,f_long long,f_int int,f_double double,f_boolean boolean"
        op = dataframeToOperator(self.df, schemaStr=schema, op_type="batch")

        col_names = op.getColNames()
        col_types = op.getColTypes()
        self.assertEqual(col_names[0], "f_string")
        self.assertEqual(col_names[1], "f_long")
        self.assertEqual(col_names[2], "f_int")
        self.assertEqual(col_names[3], "f_double")
        self.assertEqual(col_names[4], "f_boolean")

        self.assertEqual(col_types[0], "VARCHAR")
        self.assertEqual(col_types[1], "BIGINT")
        self.assertEqual(col_types[2], "INT")
        self.assertEqual(col_types[3], "DOUBLE")
        self.assertEqual(col_types[4], "BOOLEAN")

        df2 = op.collectToDataframe()
        print(df2)
        print(df2.dtypes)
        self.assertEqual(df2['f_string'].dtype, pd.StringDtype())
        self.assertEqual(df2['f_long'].dtype, pd.Int64Dtype())
        self.assertEqual(df2['f_int'].dtype, pd.Int32Dtype())
        self.assertEqual(df2['f_double'].dtype, np.float64)
        self.assertEqual(df2['f_boolean'].dtype, pd.BooleanDtype())
Example #10
    def _dtypes(self, categories=None):
        """ Implied types of the columns in the schema """
        import pandas as pd
        if self.has_pandas_metadata:
            md = self.pandas_metadata['columns']
            tz = {
                c['name']: c['metadata']['timezone']
                for c in md
                if (c.get('metadata', {}) or {}).get('timezone', None)
            }
        else:
            tz = None
        self.tz = tz
        categories = self.check_categories(categories)
        dtype = OrderedDict(
            (name, (converted_types.typemap(f) if f.num_children in
                    [None, 0] else np.dtype("O")))
            for name, f in self.schema.root.children.items()
            if getattr(f, 'isflat', False) is False)

        for i, (col, dt) in enumerate(dtype.copy().items()):
            if dt.kind in ['i', 'b']:
                # int/bool columns that may contain nulls are promoted to
                # pandas nullable (extension) dtypes below
                has_nulls = False
                for rg in self.row_groups:
                    chunk = rg.columns[i]
                    stats = chunk.meta_data.statistics
                    if stats is None or stats.null_count is None or stats.null_count:
                        # missing statistics are treated as "may contain nulls"
                        has_nulls = True
                        break
                if has_nulls:
                    if dt.kind == "b":
                        dtype[col] = pd.BooleanDtype()
                    elif dtype[col].itemsize == 1:
                        dtype[col] = pd.Int8Dtype()
                    elif dtype[col].itemsize == 2:
                        dtype[col] = pd.Int16Dtype()
                    elif dtype[col].itemsize == 4:
                        dtype[col] = pd.Int32Dtype()
                    else:
                        dtype[col] = pd.Int64Dtype()

            elif dt.kind == "M":
                if tz is not None and tz.get(col, False):
                    dtype[col] = pd.Series([], dtype='M8[ns]').dt.tz_localize(
                        tz[col]).dtype
            elif dt == 'S12':
                dtype[col] = 'M8[ns]'
        for field in categories:
            dtype[field] = 'category'
        for cat in self.cats:
            dtype[cat] = "category"
        self.dtypes = dtype
        return dtype
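The null-count inspection above is needed because NumPy 'i' and 'b' dtypes cannot hold missing values; without the promotion to nullable extension dtypes, pandas would silently widen or object-ify such columns. A short illustration of the default behavior this code avoids:

import pandas as pd

print(pd.Series([1, 2, None]).dtype)                  # float64: ints widened, None -> NaN
print(pd.Series([1, 2, None], dtype="Int64").dtype)   # Int64: None kept as <NA>
print(pd.array([True, None], dtype="boolean"))        # [True, <NA>] under BooleanDtype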
Example #11
def test_bucketing_parquet_dataset(path, glue_database, glue_table,
                                   bucketing_data, dtype):
    # Skip invalid combinations of data and data types
    if type(bucketing_data[0]) == int and "int" not in dtype.lower():
        pytest.skip()
    if type(bucketing_data[0]) == bool and "bool" not in dtype.lower():
        pytest.skip()
    if type(bucketing_data[0]) == str and dtype not in ("string", "object"):
        pytest.skip()

    nb_of_buckets = 2
    df = pd.DataFrame({"c0": bucketing_data, "c1": ["foo", "bar", "baz"]})
    r = wr.s3.to_parquet(
        df=df,
        path=path,
        database=glue_database,
        table=glue_table,
        dataset=True,
        mode="overwrite",
        bucketing_info=(["c0"], nb_of_buckets),
    )

    assert len(r["paths"]) == 2
    assert r["paths"][0].endswith("bucket-00000.snappy.parquet")
    assert r["paths"][1].endswith("bucket-00001.snappy.parquet")

    dtype = None
    if isinstance(bucketing_data[0], int):
        dtype = pd.Int64Dtype()
    if isinstance(bucketing_data[0], bool):
        dtype = pd.BooleanDtype()
    if isinstance(bucketing_data[0], str):
        dtype = pd.StringDtype()

    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
    assert len(first_bucket_df) == 2
    assert pd.Series([bucketing_data[0], bucketing_data[2]],
                     dtype=dtype).equals(first_bucket_df["c0"])
    assert pd.Series(["foo", "baz"],
                     dtype=pd.StringDtype()).equals(first_bucket_df["c1"])

    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
    assert len(second_bucket_df) == 1
    assert pd.Series([bucketing_data[1]],
                     dtype=dtype).equals(second_bucket_df["c0"])
    assert pd.Series(["bar"],
                     dtype=pd.StringDtype()).equals(second_bucket_df["c1"])

    loaded_dfs = [
        wr.s3.read_parquet(path=path),
        wr.athena.read_sql_table(table=glue_table,
                                 database=glue_database,
                                 ctas_approach=False),
    ]

    for loaded_df in loaded_dfs:
        assert len(loaded_df) == 3
        assert all(x in bucketing_data for x in loaded_df["c0"].to_list())
Example #12
 def test_nullable_types(self):
     df = pd.DataFrame({"tag1": [10, None], "tag2": [True, None]})
     df1 = df.astype({"tag1": "Int64", "tag2": pd.BooleanDtype()})
     push("test/pandas/nullable_types",
          df1,
          encoder=DataFrameEncoder(index=False))
     df2 = pull("test/pandas/nullable_types")
     # map() is lazy, so the assertions would never execute; iterate instead
     for x, y in zip(df2.dtypes, df1.dtypes):
         self.assertEqual(x, y)
Example #13
def test_astype_to_boolean_array():
    # astype to BooleanArray
    arr = pd.array([True, False, None], dtype="boolean")

    result = arr.astype("boolean")
    tm.assert_extension_array_equal(result, arr)
    result = arr.astype(pd.BooleanDtype())
    tm.assert_extension_array_equal(result, arr)
Example #14
def test_astype_to_boolean_array():
    # astype to BooleanArray
    arr = pd.array([0.0, 1.0, None], dtype="Float64")

    result = arr.astype("boolean")
    expected = pd.array([False, True, None], dtype="boolean")
    tm.assert_extension_array_equal(result, expected)
    result = arr.astype(pd.BooleanDtype())
    tm.assert_extension_array_equal(result, expected)
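The expected array in Example #14 follows the usual numeric-to-boolean rule: zero maps to False, any non-zero value to True, and pd.NA passes through untouched. The same cast outside the test harness, as a quick sketch:

import pandas as pd

arr = pd.array([0.0, 0.5, -2.0, None], dtype="Float64")
print(arr.astype("boolean"))  # <BooleanArray> [False, True, True, <NA>]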
Example #15
    def __init__(self, pandas_obj):
        # validate and assign object
        self._validate(pandas_obj)
        self._obj = pandas_obj

        # define incorporated modules - columns consisting of others will not have the dtype changed
        self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

        # define a possible list of null values
        self._NULL_VALS = [
            None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
            '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
            'UNKNOWN', 'UNK'
        ]

        # assign dtypes and limits
        # boolean
        BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
        BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
        # note: dict.update() returns None, so merge the two mappings directly
        self._BOOL_MAP_DICT = {
            **{i: True for i in BOOL_STRINGS_TRUE},
            **{i: False for i in BOOL_STRINGS_FALSE},
        }
        self._DTYPE_BOOL_BASE = np.bool_  # np.bool was removed from NumPy
        self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()
        # unsigned integers - base and nullable
        self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
        self._DTYPES_UINT_NULLABLE = [
            pd.UInt8Dtype(),
            pd.UInt16Dtype(),
            pd.UInt32Dtype(),
            pd.UInt64Dtype()
        ]
        self._LIMIT_LOW_UINT = [
            np.iinfo(i).min for i in self._DTYPES_UINT_BASE
        ]
        self._LIMIT_HIGH_UINT = [
            np.iinfo(i).max for i in self._DTYPES_UINT_BASE
        ]
        # signed integers - base and nullable
        self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
        self._DTYPES_INT_NULLABLE = [
            pd.Int8Dtype(),
            pd.Int16Dtype(),
            pd.Int32Dtype(),
            pd.Int64Dtype()
        ]
        self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
        self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]
        # floats - nullable by default
        self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]
        # datetime - nullable by default
        self._DTYPE_DATETIME = np.datetime64
        # string
        self._DTYPE_STRING = pd.StringDtype()
        # categorical - nullable by default
        self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
Example #16
def load_audit(name):
	df = load_csv(name)
	df = df.where((pandas.notnull(df)), None)
	df["Adjusted"] = df["Adjusted"].astype(int)
	df["Age"] = df["Age"].astype(pandas.Int64Dtype() if name.endswith("NA") else int)
	df["Deductions"] = df["Deductions"].astype(pandas.BooleanDtype() if name.endswith("NA") else bool)
	df["Income"] = df["Income"].astype(float)
	df["Hours"] = df["Hours"].astype(float)
	return split_csv(df)
Example #17
 def test_dtypes_are_correct_after_groupby_last(self, data):
     # GH46409
     df = DataFrame({
         "id": [1, 2, 3, 4],
         "test": [True, pd.NA, data, False]
     }).convert_dtypes()
     result = df.groupby("id").last().test
     expected = df.set_index("id").test
     assert result.dtype == pd.BooleanDtype()
     tm.assert_series_equal(expected, result)
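The BooleanDtype asserted here is produced by convert_dtypes(), which infers the best nullable extension dtype per column. A minimal sketch of that inference, separate from the groupby regression the test covers:

import pandas as pd

df = pd.DataFrame({"test": [True, None, False]})
print(df["test"].dtype)                   # object: the None breaks plain bool
print(df.convert_dtypes()["test"].dtype)  # boolean: pd.BooleanDtype()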
Example #18
def get_sparse_series() -> Dict[str, pd.Series]:
    test_series = {
        "int_sparse":
        pd.Series([-1, 0, 1, 2, 3], dtype=pd.SparseDtype(np.int32, 0)),
        "float_sparse":
        pd.Series(
            [np.nan, 0, 1, 2, 3],
            dtype=pd.SparseDtype(np.float64, np.nan),
        ),
        "complex_sparse":
        pd.Series(
            [
                np.nan,
                complex(0, 1),
                complex(1, -1),
                complex(2, 4),
                complex(3, -12)
            ],
            dtype=pd.SparseDtype(np.complex128, np.nan),
        ),
        "bool_sparse":
        pd.Series(
            [True, False, False],
            dtype=pd.SparseDtype(np.bool_, False),
        ),
        "str_obj_sparse":
        pd.Series(
            pd.arrays.SparseArray([None, None, "gold", "black", "silver"]), ),
        # Pending https://github.com/pandas-dev/pandas/issues/35762
        # pd.Series([NoneT, 0, 1, 2, 3, 4], name="datetime_sparse", dtype=pd.SparseDtype(np.datetime64)),
        # Pandas dtypes
        "pd_int64_sparse":
        pd.Series(
            [0, 1, 2, 3, None],
            dtype=pd.SparseDtype(pd.Int64Dtype()),
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35793
        # pd.Series(
        #     ["a", "b", "c", None],
        #     name="pd_categorical_sparse",
        #     dtype=pd.SparseDtype(pd.CategoricalDtype(['a', 'b', 'c', 'd']))
        # )
    }

    if pandas_version[0] >= 1 and not_pandas_1_0_5:
        test_series["pd_string_sparse"] = pd.Series(
            ["Patty", "Valentine", "Upper", "", "", ""],
            dtype=pd.SparseDtype(pd.StringDtype(), ""),
        )
        test_series["pd_bool_sparse"] = pd.Series(
            [True, False, False, None],
            dtype=pd.SparseDtype(pd.BooleanDtype(), None),
        )

    return test_series
Example #19
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ):
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            if LooseVersion(self.api.__version__) >= "0.16":
                import pandas as pd

                mapping = {
                    self.api.int8(): pd.Int8Dtype(),
                    self.api.int16(): pd.Int16Dtype(),
                    self.api.int32(): pd.Int32Dtype(),
                    self.api.int64(): pd.Int64Dtype(),
                    self.api.uint8(): pd.UInt8Dtype(),
                    self.api.uint16(): pd.UInt16Dtype(),
                    self.api.uint32(): pd.UInt32Dtype(),
                    self.api.uint64(): pd.UInt64Dtype(),
                    self.api.bool_(): pd.BooleanDtype(),
                    self.api.string(): pd.StringDtype(),
                }
                to_pandas_kwargs["types_mapper"] = mapping.get
            else:
                raise ValueError(
                    "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                    f"({self.api.__version__} is installed"
                )
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns, **kwargs
            ).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
Example #20
def test_is_bool_dtype():
    assert not com.is_bool_dtype(int)
    assert not com.is_bool_dtype(str)
    assert not com.is_bool_dtype(pd.Series([1, 2]))
    assert not com.is_bool_dtype(np.array(["a", "b"]))
    assert not com.is_bool_dtype(pd.Index(["a", "b"]))

    assert com.is_bool_dtype(bool)
    assert com.is_bool_dtype(np.bool_)
    assert com.is_bool_dtype(np.array([True, False]))
    assert com.is_bool_dtype(pd.Index([True, False]))

    assert com.is_bool_dtype(pd.BooleanDtype())
    assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean"))
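What these assertions mean in practice: once a None lands in a plain bool column it degrades to object and is_bool_dtype stops matching, while the nullable extension dtype keeps the check true. A quick sketch using the public pandas.api.types alias of the function tested above:

import pandas as pd
from pandas.api.types import is_bool_dtype

print(is_bool_dtype(pd.Series([True, None])))                   # False: object dtype
print(is_bool_dtype(pd.Series([True, None], dtype="boolean")))  # True: BooleanDtype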
Example #21
    def read(
        self,
        path,
        columns=None,
        use_nullable_dtypes=False,
        storage_options: StorageOptions = None,
        **kwargs,
    ) -> DataFrame:
        kwargs["use_pandas_metadata"] = True

        to_pandas_kwargs = {}
        if use_nullable_dtypes:
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
                self.api.float32(): pd.Float32Dtype(),
                self.api.float64(): pd.Float64Dtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        manager = get_option("mode.data_manager")
        if manager == "array":
            to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
            path,
            kwargs.pop("filesystem", None),
            storage_options=storage_options,
            mode="rb",
        )
        try:
            result = self.api.parquet.read_table(
                path_or_handle, columns=columns,
                **kwargs).to_pandas(**to_pandas_kwargs)
            if manager == "array":
                result = result._as_manager("array", copy=False)
            return result
        finally:
            if handles is not None:
                handles.close()
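From the caller's side, the mapping in Examples #19 and #21 is what pd.read_parquet(..., use_nullable_dtypes=True) activates. A hedged round-trip sketch (the file name is illustrative):

import pandas as pd

df = pd.DataFrame({"flag": pd.array([True, None, False], dtype="boolean")})
df.to_parquet("flags.parquet")
out = pd.read_parquet("flags.parquet", use_nullable_dtypes=True)
print(out["flag"].dtype)  # boolean, with the missing entry preserved as <NA>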
Example #22
 def test_numeric_nullable_dtypes(self):
     dtypes = [
         pd.StringDtype(),
         pd.BooleanDtype(),
         pd.Float64Dtype(),
         pd.Float32Dtype(),
         pd.Int64Dtype(),
         pd.UInt64Dtype(),
         pd.Int32Dtype(),
         pd.UInt32Dtype(),
         pd.Int16Dtype(),
         pd.UInt16Dtype(),
         pd.Int8Dtype(),
         pd.UInt8Dtype(),
     ]
     # TODO: Re-add (".xml", "xml"),
     # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
     for suffix, fn in [
         (".snappy", "parquet"),
         (".feather", "feather"),
         (".csv", "csv"),
         (".tsv", "tsv"),
         (".json", "json"),
         (".xlsx", "xlsx"),
         (".xls", "xls"),
         (".xlsb", "xlsb"),
         (".ods", "ods"),
         (".pickle", "pickle"),
     ]:
         # TODO: include xml
         for dtype in dtypes:
             with tmpfile(suffix) as path:
                 try:
                     df = Ind2Col2.convert(
                         Ind2Col2(
                             sample_data_ind2_col2_pd_na())).astype(dtype)
                     assert list(df.index.names) == ["qqq", "rrr"]
                     assert list(df.columns) == ["abc", "xyz"]
                     getattr(df, "to_" + fn)(path)
                     df2 = getattr(Ind2Col2, "read_" + fn)(path)
                     assert list(df2.index.names) == ["qqq", "rrr"]
                     assert list(df2.columns) == ["abc", "xyz"]
                 except Exception:
                     logger.error(f"Failed on path {path}, dtype {dtype}")
                     raise
Example #23
def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.Series(data, dtype=pd_dtype)
    gd_data = cudf.Series.from_pandas(pd_data)

    assert gd_data.dtype == expect_dtype

    # check mask
    expect_mask = [True if x is not pd.NA else False for x in pd_data]
    got_mask = mask_to_bools(gd_data._column.base_mask, 0,
                             len(gd_data)).to_array()

    np.testing.assert_array_equal(expect_mask, got_mask)
Example #24
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype)
    gd_data = cudf.DataFrame.from_pandas(pd_data)

    assert gd_data["a"].dtype == expect_dtype

    # check mask
    expect_mask = [True if x is not pd.NA else False for x in pd_data["a"]]
    got_mask = mask_to_bools(
        gd_data["a"]._column.base_mask, 0, len(gd_data)
    ).values_host

    np.testing.assert_array_equal(expect_mask, got_mask)
Example #25
def test_orc_read_skiprows(tmpdir):
    buff = BytesIO()
    df = pd.DataFrame(
        {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},
        dtype=pd.BooleanDtype(),
    )
    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
    tuples = list(
        map(
            lambda x: (None, ) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        ))
    writer.writerows(tuples)
    writer.close()

    skiprows = 10

    expected = cudf.read_orc(buff)[skiprows::].reset_index(drop=True)
    got = cudf.read_orc(buff, skiprows=skiprows)

    assert_eq(expected, got)
Example #26
def get_dummy_deltas(employees_path):
    employees = pd.read_csv(employees_path, nrows=10)

    # Create deleted flag
    employees["record_deleted"] = False
    employees["record_deleted"] = employees["record_deleted"].astype(
        pd.BooleanDtype())

    # Cast to new int cols
    for col in ["employee_id", "department_id", "manager_id"]:
        employees[col] = employees[col].astype(pd.Int64Dtype())

    # Cast to new str cols
    for col in ["sex", "forename", "surname"]:
        employees[col] = employees[col].astype(pd.StringDtype())

    # Let's split up the data and make some changes
    day1 = employees[employees.employee_id.isin([1, 2, 3, 4,
                                                 5])].reset_index(drop=True)

    day2 = employees[employees.employee_id.isin([5, 6,
                                                 7])].reset_index(drop=True)
    day2.loc[0, "department_id"] = 2
    day2.loc[0, "manager_id"] = 18

    day3 = employees[employees.employee_id.isin([1, 7, 9, 10,
                                                 11])].reset_index(drop=True)
    day3.department_id = 2
    day3.manager_id = 5

    # Reset this persons values for clarity
    day3.loc[0, "record_deleted"] = True
    day3.loc[0, "department_id"] = 1
    day3.loc[0, "manager_id"] = 17

    deltas = {"day1": day1, "day2": day2, "day3": day3}

    return deltas
Example #27
def test_df_pdv1_types():
    pdv1_test_mapping = {
        'int8col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int8Dtype()
        },
        'int16col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int16Dtype()
        },
        'int32col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int32Dtype()
        },
        'int64col': {
            'vals': [1, 2, 3],
            'pd_type': pd.Int64Dtype()
        },
        'stringcol': {
            'vals': ['one', 'two', 'three'],
            'pd_type': pd.StringDtype()
        },
        'boolcol': {
            'vals': [True, False, True],
            'pd_type': pd.BooleanDtype()
        }
    }
    pdv1_df = pd.DataFrame({
        col_name: col_meta['vals']
        for col_name, col_meta in pdv1_test_mapping.items()
    })

    pdv1_df = pdv1_df.astype({
        col_name: col_meta['pd_type']
        for col_name, col_meta in pdv1_test_mapping.items()
    })
    return pdv1_df
Example #28
0
    def test_to_table_nullable(self):
        boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
        int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
        int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
        int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
        int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
        float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
        double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
        string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
        object_array = pd.array([pd.NA, "s22", None], dtype=object)

        df = pd.DataFrame({
            "NullableBoolean": boolean_array,
            "NullableInt8": int8_array,
            "NullableInt16": int16_array,
            "NullableInt32": int32_array,
            "NullableInt64": int64_array,
            "NullableFloat": float_array,
            "NullableDouble": double_array,
            "NullableString": string_array,
            "NullableObject": object_array,
        })

        table = to_table(df)
        self.assertIs(table.columns[0].data_type, dtypes.bool_)
        self.assertIs(table.columns[1].data_type, dtypes.int8)
        self.assertIs(table.columns[2].data_type, dtypes.int16)
        self.assertIs(table.columns[3].data_type, dtypes.int32)
        self.assertIs(table.columns[4].data_type, dtypes.int64)
        self.assertIs(table.columns[5].data_type, dtypes.float32)
        self.assertIs(table.columns[6].data_type, dtypes.double)
        self.assertIs(table.columns[7].data_type, dtypes.string)
        self.assertIs(table.columns[8].data_type, dtypes.PyObject)
        self.assertEqual(table.size, 3)
        table_string = table.to_string()
        self.assertEqual(9, table_string.count("null"))
Example #29
def j_type_to_py_type(t):
    typeclass = t.getTypeClass()
    typeclass_name = typeclass.getName()
    if typeclass_name in [
            'java.lang.Double', 'java.lang.Float', 'double', 'float'
    ]:
        return np.float64
    elif typeclass_name in [
            'java.lang.Long', 'java.lang.Integer', 'int', 'long'
    ]:
        return pd.Int64Dtype()
    elif typeclass_name == 'java.lang.String':
        return pd.StringDtype()
    elif typeclass_name == 'java.sql.Timestamp':
        return np.datetime64
    elif typeclass_name == "com.alibaba.alink.common.linalg.Vector" or typeclass_name == "com.alibaba.alink.common.linalg.DenseVector" or typeclass_name == "com.alibaba.alink.common.linalg.SparseVector":
        return pd.StringDtype()
    elif typeclass_name in ["java.lang.Boolean", 'boolean']:
        return pd.BooleanDtype()
    else:
        print(
            "Java type is not supported in Python for automatic conversion of values: %s"
            % typeclass_name)
        return t
Example #30
import pandas as pd

import cudf
from cudf.testing._utils import assert_eq
from cudf.utils.dtypes import (
    pandas_dtypes_to_cudf_dtypes,
    pyarrow_dtypes_to_pandas_dtypes,
)

ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

_PANDAS_TO_AVRO_SCHEMA_MAP = {
    cudf.dtype("int8"): "int",
    pd.Int8Dtype(): ["int", "null"],
    pd.Int16Dtype(): ["int", "null"],
    pd.Int32Dtype(): ["int", "null"],
    pd.Int64Dtype(): ["long", "null"],
    pd.BooleanDtype(): ["boolean", "null"],
    pd.StringDtype(): ["string", "null"],
    cudf.dtype("bool_"): "boolean",
    cudf.dtype("int16"): "int",
    cudf.dtype("int32"): "int",
    cudf.dtype("int64"): "long",
    cudf.dtype("O"): "string",
    cudf.dtype("str"): "string",
    cudf.dtype("float32"): "float",
    cudf.dtype("float64"): "double",
    cudf.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
    cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
    cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
}

PANDAS_TO_ORC_TYPES = {