def test_astype(self):
    pdf, psdf = self.pdf, self.psdf
    for col in self.numeric_df_cols:
        pser, psser = pdf[col], psdf[col]
        self.assert_eq(pser.astype(int), psser.astype(int))
        self.assert_eq(pser.astype(float), psser.astype(float))
        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
        self.assert_eq(pser.astype(np.int32), psser.astype(np.int32))
        self.assert_eq(pser.astype(np.int16), psser.astype(np.int16))
        self.assert_eq(pser.astype(np.int8), psser.astype(np.int8))
        self.assert_eq(pser.astype(str), psser.astype(str))
        self.assert_eq(pser.astype(bool), psser.astype(bool))
        self.assert_eq(pser.astype("category"), psser.astype("category"))
        cat_type = CategoricalDtype(categories=[2, 1, 3])
        self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
    self.assertRaisesRegex(
        ValueError,
        "Cannot convert fractions with missing values to integer",
        lambda: self.float_withnan_psser.astype(int),
    )
    self.assertRaisesRegex(
        ValueError,
        "Cannot convert fractions with missing values to integer",
        lambda: self.float_withnan_psser.astype(np.int32),
    )
    # Compare the pandas series against the pandas-on-Spark series. The original
    # compared float_withnan_psser to itself, which asserts nothing; this assumes
    # the pandas-side fixture float_withnan_pser defined alongside it.
    self.assert_eq(self.float_withnan_pser.astype(str), self.float_withnan_psser.astype(str))
    self.assert_eq(self.float_withnan_pser.astype(bool), self.float_withnan_psser.astype(bool))
    self.assert_eq(
        self.float_withnan_pser.astype("category"), self.float_withnan_psser.astype("category")
    )
    if extension_object_dtypes_available and extension_float_dtypes_available:
        pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
        psser = ps.from_pandas(pser)
        self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
def test_astype(self):
    pdf, psdf = self.pdf, self.psdf
    for col in self.numeric_df_cols:
        pser, psser = pdf[col], psdf[col]

        for int_type in [int, np.int32, np.int16, np.int8]:
            if not pser.hasnans:
                self.assert_eq(pser.astype(int_type), psser.astype(int_type))
            else:
                self.assertRaisesRegex(
                    ValueError,
                    "Cannot convert %s with missing values to integer"
                    % psser._dtype_op.pretty_name,
                    lambda: psser.astype(int_type),
                )

        # TODO(SPARK-37039): the np.nan series.astype(bool) should be True
        if not pser.hasnans:
            self.assert_eq(pser.astype(bool), psser.astype(bool))

        self.assert_eq(pser.astype(float), psser.astype(float))
        self.assert_eq(pser.astype(np.float32), psser.astype(np.float32))
        self.assert_eq(pser.astype(str), psser.astype(str))
        self.assert_eq(pser.astype("category"), psser.astype("category"))
        cat_type = CategoricalDtype(categories=[2, 1, 3])
        self.assert_eq(pser.astype(cat_type), psser.astype(cat_type))
    if extension_object_dtypes_available and extension_float_dtypes_available:
        pser = pd.Series(pd.Categorical([1.0, 2.0, 3.0]), dtype=pd.Float64Dtype())
        psser = ps.from_pandas(pser)
        self.assert_eq(pser.astype(pd.BooleanDtype()), psser.astype(pd.BooleanDtype()))
def test_boolean_conversion(s, expected_category, bool_map):
    assert _infer_bool_type(s) == expected_category
    if pd.isna(s[2]):
        expected = pd.Series([True, False, pd.NA], dtype=pd.BooleanDtype())
    else:
        expected = pd.Series([True, False, True], dtype=pd.BooleanDtype())
    actual = convert_to_bool_series(s, True, bool_map=bool_map)
    assert_series_equal(expected, actual)
def test_meta():
    values = pd.array([True, False, None], dtype="boolean")
    ds = dd.from_pandas(pd.Series(values), 2)
    assert ds.dtype == pd.BooleanDtype()
    dd.utils.assert_eq(ds._meta_nonempty, pd.Series([True, pd.NA], dtype="boolean"))

    ddf = dd.from_pandas(pd.DataFrame({"A": values}), 2)
    assert ddf.dtypes["A"] == pd.BooleanDtype()
    dd.utils.assert_eq(
        ddf._meta_nonempty,
        pd.DataFrame({"A": pd.array([True, pd.NA], dtype="boolean")}),
    )
def test_decide_pandas_dtype(self):
    base_arr = np.arange(4)

    arr = base_arr.reshape(2, 2)
    res = decide_pandas_dtype(arr)
    self.assertEqual(res, object)

    class NotCoveredType:
        pass

    mapping = {
        object: object,
        bool: pd.BooleanDtype(),
        str: pd.StringDtype(),
        float: float,
        int: "Int64",
        np.int8: "Int8",
        np.uint16: "UInt16",
        NotCoveredType: object,
    }
    for tin, tout in mapping.items():
        arr = base_arr.astype(tin)
        res = decide_pandas_dtype(arr)
        self.assertEqual(res, tout)
def pyarrow2pandas_extension(  # pylint: disable=too-many-branches,too-many-return-statements
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion."""
    if pa.types.is_int8(dtype):
        return pd.Int8Dtype()
    if pa.types.is_int16(dtype):
        return pd.Int16Dtype()
    if pa.types.is_int32(dtype):
        return pd.Int32Dtype()
    if pa.types.is_int64(dtype):
        return pd.Int64Dtype()
    if pa.types.is_uint8(dtype):
        return pd.UInt8Dtype()
    if pa.types.is_uint16(dtype):
        return pd.UInt16Dtype()
    if pa.types.is_uint32(dtype):
        return pd.UInt32Dtype()
    if pa.types.is_uint64(dtype):
        return pd.UInt64Dtype()
    if pa.types.is_boolean(dtype):
        return pd.BooleanDtype()
    if pa.types.is_string(dtype):
        return pd.StringDtype()
    return None
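# --- Hedged usage sketch (added; not part of the source above) --------------
# Shows how a pyarrow -> pandas extension-dtype mapper like
# pyarrow2pandas_extension is typically consumed: walk an Arrow schema and
# collect the nullable pandas dtype for each field. Only public pyarrow and
# pandas APIs are used; the schema itself is made up for illustration.
import pyarrow as pa

schema = pa.schema([("flag", pa.bool_()), ("count", pa.int32()), ("name", pa.string())])
pandas_dtypes = {field.name: pyarrow2pandas_extension(field.type) for field in schema}
# Expected roughly: {'flag': BooleanDtype, 'count': Int32Dtype(), 'name': StringDtype}
print(pandas_dtypes)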
def test_pandas_extension_types():
    """Test pandas extension data type happy path."""
    # pylint: disable=no-member
    test_params = [
        (pd.CategoricalDtype(), pd.Series(["a", "a", "b", "b", "c", "c"], dtype="category"), None),
        (
            pd.DatetimeTZDtype(tz='UTC'),
            pd.Series(pd.date_range(start="20200101", end="20200301"), dtype="datetime64[ns, utc]"),
            None,
        ),
        (pd.Int64Dtype(), pd.Series(range(10), dtype="Int64"), None),
        (pd.StringDtype(), pd.Series(["foo", "bar", "baz"], dtype="string"), None),
        (pd.PeriodDtype(freq='D'), pd.Series(pd.period_range('1/1/2019', '1/1/2020', freq='D')), None),
        (
            pd.SparseDtype("float"),
            pd.Series(range(100)).where(lambda s: s < 5, other=np.nan).astype("Sparse[float]"),
            {"nullable": True},
        ),
        (pd.BooleanDtype(), pd.Series([1, 0, 0, 1, 1], dtype="boolean"), None),
        (
            pd.IntervalDtype(subtype="int64"),
            pd.Series(pd.IntervalIndex.from_breaks([0, 1, 2, 3, 4])),
            None,
        ),
    ]

    for dtype, data, series_kwargs in test_params:
        series_kwargs = {} if series_kwargs is None else series_kwargs
        series_schema = SeriesSchema(pandas_dtype=dtype, **series_kwargs)
        assert isinstance(series_schema.validate(data), pd.Series)
def test_null_values(self):
    data = np.array([
        ["dadf", 10.9, -2, False],
        ["c", 100.9, 1, True],
        [None, None, None, None],
    ])
    df = pd.DataFrame({
        "col1": data[:, 0],
        "col2": data[:, 1],
        "col3": data[:, 2],
        "col4": data[:, 3],
    })
    inOp = dataframeToOperator(
        df, schemaStr='col1 string, col2 double, col3 long, col4 boolean', op_type='batch')
    res = inOp.collectToDataframe()
    print(res.dtypes)
    print(res)

    self.assertEqual(res.dtypes[0], pd.StringDtype())
    self.assertEqual(res.dtypes[1], np.float64)
    self.assertEqual(res.dtypes[2], pd.Int64Dtype())
    self.assertEqual(res.dtypes[3], pd.BooleanDtype())

    self.assertTrue(pd.isna(res["col1"][2]))
    self.assertTrue(pd.isna(res["col2"][2]))
    self.assertTrue(pd.isna(res["col3"][2]))
    self.assertTrue(pd.isna(res["col4"][2]))
def test_memory(self):
    from pyalink.alink.config import g_config
    g_config["collect_storage_type"] = "memory"

    schema = "f_string string,f_long long,f_int int,f_double double,f_boolean boolean"
    op = dataframeToOperator(self.df, schemaStr=schema, op_type="batch")

    col_names = op.getColNames()
    col_types = op.getColTypes()
    self.assertEqual(col_names[0], "f_string")
    self.assertEqual(col_names[1], "f_long")
    self.assertEqual(col_names[2], "f_int")
    self.assertEqual(col_names[3], "f_double")
    self.assertEqual(col_names[4], "f_boolean")
    self.assertEqual(col_types[0], "VARCHAR")
    self.assertEqual(col_types[1], "BIGINT")
    self.assertEqual(col_types[2], "INT")
    self.assertEqual(col_types[3], "DOUBLE")
    self.assertEqual(col_types[4], "BOOLEAN")

    df2 = op.collectToDataframe()
    print(df2)
    print(df2.dtypes)
    self.assertEqual(df2['f_string'].dtype, pd.StringDtype())
    self.assertEqual(df2['f_long'].dtype, pd.Int64Dtype())
    self.assertEqual(df2['f_int'].dtype, pd.Int32Dtype())
    self.assertEqual(df2['f_double'].dtype, np.float64)
    self.assertEqual(df2['f_boolean'].dtype, pd.BooleanDtype())
def _dtypes(self, categories=None):
    """Implied types of the columns in the schema"""
    import pandas as pd
    if self.has_pandas_metadata:
        md = self.pandas_metadata['columns']
        tz = {
            c['name']: c['metadata']['timezone']
            for c in md
            if (c.get('metadata', {}) or {}).get('timezone', None)
        }
    else:
        tz = None
    self.tz = tz
    categories = self.check_categories(categories)
    dtype = OrderedDict(
        (name, (converted_types.typemap(f) if f.num_children in [None, 0] else np.dtype("O")))
        for name, f in self.schema.root.children.items()
        if getattr(f, 'isflat', False) is False
    )
    for i, (col, dt) in enumerate(dtype.copy().items()):
        if dt.kind in ['i', 'b']:
            # int/bool columns that may have nulls become nullable extension columns
            num_nulls = False
            for rg in self.row_groups:
                stats = rg.columns[i].meta_data.statistics
                # Missing statistics, an unknown null count, or a non-zero null
                # count all mean the column may contain nulls.
                if stats is None or stats.null_count is None or stats.null_count:
                    num_nulls = True
                    break
            if num_nulls:
                if dt.kind == "b":
                    dtype[col] = pd.BooleanDtype()
                elif dtype[col].itemsize == 1:
                    dtype[col] = pd.Int8Dtype()
                elif dtype[col].itemsize == 2:
                    dtype[col] = pd.Int16Dtype()
                elif dtype[col].itemsize == 4:
                    dtype[col] = pd.Int32Dtype()
                else:
                    dtype[col] = pd.Int64Dtype()
        elif dt.kind == "M":
            if tz is not None and tz.get(col, False):
                dtype[col] = pd.Series([], dtype='M8[ns]').dt.tz_localize(tz[col]).dtype
        elif dt == 'S12':
            dtype[col] = 'M8[ns]'
    for field in categories:
        dtype[field] = 'category'
    for cat in self.cats:
        dtype[cat] = "category"
    self.dtypes = dtype
    return dtype
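# --- Hedged illustration (added; plain pandas/numpy, independent of fastparquet) ---
# Why _dtypes above promotes int/bool columns that may contain nulls to the
# pandas nullable extension dtypes: a NumPy-backed integer column cannot hold
# NaN, so pandas would otherwise silently upcast it to float64.
import numpy as np
import pandas as pd

upcast = pd.Series([1, 2, np.nan])                  # dtype becomes float64
preserved = pd.Series([1, 2, None], dtype="Int64")  # stays integer; null is <NA>
print(upcast.dtype, preserved.dtype)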
def test_bucketing_parquet_dataset(path, glue_database, glue_table, bucketing_data, dtype):
    # Skip invalid combinations of data and data types
    if type(bucketing_data[0]) == int and "int" not in dtype.lower():
        pytest.skip()
    if type(bucketing_data[0]) == bool and "bool" not in dtype.lower():
        pytest.skip()
    # NOTE: the original used `or` here, which is always true and skipped every
    # string case; the intent is to skip only non-string dtypes.
    if type(bucketing_data[0]) == str and dtype not in ("string", "object"):
        pytest.skip()

    nb_of_buckets = 2
    df = pd.DataFrame({"c0": bucketing_data, "c1": ["foo", "bar", "baz"]})
    r = wr.s3.to_parquet(
        df=df,
        path=path,
        database=glue_database,
        table=glue_table,
        dataset=True,
        mode="overwrite",
        bucketing_info=(["c0"], nb_of_buckets),
    )
    assert len(r["paths"]) == 2
    assert r["paths"][0].endswith("bucket-00000.snappy.parquet")
    assert r["paths"][1].endswith("bucket-00001.snappy.parquet")

    dtype = None
    if isinstance(bucketing_data[0], int):
        dtype = pd.Int64Dtype()
    if isinstance(bucketing_data[0], bool):
        dtype = pd.BooleanDtype()
    if isinstance(bucketing_data[0], str):
        dtype = pd.StringDtype()

    first_bucket_df = wr.s3.read_parquet(path=[r["paths"][0]])
    assert len(first_bucket_df) == 2
    assert pd.Series([bucketing_data[0], bucketing_data[2]], dtype=dtype).equals(first_bucket_df["c0"])
    assert pd.Series(["foo", "baz"], dtype=pd.StringDtype()).equals(first_bucket_df["c1"])

    second_bucket_df = wr.s3.read_parquet(path=[r["paths"][1]])
    assert len(second_bucket_df) == 1
    assert pd.Series([bucketing_data[1]], dtype=dtype).equals(second_bucket_df["c0"])
    assert pd.Series(["bar"], dtype=pd.StringDtype()).equals(second_bucket_df["c1"])

    loaded_dfs = [
        wr.s3.read_parquet(path=path),
        wr.athena.read_sql_table(table=glue_table, database=glue_database, ctas_approach=False),
    ]
    for loaded_df in loaded_dfs:
        assert len(loaded_df) == 3
        assert all(x in bucketing_data for x in loaded_df["c0"].to_list())
def test_nullable_types(self):
    df = pd.DataFrame({"tag1": [10, None], "tag2": [True, None]})
    df1 = df.astype({"tag1": "Int64", "tag2": pd.BooleanDtype()})
    push("test/pandas/nullable_types", df1, encoder=DataFrameEncoder(index=False))
    df2 = pull("test/pandas/nullable_types")
    # The original wrapped the assertions in a bare map(), which is lazy and
    # never executed; iterate explicitly so the assertions actually run.
    for actual, expected in zip(df2.dtypes, df1.dtypes):
        self.assertEqual(actual, expected)
def test_astype_to_boolean_array():
    # astype to BooleanArray
    arr = pd.array([True, False, None], dtype="boolean")

    result = arr.astype("boolean")
    tm.assert_extension_array_equal(result, arr)
    result = arr.astype(pd.BooleanDtype())
    tm.assert_extension_array_equal(result, arr)
def test_astype_to_boolean_array():
    # astype to BooleanArray
    arr = pd.array([0.0, 1.0, None], dtype="Float64")

    result = arr.astype("boolean")
    expected = pd.array([False, True, None], dtype="boolean")
    tm.assert_extension_array_equal(result, expected)
    result = arr.astype(pd.BooleanDtype())
    tm.assert_extension_array_equal(result, expected)
def __init__(self, pandas_obj):
    # validate and assign object
    self._validate(pandas_obj)
    self._obj = pandas_obj

    # define incorporated modules - columns consisting of others will not have the dtype changed
    self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

    # define a possible list of null values
    self._NULL_VALS = [
        None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf, '-np.inf',
        '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk', 'UNKNOWN', 'UNK'
    ]

    # assign dtypes and limits
    # boolean
    BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
    BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
    # dict.update() returns None, so the original chained update left
    # _BOOL_MAP_DICT set to None; merge the two dicts instead.
    self._BOOL_MAP_DICT = {
        **{i: True for i in BOOL_STRINGS_TRUE},
        **{i: False for i in BOOL_STRINGS_FALSE},
    }
    # np.bool was removed in NumPy 1.24; np.bool_ is the supported spelling
    self._DTYPE_BOOL_BASE = np.bool_
    self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()

    # unsigned integers - base and nullable
    self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
    self._DTYPES_UINT_NULLABLE = [
        pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype()
    ]
    self._LIMIT_LOW_UINT = [np.iinfo(i).min for i in self._DTYPES_UINT_BASE]
    self._LIMIT_HIGH_UINT = [np.iinfo(i).max for i in self._DTYPES_UINT_BASE]

    # signed integers - base and nullable
    self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
    self._DTYPES_INT_NULLABLE = [
        pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype()
    ]
    self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
    self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]

    # floats - nullable by default
    self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]

    # datetime - nullable by default
    self._DTYPE_DATETIME = np.datetime64

    # string
    self._DTYPE_STRING = pd.StringDtype()

    # categorical - nullable by default
    self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
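# --- Hedged sketch (added; the accessor name "clean" and the _validate body
# are illustrative assumptions, not taken from the source) -------------------
# An __init__(self, pandas_obj) with a _validate call like the one above is the
# standard pandas custom-accessor pattern; registration looks like this:
import pandas as pd


@pd.api.extensions.register_dataframe_accessor("clean")
class CleanAccessor:
    def __init__(self, pandas_obj):
        self._validate(pandas_obj)
        self._obj = pandas_obj

    @staticmethod
    def _validate(obj):
        if not isinstance(obj, pd.DataFrame):
            raise AttributeError("expected a pandas DataFrame")


# After registration the accessor hangs off every DataFrame: pd.DataFrame().clean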
def load_audit(name):
    df = load_csv(name)
    df = df.where(pandas.notnull(df), None)
    df["Adjusted"] = df["Adjusted"].astype(int)
    df["Age"] = df["Age"].astype(pandas.Int64Dtype() if name.endswith("NA") else int)
    df["Deductions"] = df["Deductions"].astype(pandas.BooleanDtype() if name.endswith("NA") else bool)
    df["Income"] = df["Income"].astype(float)
    df["Hours"] = df["Hours"].astype(float)
    return split_csv(df)
def test_dtypes_are_correct_after_groupby_last(self, data):
    # GH46409
    df = DataFrame({
        "id": [1, 2, 3, 4],
        "test": [True, pd.NA, data, False],
    }).convert_dtypes()
    result = df.groupby("id").last().test
    expected = df.set_index("id").test
    assert result.dtype == pd.BooleanDtype()
    tm.assert_series_equal(expected, result)
def get_sparse_series() -> Dict[str, pd.Series]:
    test_series = {
        "int_sparse": pd.Series([-1, 0, 1, 2, 3], dtype=pd.SparseDtype(np.int32, 0)),
        "float_sparse": pd.Series(
            [np.nan, 0, 1, 2, 3],
            dtype=pd.SparseDtype(np.float64, np.nan),
        ),
        "complex_sparse": pd.Series(
            [np.nan, complex(0, 1), complex(1, -1), complex(2, 4), complex(3, -12)],
            dtype=pd.SparseDtype(np.complex128, np.nan),
        ),
        # np.bool was removed in NumPy 1.24; np.bool_ is the supported spelling
        "bool_sparse": pd.Series(
            [True, False, False],
            dtype=pd.SparseDtype(np.bool_, False),
        ),
        "str_obj_sparse": pd.Series(
            pd.arrays.SparseArray([None, None, "gold", "black", "silver"]),
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35762
        # pd.Series([NoneT, 0, 1, 2, 3, 4], name="datetime_sparse", dtype=pd.SparseDtype(np.datetime64)),
        # Pandas dtypes
        "pd_int64_sparse": pd.Series(
            [0, 1, 2, 3, None],
            dtype=pd.SparseDtype(pd.Int64Dtype()),
        ),
        # Pending https://github.com/pandas-dev/pandas/issues/35793
        # pd.Series(
        #     ["a", "b", "c", None],
        #     name="pd_categorical_sparse",
        #     dtype=pd.SparseDtype(pd.CategoricalDtype(['a', 'b', 'c', 'd']))
        # )
    }

    if pandas_version[0] >= 1 and not_pandas_1_0_5:
        test_series["pd_string_sparse"] = pd.Series(
            ["Patty", "Valentine", "Upper", "", "", ""],
            dtype=pd.SparseDtype(pd.StringDtype(), ""),
        )
        test_series["pd_bool_sparse"] = pd.Series(
            [True, False, False, None],
            dtype=pd.SparseDtype(pd.BooleanDtype(), None),
        )

    return test_series
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
):
    kwargs["use_pandas_metadata"] = True
    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        if LooseVersion(self.api.__version__) >= "0.16":
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
            }
            to_pandas_kwargs["types_mapper"] = mapping.get
        else:
            # NOTE: the original message was missing the closing parenthesis
            raise ValueError(
                "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                f"({self.api.__version__} is installed)"
            )
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
def test_is_bool_dtype():
    assert not com.is_bool_dtype(int)
    assert not com.is_bool_dtype(str)
    assert not com.is_bool_dtype(pd.Series([1, 2]))
    assert not com.is_bool_dtype(np.array(["a", "b"]))
    assert not com.is_bool_dtype(pd.Index(["a", "b"]))

    assert com.is_bool_dtype(bool)
    # np.bool was removed in NumPy 1.24; np.bool_ is the scalar type to test
    assert com.is_bool_dtype(np.bool_)
    assert com.is_bool_dtype(np.array([True, False]))
    assert com.is_bool_dtype(pd.Index([True, False]))

    assert com.is_bool_dtype(pd.BooleanDtype())
    assert com.is_bool_dtype(pd.array([True, False, None], dtype="boolean"))
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
) -> DataFrame:
    kwargs["use_pandas_metadata"] = True
    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        import pandas as pd

        mapping = {
            self.api.int8(): pd.Int8Dtype(),
            self.api.int16(): pd.Int16Dtype(),
            self.api.int32(): pd.Int32Dtype(),
            self.api.int64(): pd.Int64Dtype(),
            self.api.uint8(): pd.UInt8Dtype(),
            self.api.uint16(): pd.UInt16Dtype(),
            self.api.uint32(): pd.UInt32Dtype(),
            self.api.uint64(): pd.UInt64Dtype(),
            self.api.bool_(): pd.BooleanDtype(),
            self.api.string(): pd.StringDtype(),
            self.api.float32(): pd.Float32Dtype(),
            self.api.float64(): pd.Float64Dtype(),
        }
        to_pandas_kwargs["types_mapper"] = mapping.get
    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
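# --- Hedged minimal sketch (added; not part of the readers above) -----------
# The core mechanism both read() implementations rely on: pyarrow's
# Table.to_pandas(types_mapper=...) consults the callable once per Arrow type
# and uses the returned pandas extension dtype whenever it is not None.
import pandas as pd
import pyarrow as pa

mapping = {
    pa.int64(): pd.Int64Dtype(),
    pa.bool_(): pd.BooleanDtype(),
    pa.string(): pd.StringDtype(),
}
table = pa.table({"flag": pa.array([True, None, False])})
df = table.to_pandas(types_mapper=mapping.get)
print(df.dtypes)  # flag: boolean (nullable extension dtype) instead of object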
def test_numeric_nullable_dtypes(self):
    # NOTE: pd.StringDtype() appeared twice in the original list; one copy suffices
    dtypes = [
        pd.StringDtype(),
        pd.BooleanDtype(),
        pd.Float64Dtype(),
        pd.Float32Dtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
    ]
    # TODO: Re-add (".xml", "xml"),
    # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
    for suffix, fn in [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]:  # TODO: include xml
        for dtype in dtypes:
            with tmpfile(suffix) as path:
                try:
                    df = Ind2Col2.convert(Ind2Col2(sample_data_ind2_col2_pd_na())).astype(dtype)
                    assert list(df.index.names) == ["qqq", "rrr"]
                    assert list(df.columns) == ["abc", "xyz"]
                    getattr(df, "to_" + fn)(path)
                    df2 = getattr(Ind2Col2, "read_" + fn)(path)
                    assert list(df2.index.names) == ["qqq", "rrr"]
                    assert list(df2.columns) == ["abc", "xyz"]
                except Exception:
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.Series(data, dtype=pd_dtype)
    gd_data = cudf.Series.from_pandas(pd_data)
    assert gd_data.dtype == expect_dtype

    # check mask
    expect_mask = [x is not pd.NA for x in pd_data]
    got_mask = mask_to_bools(gd_data._column.base_mask, 0, len(gd_data)).to_array()
    np.testing.assert_array_equal(expect_mask, got_mask)
def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
    if pd_dtype == pd.StringDtype():
        data = ["a", pd.NA, "c", pd.NA, "e"]
    elif pd_dtype == pd.BooleanDtype():
        data = [True, pd.NA, False, pd.NA, True]
    else:
        data = [1, pd.NA, 3, pd.NA, 5]

    pd_data = pd.DataFrame.from_dict({"a": data}, dtype=pd_dtype)
    gd_data = cudf.DataFrame.from_pandas(pd_data)
    assert gd_data["a"].dtype == expect_dtype

    # check mask
    expect_mask = [x is not pd.NA for x in pd_data["a"]]
    got_mask = mask_to_bools(
        gd_data["a"]._column.base_mask, 0, len(gd_data)
    ).values_host
    np.testing.assert_array_equal(expect_mask, got_mask)
def test_orc_read_skiprows(tmpdir):
    buff = BytesIO()
    df = pd.DataFrame(
        {"a": [1, 0, 1, 0, None, 1, 1, 1, 0, None, 0, 0, 1, 1, 1, 1]},
        dtype=pd.BooleanDtype(),
    )
    writer = pyorc.Writer(buff, pyorc.Struct(a=pyorc.Boolean()))
    tuples = list(
        map(
            lambda x: (None,) if x[0] is pd.NA else x,
            list(df.itertuples(index=False, name=None)),
        )
    )
    writer.writerows(tuples)
    writer.close()

    skiprows = 10
    expected = cudf.read_orc(buff)[skiprows:].reset_index(drop=True)
    got = cudf.read_orc(buff, skiprows=skiprows)
    assert_eq(expected, got)
def get_dummy_deltas(employees_path):
    employees = pd.read_csv(employees_path, nrows=10)

    # Create deleted flag
    employees["record_deleted"] = False
    employees["record_deleted"] = employees["record_deleted"].astype(pd.BooleanDtype())

    # Cast to new int cols
    for col in ["employee_id", "department_id", "manager_id"]:
        employees[col] = employees[col].astype(pd.Int64Dtype())

    # Cast to new str cols
    for col in ["sex", "forename", "surname"]:
        employees[col] = employees[col].astype(pd.StringDtype())

    # Let's split up the data and make some changes
    day1 = employees[employees.employee_id.isin([1, 2, 3, 4, 5])].reset_index(drop=True)

    day2 = employees[employees.employee_id.isin([5, 6, 7])].reset_index(drop=True)
    day2.loc[0, "department_id"] = 2
    day2.loc[0, "manager_id"] = 18

    day3 = employees[employees.employee_id.isin([1, 7, 9, 10, 11])].reset_index(drop=True)
    day3.department_id = 2
    day3.manager_id = 5

    # Reset this person's values for clarity
    day3.loc[0, "record_deleted"] = True
    day3.loc[0, "department_id"] = 1
    day3.loc[0, "manager_id"] = 17

    deltas = {"day1": day1, "day2": day2, "day3": day3}
    return deltas
def test_df_pdv1_types():
    pdv1_test_mapping = {
        'int8col': {'vals': [1, 2, 3], 'pd_type': pd.Int8Dtype()},
        'int16col': {'vals': [1, 2, 3], 'pd_type': pd.Int16Dtype()},
        'int32col': {'vals': [1, 2, 3], 'pd_type': pd.Int32Dtype()},
        'int64col': {'vals': [1, 2, 3], 'pd_type': pd.Int64Dtype()},
        'stringcol': {'vals': ['one', 'two', 'three'], 'pd_type': pd.StringDtype()},
        'boolcol': {'vals': [True, False, True], 'pd_type': pd.BooleanDtype()},
    }
    pdv1_df = pd.DataFrame({
        col_name: col_meta['vals'] for col_name, col_meta in pdv1_test_mapping.items()
    })
    pdv1_df = pdv1_df.astype({
        col_name: col_meta['pd_type'] for col_name, col_meta in pdv1_test_mapping.items()
    })
    return pdv1_df
def test_to_table_nullable(self):
    boolean_array = pd.array([True, False, None], dtype=pd.BooleanDtype())
    int8_array = pd.array([1, 2, None], dtype=pd.Int8Dtype())
    int16_array = pd.array([1, 2, None], dtype=pd.Int16Dtype())
    int32_array = pd.array([1, 2, None], dtype=pd.Int32Dtype())
    int64_array = pd.array([1, 2, None], dtype=pd.Int64Dtype())
    float_array = pd.array([1.1, 2.2, None], dtype=pd.Float32Dtype())
    double_array = pd.array([1.1, 2.2, None], dtype=pd.Float64Dtype())
    string_array = pd.array(["s11", "s22", None], dtype=pd.StringDtype())
    object_array = pd.array([pd.NA, "s22", None], dtype=object)

    df = pd.DataFrame({
        "NullableBoolean": boolean_array,
        "NullableInt8": int8_array,
        "NullableInt16": int16_array,
        "NullableInt32": int32_array,
        "NullableInt64": int64_array,
        "NullableFloat": float_array,
        "NullableDouble": double_array,
        "NullableString": string_array,
        "NullableObject": object_array,
    })

    table = to_table(df)
    self.assertIs(table.columns[0].data_type, dtypes.bool_)
    self.assertIs(table.columns[1].data_type, dtypes.int8)
    self.assertIs(table.columns[2].data_type, dtypes.int16)
    self.assertIs(table.columns[3].data_type, dtypes.int32)
    self.assertIs(table.columns[4].data_type, dtypes.int64)
    self.assertIs(table.columns[5].data_type, dtypes.float32)
    self.assertIs(table.columns[6].data_type, dtypes.double)
    self.assertIs(table.columns[7].data_type, dtypes.string)
    self.assertIs(table.columns[8].data_type, dtypes.PyObject)
    self.assertEqual(table.size, 3)

    table_string = table.to_string()
    self.assertEqual(9, table_string.count("null"))
def j_type_to_py_type(t):
    typeclass = t.getTypeClass()
    typeclass_name = typeclass.getName()
    if typeclass_name in ['java.lang.Double', 'java.lang.Float', 'double', 'float']:
        return np.float64
    elif typeclass_name in ['java.lang.Long', 'java.lang.Integer', 'int', 'long']:
        return pd.Int64Dtype()
    elif typeclass_name == 'java.lang.String':
        return pd.StringDtype()
    elif typeclass_name == 'java.sql.Timestamp':
        return np.datetime64
    elif typeclass_name in [
        'com.alibaba.alink.common.linalg.Vector',
        'com.alibaba.alink.common.linalg.DenseVector',
        'com.alibaba.alink.common.linalg.SparseVector',
    ]:
        return pd.StringDtype()
    elif typeclass_name in ['java.lang.Boolean', 'boolean']:
        return pd.BooleanDtype()
    else:
        print(
            "Java type is not supported in Python for automatic conversion of values: %s"
            % typeclass_name)
        return t
import cudf
from cudf.testing._utils import assert_eq
from cudf.utils.dtypes import (
    pandas_dtypes_to_cudf_dtypes,
    pyarrow_dtypes_to_pandas_dtypes,
)

ALL_POSSIBLE_VALUES = "ALL_POSSIBLE_VALUES"

_PANDAS_TO_AVRO_SCHEMA_MAP = {
    cudf.dtype("int8"): "int",
    pd.Int8Dtype(): ["int", "null"],
    pd.Int16Dtype(): ["int", "null"],
    pd.Int32Dtype(): ["int", "null"],
    pd.Int64Dtype(): ["long", "null"],
    pd.BooleanDtype(): ["boolean", "null"],
    pd.StringDtype(): ["string", "null"],
    cudf.dtype("bool_"): "boolean",
    cudf.dtype("int16"): "int",
    cudf.dtype("int32"): "int",
    cudf.dtype("int64"): "long",
    cudf.dtype("O"): "string",
    cudf.dtype("str"): "string",
    cudf.dtype("float32"): "float",
    cudf.dtype("float64"): "double",
    cudf.dtype("<M8[ns]"): {"type": "long", "logicalType": "timestamp-millis"},
    cudf.dtype("<M8[ms]"): {"type": "long", "logicalType": "timestamp-millis"},
    cudf.dtype("<M8[us]"): {"type": "long", "logicalType": "timestamp-micros"},
}

PANDAS_TO_ORC_TYPES = {