def test_lahman_batting_teams(lahman_batting, lahman_teams):
    """Check that batting totals summed per team reproduce the teams table.

    Batting rows are summed per (team_id, year) and the shared statistic
    columns are compared with the teams table sorted by the same keys.
    Both input frames have their narrow unsigned columns widened in place.
    """
    group_keys = ['team_id', 'year']
    skipped = {'lg_id', 'team_id', 'year', 'g'}

    # statistics present in both tables, minus identifiers and games played
    shared = (set(lahman_batting.columns) & set(lahman_teams.columns)) - skipped
    cols = list(shared)
    assert len(cols) == 12

    # work-around for Pandas 1.0.1 bugs
    # sum does not up-cast for nullable integer types
    # select_dtypes does not distinguish between nullable and non-nullable int types
    narrow_uints = [pd.UInt8Dtype(), pd.UInt16Dtype()]
    for frame in (lahman_batting, lahman_teams):
        is_narrow = frame[cols].dtypes.isin(narrow_uints)
        for col in frame[cols].columns[is_narrow]:
            frame[col] = frame[col].astype('Int32')

    batting_totals = (
        lahman_batting[group_keys + cols]
        .groupby(group_keys)
        .agg('sum')
        .reset_index()
    )
    team_rows = (
        lahman_teams[group_keys + cols]
        .sort_values(group_keys)
        .reset_index(drop=True)
    )

    # ensure the dtypes are the same
    for col in team_rows.columns:
        if col in ('team_id', 'year'):
            continue
        batting_totals[col] = batting_totals[col].astype('int')
        team_rows[col] = team_rows[col].astype('int')

    assert batting_totals[cols].equals(team_rows[cols])
def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
    """
    Calculates the average score of a subject and adds a new "score" column
    with it.

    If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
    score should be NA. Otherwise, the score is simply the mean of the other
    grades. The datatype of score is UInt8, and the floating point raw numbers
    should be rounded down.

    Parameters
    ----------
    maximal_nans_per_sub : int, optional
        Number of allowed NaNs per subject before giving a NA score.

    Returns
    -------
    pd.DataFrame
        A new DF with a new column - "score". ``self.data`` is left untouched.
    """
    question_cols = ['q1', 'q2', 'q3', 'q4', 'q5']

    # Work on a copy so the stored data is not modified, as documented.
    new_data = self.data.copy()

    # Converting to float turns both real NaN values and "nan" strings into
    # NaN, so missing grades are counted regardless of how they were stored.
    grades = new_data[question_cols].astype(float)
    nan_counts = grades.isna().sum(axis=1)

    # The row mean skips NaN; a row with no grades at all yields NaN.
    floored = np.floor(grades.mean(axis=1))
    floored = floored.mask(nan_counts > maximal_nans_per_sub)

    # NaN becomes <NA> when converting to the nullable unsigned dtype.
    new_data["score"] = floored.astype(pd.UInt8Dtype())
    return new_data
def coerce_not_float_cols_nans(cls, self):
    """Coerce cols with floats and nans to the correct integer dtype.

    Every column named in ``self.not_float_cols_nans`` is converted in
    ``self.df`` to the narrowest nullable integer dtype whose range contains
    the column's minimum and maximum. Columns with a non-negative minimum
    get an unsigned dtype, the others a signed one. A column whose values
    fit no 64-bit integer type (or whose min/max is NaN) is left unchanged.

    Returns
    -------
    None
        ``self.df`` is modified in place.
    """
    # (dtype, lowest representable value, highest representable value),
    # ordered from narrowest to widest.
    unsigned_candidates = [
        (pd.UInt8Dtype(), 0, 2 ** 8 - 1),
        (pd.UInt16Dtype(), 0, 2 ** 16 - 1),
        (pd.UInt32Dtype(), 0, 2 ** 32 - 1),
        (pd.UInt64Dtype(), 0, 2 ** 64 - 1),
    ]
    signed_candidates = [
        (pd.Int8Dtype(), -(2 ** 7), 2 ** 7 - 1),
        (pd.Int16Dtype(), -(2 ** 15), 2 ** 15 - 1),
        (pd.Int32Dtype(), -(2 ** 31), 2 ** 31 - 1),
        (pd.Int64Dtype(), -(2 ** 63), 2 ** 63 - 1),
    ]

    for col in self.not_float_cols_nans:
        column = self.df[col]
        lowest = column.min()
        highest = column.max()

        # Plain Python numbers compare exactly against the integer bounds,
        # whereas a NumPy scalar would first round the bound to a float.
        if hasattr(lowest, "item"):
            lowest = lowest.item()
        if hasattr(highest, "item"):
            highest = highest.item()

        candidates = unsigned_candidates if lowest >= 0 else signed_candidates
        for dtype, low_bound, high_bound in candidates:
            if low_bound <= lowest and highest <= high_bound:
                self.df[col] = column.astype(dtype)
                break
def integer_type_mapping(
        use_extension_types: bool) -> Mapping[IntegerType, DtypeObj]:
    """Return the dtype used to hold each ``IntegerType`` member.

    When ``use_extension_types`` is true the values are nullable pandas
    extension dtypes; otherwise they are plain NumPy scalar types. The
    24-bit members are stored in 32-bit types.
    """
    # Each row: (integer type, nullable pandas dtype, NumPy type).
    # NOTE(review): UINT24 maps to a *signed* Int32Dtype in the extension
    # column but to np.uint32 in the NumPy column - confirm this is intended.
    table = [
        (IntegerType.INT8, pd.Int8Dtype(), np.int8),
        (IntegerType.UINT8, pd.UInt8Dtype(), np.uint8),
        (IntegerType.INT16, pd.Int16Dtype(), np.int16),
        (IntegerType.UINT16, pd.UInt16Dtype(), np.uint16),
        (IntegerType.INT24, pd.Int32Dtype(), np.int32),
        (IntegerType.UINT24, pd.Int32Dtype(), np.uint32),
        (IntegerType.INT32, pd.Int32Dtype(), np.int32),
        (IntegerType.UINT32, pd.UInt32Dtype(), np.uint32),
        (IntegerType.INT64, pd.Int64Dtype(), np.int64),
        (IntegerType.UINT64, pd.UInt64Dtype(), np.uint64),
    ]
    value_position = 1 if use_extension_types else 2
    return {row[0]: row[value_position] for row in table}
def pyarrow2pandas_extension(
    dtype: pa.DataType,
) -> Optional[pd.api.extensions.ExtensionDtype]:
    """Pyarrow to Pandas data types conversion.

    Returns a new pandas extension dtype instance for integer, boolean and
    string Arrow types, or ``None`` when ``dtype`` matches none of them.
    """
    # Checked in order; the first predicate that accepts ``dtype`` wins.
    conversions = (
        (pa.types.is_int8, pd.Int8Dtype),
        (pa.types.is_int16, pd.Int16Dtype),
        (pa.types.is_int32, pd.Int32Dtype),
        (pa.types.is_int64, pd.Int64Dtype),
        (pa.types.is_uint8, pd.UInt8Dtype),
        (pa.types.is_uint16, pd.UInt16Dtype),
        (pa.types.is_uint32, pd.UInt32Dtype),
        (pa.types.is_uint64, pd.UInt64Dtype),
        (pa.types.is_boolean, pd.BooleanDtype),
        (pa.types.is_string, pd.StringDtype),
    )
    for matches, make_dtype in conversions:
        if matches(dtype):
            return make_dtype()
    return None
def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
    """Add a floored mean "score" column computed from the grade columns.

    The grades are taken to be the last five columns of ``self.data``. A
    subject with more than "maximal_nans_per_sub" missing grades gets NA;
    every other subject gets the mean of the available grades, rounded
    down. The "score" column has dtype UInt8.

    Parameters
    ----------
    maximal_nans_per_sub : int, optional
        Number of allowed NaNs per subject before giving a NA score.

    Returns
    -------
    pd.DataFrame
        A copy of the data with a new column - "score".
    """
    scored = self.data.copy()
    answers = scored.iloc[:, -5:]

    # Rows whose number of missing grades exceeds the allowance.
    too_many_missing = answers.isna().sum(axis=1) > maximal_nans_per_sub

    floored_mean = np.floor(answers.mean(axis=1))
    scored["score"] = floored_mean.astype(pd.UInt8Dtype())
    scored.loc[too_many_missing, "score"] = pd.NA
    return scored
def __init__(self, pandas_obj):
    """Validate ``pandas_obj``, store it, and set up the dtype lookup tables.

    Parameters
    ----------
    pandas_obj
        Object handed to ``self._validate`` and then kept as ``self._obj``.
    """
    # validate and assign object
    self._validate(pandas_obj)
    self._obj = pandas_obj

    # define incorporated modules - columns consisting of others will not have the dtype changed
    self._INCORPORATED_MODULES = ['builtins', 'numpy', 'pandas']

    # define a possible list of null values
    self._NULL_VALS = [
        None, np.nan, 'np.nan', 'nan', np.inf, 'np.inf', 'inf', -np.inf,
        '-np.inf', '', 'n/a', 'na', 'N/A', 'NA', 'unknown', 'unk',
        'UNKNOWN', 'UNK'
    ]

    # assign dtypes and limits
    # boolean
    BOOL_STRINGS_TRUE = ['t', 'true', 'yes', 'on']
    BOOL_STRINGS_FALSE = ['f', 'false', 'no', 'off']
    # Merge both mappings into one dict. ``dict.update`` returns None, so
    # chaining it onto a literal would store None instead of the mapping.
    self._BOOL_MAP_DICT = {
        **{text: True for text in BOOL_STRINGS_TRUE},
        **{text: False for text in BOOL_STRINGS_FALSE},
    }
    # ``np.bool_`` is the NumPy boolean scalar type; the bare ``np.bool``
    # alias is not available in every NumPy release.
    self._DTYPE_BOOL_BASE = np.bool_
    self._DTYPE_BOOL_NULLABLE = pd.BooleanDtype()

    # unsigned integers - base and nullable
    self._DTYPES_UINT_BASE = [np.uint8, np.uint16, np.uint32, np.uint64]
    self._DTYPES_UINT_NULLABLE = [
        pd.UInt8Dtype(),
        pd.UInt16Dtype(),
        pd.UInt32Dtype(),
        pd.UInt64Dtype()
    ]
    self._LIMIT_LOW_UINT = [
        np.iinfo(i).min for i in self._DTYPES_UINT_BASE
    ]
    self._LIMIT_HIGH_UINT = [
        np.iinfo(i).max for i in self._DTYPES_UINT_BASE
    ]

    # signed integers - base and nullable
    self._DTYPES_INT_BASE = [np.int8, np.int16, np.int32, np.int64]
    self._DTYPES_INT_NULLABLE = [
        pd.Int8Dtype(),
        pd.Int16Dtype(),
        pd.Int32Dtype(),
        pd.Int64Dtype()
    ]
    self._LIMIT_LOW_INT = [np.iinfo(i).min for i in self._DTYPES_INT_BASE]
    self._LIMIT_HIGH_INT = [np.iinfo(i).max for i in self._DTYPES_INT_BASE]

    # floats - nullable by default
    self._DTYPES_FLOAT = [np.float16, np.float32, np.float64]

    # datetime - nullable by default
    self._DTYPE_DATETIME = np.datetime64

    # string
    self._DTYPE_STRING = pd.StringDtype()

    # categorical - nullable by default
    self._DTYPE_CATEGORICAL = pd.CategoricalDtype()
def add_freq_notes_df(sid, ssfs_df):
    """Left-merge per-frequency lookup columns onto ``ssfs_df``.

    For every distinct non-null ``freq1`` value a ``real_freq`` (the value
    times ``sid.freq_scaler``) and a ``closest_note`` (element 1 of
    ``closest_midi(real_freq)``) are computed. An all-NA row is added to the
    lookup table before it is merged on ``freq1`` with the result of
    ``set_sid_dtype(ssfs_df)``.
    """
    # register value -> scaled frequency
    real_freqs = {}
    for freq in ssfs_df['freq1'].unique():
        if pd.notna(freq):
            real_freqs[freq] = freq * sid.freq_scaler

    # scaled frequency -> closest note
    closest_notes = {}
    for real_freq in real_freqs.values():
        closest_notes[real_freq] = closest_midi(real_freq)[1]

    records = []
    for freq, real_freq in real_freqs.items():
        records.append((freq, real_freq, closest_notes[real_freq]))
    records.append((pd.NA, pd.NA, pd.NA))

    lookup = pd.DataFrame.from_records(
        records, columns=['freq1', 'real_freq', 'closest_note'])
    freq_notes_df = lookup.astype(pd.Float64Dtype())
    freq_notes_df['freq1'] = freq_notes_df['freq1'].astype(pd.UInt16Dtype())
    freq_notes_df['closest_note'] = freq_notes_df['closest_note'].astype(
        pd.UInt8Dtype())

    typed_ssfs = set_sid_dtype(ssfs_df)
    return typed_ssfs.merge(freq_notes_df, how='left', on='freq1')
def test_intdtypes() -> None:
    """Instantiate every signed and unsigned nullable integer dtype of pandas.

    Nothing is asserted; the test passes when each constructor can be called
    without arguments.
    """
    # NOTE(review): one statement per constructor looks deliberate (e.g. so a
    # static type checker sees each call individually) - confirm before
    # folding these into a loop.
    pd.Int8Dtype()
    pd.Int16Dtype()
    pd.Int32Dtype()
    pd.Int64Dtype()
    pd.UInt8Dtype()
    pd.UInt16Dtype()
    pd.UInt32Dtype()
    pd.UInt64Dtype()
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
):
    """Read a parquet file through ``self.api`` and convert it to pandas.

    Parameters
    ----------
    path
        Location passed to ``_get_path_or_handle``.
    columns : optional
        Forwarded to ``read_table`` as ``columns``.
    use_nullable_dtypes : bool, default False
        When true, integer, boolean and string columns are mapped to the
        nullable pandas extension dtypes.
    storage_options : optional
        Forwarded to ``_get_path_or_handle``.
    **kwargs
        Forwarded to ``read_table``. ``use_pandas_metadata`` is always set to
        True and ``filesystem`` is replaced by the resolved filesystem.

    Returns
    -------
    The object produced by ``to_pandas`` (converted with ``_as_manager`` when
    the ``mode.data_manager`` option is ``"array"``).

    Raises
    ------
    ValueError
        If ``use_nullable_dtypes`` is true and ``self.api.__version__`` is
        older than 0.16.
    """
    kwargs["use_pandas_metadata"] = True

    to_pandas_kwargs = {}
    if use_nullable_dtypes:
        if LooseVersion(self.api.__version__) >= "0.16":
            import pandas as pd

            mapping = {
                self.api.int8(): pd.Int8Dtype(),
                self.api.int16(): pd.Int16Dtype(),
                self.api.int32(): pd.Int32Dtype(),
                self.api.int64(): pd.Int64Dtype(),
                self.api.uint8(): pd.UInt8Dtype(),
                self.api.uint16(): pd.UInt16Dtype(),
                self.api.uint32(): pd.UInt32Dtype(),
                self.api.uint64(): pd.UInt64Dtype(),
                self.api.bool_(): pd.BooleanDtype(),
                self.api.string(): pd.StringDtype(),
            }
            # ``dict.get`` returns None for unmapped types, which leaves the
            # conversion of those columns to the default behaviour.
            to_pandas_kwargs["types_mapper"] = mapping.get
        else:
            raise ValueError(
                "'use_nullable_dtypes=True' is only supported for pyarrow >= 0.16 "
                f"({self.api.__version__} is installed)"
            )

    manager = get_option("mode.data_manager")
    if manager == "array":
        to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]

    path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        result = self.api.parquet.read_table(
            path_or_handle, columns=columns, **kwargs
        ).to_pandas(**to_pandas_kwargs)
        if manager == "array":
            result = result._as_manager("array", copy=False)
        return result
    finally:
        # Close any handle opened above, whether or not reading succeeded.
        if handles is not None:
            handles.close()
def set_sid_dtype(df):
    """Cast columns of ``df`` to nullable unsigned dtypes chosen by name.

    Columns starting with ``freq`` or ``pwduty``, and the column ``fltcoff``,
    become UInt16. Remaining columns whose name ends in a digit or starts
    with ``flt`` become UInt8. All other columns are left as they are. The
    frame is modified in place and also returned.
    """

    def pick_dtype(col):
        # 16-bit columns are checked first, so 'fltcoff' never falls through
        # to the 8-bit 'flt' rule below.
        if col.startswith(('freq', 'pwduty')) or col == 'fltcoff':
            return pd.UInt16Dtype()
        if col[-1].isdigit() or col.startswith('flt'):
            return pd.UInt8Dtype()
        return None

    # NOTE(review): this only stores an attribute named ``dtype`` on the
    # frame object; it does not appear to convert any column - confirm
    # whether it is still needed.
    df.dtype = pd.UInt64Dtype()

    for col in df.columns:
        target = pick_dtype(col)
        if target is not None:
            df[col] = df[col].astype(target)
    return df
def test_numeric_dtypes(self):
    """Write and re-read the sample frame in every format for each dtype.

    For each file format one temporary path is created; the sample data is
    cast to every dtype in turn, written with ``to_<format>`` and read back
    with ``Ind2Col2.read_<format>``. Index and column names are checked
    before writing and after reading.
    """
    dtypes = [
        bool,
        np.byte,
        np.ubyte,
        np.short,
        np.ushort,
        np.single,
        np.int32,
        np.intc,
        np.half,
        np.float16,
        np.double,
        np.float64,
        pd.StringDtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
    ]
    # (file suffix, name used in the to_*/read_* method pair)
    formats = [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".xml", "xml"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]
    expected_index = ["qqq", "rrr"]
    expected_columns = ["abc", "xyz"]

    for suffix, fn in formats:
        writer_name = "to_" + fn
        reader_name = "read_" + fn
        with tmpfile(suffix) as path:
            for dtype in dtypes:
                try:
                    typed = Ind2Col2(sample_data_ind2_col2())
                    df = Ind2Col2.convert(typed).astype(dtype)
                    assert list(df.index.names) == expected_index
                    assert list(df.columns) == expected_columns
                    getattr(df, writer_name)(path)
                    df2 = getattr(Ind2Col2, reader_name)(path)
                    assert list(df2.index.names) == expected_index
                    assert list(df2.columns) == expected_columns
                except Exception:
                    # Record which combination failed, then let it propagate.
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_lahman_retro_fielding_data(fielding, lahman_fielding):
    """Check per-position Lahman fielding totals against Retrosheet totals.

    Both tables are summed per position over their shared statistic columns.
    The Retrosheet LF/CF/RF rows are folded into a single OF row, and every
    aggregated value must agree within 0.8%.
    """
    identifier_cols = {'player_id', 'pos', 'team_id', 'year'}
    outfield = ['LF', 'CF', 'RF']

    # find the common columns
    shared = set(lahman_fielding.columns) & set(fielding.columns)
    f_cols = list(shared - identifier_cols)

    # work-around for Pandas 1.0.1 bugs
    # sum does not up-cast for nullable integer types
    # select_dtypes does not distinguish between nullable and non-nullable int types
    is_narrow = lahman_fielding[f_cols].dtypes.isin(
        [pd.UInt8Dtype(), pd.UInt16Dtype()])
    for col in lahman_fielding[f_cols].columns[is_narrow]:
        lahman_fielding[col] = lahman_fielding[col].astype('Int32')

    l_sums = lahman_fielding.groupby('pos')[f_cols].agg('sum')
    l_sums = l_sums.sort_index()

    # there are 7 fielding attributes and 7 fielding positions in Lahman
    assert l_sums.shape == (7, 7)

    r_sums = fielding.groupby('pos')[f_cols].agg('sum').astype('int')

    # Lahman uses OF for sum of LF, CF, RF
    r_sums.loc['OF'] = r_sums.loc['LF'] + r_sums.loc['CF'] + r_sums.loc['RF']
    r_sums = r_sums.drop(outfield)
    r_sums = r_sums.sort_index()

    # there are now 7 fielding attributes and 7 fielding positions in Retrosheet sums
    assert r_sums.shape == (7, 7)

    # the indexes and columns should now be the same
    assert l_sums.index.equals(r_sums.index)
    assert l_sums.columns.equals(r_sums.columns)

    r_of = fielding[fielding['pos'].isin(outfield)]

    # account for outfielders who played more than 1 outfield position in the same game
    game_keys = ['player_id', 'game_id']
    total_dups = r_of.duplicated(subset=game_keys, keep=False).sum()
    counted_dups = r_of.duplicated(subset=game_keys, keep='first').sum()
    r_sums.loc['OF', 'g'] -= (total_dups - counted_dups)

    rel_accuracy = l_sums / r_sums

    # relative accuracy is within 0.8% for all 49 aggregated values
    within_tolerance = np.abs(1.0 - rel_accuracy) < 0.008
    assert within_tolerance.all().all()
def read(
    self,
    path,
    columns=None,
    use_nullable_dtypes=False,
    storage_options: StorageOptions = None,
    **kwargs,
) -> DataFrame:
    """Load a parquet file via ``self.api`` and return it as a DataFrame.

    ``use_pandas_metadata`` is always forced on. With
    ``use_nullable_dtypes`` the integer, boolean, string and float columns
    are mapped to the nullable pandas extension dtypes. Any handle opened
    for ``path`` is closed before returning, including on error.
    """
    kwargs["use_pandas_metadata"] = True

    conversion_options = {}
    if use_nullable_dtypes:
        import pandas as pd

        nullable_for = dict([
            (self.api.int8(), pd.Int8Dtype()),
            (self.api.int16(), pd.Int16Dtype()),
            (self.api.int32(), pd.Int32Dtype()),
            (self.api.int64(), pd.Int64Dtype()),
            (self.api.uint8(), pd.UInt8Dtype()),
            (self.api.uint16(), pd.UInt16Dtype()),
            (self.api.uint32(), pd.UInt32Dtype()),
            (self.api.uint64(), pd.UInt64Dtype()),
            (self.api.bool_(), pd.BooleanDtype()),
            (self.api.string(), pd.StringDtype()),
            (self.api.float32(), pd.Float32Dtype()),
            (self.api.float64(), pd.Float64Dtype()),
        ])
        # Unmapped types resolve to None through ``dict.get``.
        conversion_options["types_mapper"] = nullable_for.get

    manager = get_option("mode.data_manager")
    wants_array_manager = manager == "array"
    if wants_array_manager:
        conversion_options["split_blocks"] = True  # type: ignore[assignment]

    source, handles, kwargs["filesystem"] = _get_path_or_handle(
        path,
        kwargs.pop("filesystem", None),
        storage_options=storage_options,
        mode="rb",
    )
    try:
        table = self.api.parquet.read_table(source, columns=columns, **kwargs)
        result = table.to_pandas(**conversion_options)
        if wants_array_manager:
            result = result._as_manager("array", copy=False)
        return result
    finally:
        if handles is not None:
            handles.close()
def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
    """Add a "score" column holding each subject's floored mean grade.

    The grades are read from columns ``q1`` to ``q5`` of ``self.data``. A
    subject with more than ``maximal_nans_per_sub`` missing grades gets NA,
    as does a subject with no grades at all; otherwise the score is the mean
    of the available grades, rounded down. The column has dtype UInt8.

    Parameters
    ----------
    maximal_nans_per_sub : int, optional
        Number of allowed NaNs per subject before giving a NA score.

    Returns
    -------
    pd.DataFrame
        ``self.data`` itself, with the "score" column added or replaced.
    """
    df = self.data
    grades = df[['q1', 'q2', 'q3', 'q4', 'q5']]

    missing_per_subject = grades.isna().sum(axis=1)

    # The row mean skips NaN; a row without any grade yields NaN, which
    # becomes <NA> below instead of failing on an int() conversion.
    floored = np.floor(grades.mean(axis=1))
    floored = floored.mask(missing_per_subject > maximal_nans_per_sub)

    df['score'] = floored.astype(pd.UInt8Dtype())
    self.data = df
    return self.data
def test_numeric_nullable_dtypes(self):
    """Write and re-read the NA-containing sample frame for nullable dtypes.

    For every file format and every nullable dtype a fresh temporary path is
    created; the sample data is cast, written with ``to_<format>`` and read
    back with ``Ind2Col2.read_<format>``. Index and column names are checked
    before writing and after reading.
    """
    dtypes = [
        pd.StringDtype(),
        pd.BooleanDtype(),
        pd.Float64Dtype(),
        pd.Float32Dtype(),
        pd.Int64Dtype(),
        pd.UInt64Dtype(),
        pd.Int32Dtype(),
        pd.UInt32Dtype(),
        pd.Int16Dtype(),
        pd.UInt16Dtype(),
        pd.Int8Dtype(),
        pd.UInt8Dtype(),
        pd.StringDtype(),
    ]
    # TODO: Re-add (".xml", "xml"),
    # TODO: See https://github.com/dmyersturnbull/typed-dfs/issues/46
    formats = [
        (".snappy", "parquet"),
        (".feather", "feather"),
        (".csv", "csv"),
        (".tsv", "tsv"),
        (".json", "json"),
        (".xlsx", "xlsx"),
        (".xls", "xls"),
        (".xlsb", "xlsb"),
        (".ods", "ods"),
        (".pickle", "pickle"),
    ]
    expected_index = ["qqq", "rrr"]
    expected_columns = ["abc", "xyz"]

    for suffix, fn in formats:
        writer_name = "to_" + fn
        reader_name = "read_" + fn
        # TODO: include xml
        for dtype in dtypes:
            with tmpfile(suffix) as path:
                try:
                    typed = Ind2Col2(sample_data_ind2_col2_pd_na())
                    df = Ind2Col2.convert(typed).astype(dtype)
                    assert list(df.index.names) == expected_index
                    assert list(df.columns) == expected_columns
                    getattr(df, writer_name)(path)
                    df2 = getattr(Ind2Col2, reader_name)(path)
                    assert list(df2.index.names) == expected_index
                    assert list(df2.columns) == expected_columns
                except Exception:
                    # Record which combination failed, then let it propagate.
                    logger.error(f"Failed on path {path}, dtype {dtype}")
                    raise
def test_to_pandas_dtype_integer_nullable():
    """Yield one dtype check per (min, max) integer constraint.

    Each yielded tuple holds the nullable-aware checker, the field type
    "integer", the constraints built from the bounds, and the pandas type
    expected for those bounds.
    """
    # ((min, max), expected type)
    cases = [
        ((-100, 100), pd.Int8Dtype()),
        ((0, 240), pd.UInt8Dtype()),
        ((-10000, 10000), pd.Int16Dtype()),
        ((500, 40000), pd.UInt16Dtype()),
        ((-200000000, 200000000), pd.Int32Dtype()),
        ((25, 4000000000), pd.UInt32Dtype()),
        ((-9000000000000000000, 2000000000), pd.Int64Dtype()),
        ((25, 10000000000000000000), pd.UInt64Dtype()),
        ((25, 1000000000000000000000000000), np.float128),
        ((None, None), pd.Int64Dtype()),
    ]
    for bounds, expected_pandas_type in cases:
        lower, upper = bounds
        constraints = RecordsSchemaFieldIntegerConstraints(
            required=True, unique=None, min_=lower, max_=upper)
        checker = with_nullable(True, check_dtype)
        yield checker, "integer", constraints, expected_pandas_type
def score_subjects(self, maximal_nans_per_sub: int = 1) -> pd.DataFrame:
    """Calculates the average score of a subject and adds a new "score" column
    with it.

    If the subject has more than "maximal_nans_per_sub" NaN in his grades, the
    score should be NA. Otherwise, the score is simply the mean of the other
    grades. The datatype of score is UInt8, and the floating point raw numbers
    should be rounded down.

    Parameters
    ----------
    maximal_nans_per_sub : int, optional
        Number of allowed NaNs per subject before giving a NA score.

    Returns
    -------
    pd.DataFrame
        A new DF with a new column - "score".
    """
    data_df = self.read_data()

    # Grade columns, selected by label from 'q1' through 'q5' inclusive.
    questions_ans = data_df.loc[:, 'q1':'q5']

    # A boolean mask shares the frame's index, so rows are matched by label.
    # Writing through a positional counter would hit the wrong rows whenever
    # the index is not 0..n-1.
    too_many_nans = questions_ans.isnull().sum(axis=1) > maximal_nans_per_sub

    floored_mean_score = np.floor(questions_ans.mean(axis=1))
    floored_mean_score = floored_mean_score.mask(too_many_nans)

    # NaN becomes <NA> in the nullable unsigned dtype.
    data_df['score'] = floored_mean_score.astype(pd.UInt8Dtype())
    return data_df
# Copyright (c) 2020, NVIDIA CORPORATION. import random import pandas as pd import pyarrow as pa pyarrow_dtypes_to_pandas_dtypes = { pa.uint8(): pd.UInt8Dtype(), pa.uint16(): pd.UInt16Dtype(), pa.uint32(): pd.UInt32Dtype(), pa.uint64(): pd.UInt64Dtype(), pa.int8(): pd.Int8Dtype(), pa.int16(): pd.Int16Dtype(), pa.int32(): pd.Int32Dtype(), pa.int64(): pd.Int64Dtype(), pa.bool_(): pd.BooleanDtype(), pa.string(): pd.StringDtype(), } def _generate_rand_meta(obj, dtypes_list): obj._current_params = {} num_rows = obj._rand(obj._max_rows) num_cols = obj._rand(obj._max_columns) dtypes_meta = [] for _ in range(num_cols): dtype = random.choice(dtypes_list) null_frequency = random.uniform(0, 1)
sliced_sr[3] = None assert sliced_sr.memory_usage() == 80 sr = cudf.Series(["hello world", "rapids ai", "abc", "z"]) assert sr.memory_usage() == 44 assert sr[3:].memory_usage() == 9 # z assert sr[:1].memory_usage() == 19 # hello world @pytest.mark.parametrize( "sr,expected_psr", [ ( cudf.Series([1, 2, None, 3], dtype="uint8"), pd.Series([1, 2, None, 3], dtype=pd.UInt8Dtype()), ), ( cudf.Series([23, None, None, 32], dtype="uint16"), pd.Series([23, None, None, 32], dtype=pd.UInt16Dtype()), ), ( cudf.Series([None, 123, None, 1], dtype="uint32"), pd.Series([None, 123, None, 1], dtype=pd.UInt32Dtype()), ), ( cudf.Series([234, 2323, 23432, None, None, 224], dtype="uint64"), pd.Series([234, 2323, 23432, None, None, 224], dtype=pd.UInt64Dtype()), ), (
def pandas_type_casting(df):
    """Cast columns of ``df`` to the dtypes collected in ``d`` and report memory.

    Relies on module-level state defined outside this function: it rebinds
    the global ``n`` (per-column min/max of the numeric columns), calls
    ``get_cols_names`` once per integer range, and reads and writes the
    mapping ``d`` (column name -> target dtype).
    NOTE(review): ``get_cols_names`` presumably uses ``n`` to add the columns
    fitting each range to ``d`` - confirm against its definition.

    Object columns whose first non-null values parse as dates are excluded
    from the categorical candidates; the remaining object columns whose share
    of distinct values is below 5% are registered in ``d`` as "category".
    A before/after memory summary is printed.

    Returns
    -------
    The same ``df`` object, with the columns listed in ``d`` converted.
    """
    import numpy as np
    import pandas as pd
    global n

    # memory per column in MiB before any conversion
    old = df.memory_usage() / 1024/1024

    #numeric cols
    number_cols = list(df.select_dtypes("number").columns)
    n = df[number_cols].fillna(0).agg([min,max]).T.add_suffix("_")
    get_cols_names(0, 255, pd.UInt8Dtype())
    get_cols_names(256, 65535, pd.UInt16Dtype())
    get_cols_names(65536, 4294967295, pd.UInt32Dtype())
    get_cols_names(-128, 127, pd.Int8Dtype())
    get_cols_names(-32768, 32767, pd.Int16Dtype())
    get_cols_names(-2147483648, 2147483647, pd.Int32Dtype())

    # date and catagorical datacols
    catagoriacal_cols = list(df.select_dtypes("O").columns)
    date_cols = []
    for i in catagoriacal_cols:
        x = df[i][~df[i].isna()].head()
        try:
            pd.to_datetime(x)
        except Exception:
            # Not parseable as dates: keep it as a categorical candidate.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
        date_cols.append(i)

    catagoriacal_cols = [i for i in catagoriacal_cols if i not in date_cols]
    c = df[catagoriacal_cols].apply(lambda x:x.nunique()/len(df)*100)
    for i in c[c<5].index:
        d[i] = "category"

    for i in d:
        df[i] = df[i].astype(d[i])

    # memory per column in MiB after conversion
    new = df.memory_usage() / 1024/1024
    m = pd.DataFrame({"new" : new, "old" : old, "Imporovement" : old - new})
    # The leading None lines up with the "Index" row of memory_usage().
    m['Dtype'] = [None] + list(df[list(new.index.drop("Index"))].dtypes.astype(str).values)
    c = df[catagoriacal_cols].apply(lambda x:x.nunique()/len(df)*100)
    m["nunique"] = None
    m.loc[c.index, "nunique"] = list(df[c.index].apply(lambda x:x.nunique() / len(df) * 100).values)

    print("Before :", round(m.old.sum()))
    print("After  :", round(m.new.sum()))
    print("Diff   :", round(m.Imporovement.sum()))
    print("Diff % :", round(m.Imporovement.sum()/m.old.sum(), 2))
    print("\n\nImprovement:")
    print(m.groupby("Dtype").Imporovement.agg([min, max, sum, np.mean, np.median, "count"]))
    print("\n\nDetailed Summary:")
    print(m.to_string())
    return df
'float16': (parquet_thrift.Type.FLOAT, None, 16), } revmap = { parquet_thrift.Type.INT32: np.int32, parquet_thrift.Type.INT64: np.int64, parquet_thrift.Type.FLOAT: np.float32, parquet_thrift.Type.DOUBLE: np.float64 } pdoptional_to_numpy_typemap = { pd.Int8Dtype(): np.int8, pd.Int16Dtype(): np.int16, pd.Int32Dtype(): np.int32, pd.Int64Dtype(): np.int64, pd.UInt8Dtype(): np.uint8, pd.UInt16Dtype(): np.uint16, pd.UInt32Dtype(): np.uint32, pd.UInt64Dtype(): np.uint64, pd.BooleanDtype(): np.bool } def find_type(data, fixed_text=None, object_encoding=None, times='int64'): """ Get appropriate typecodes for column dtype Data conversion do not happen here, see convert(). The user is expected to transform their data into the appropriate dtype before saving to parquet, we will not make any assumptions for them.
np.int64: SqlTypeName.BIGINT, pd.Int64Dtype(): SqlTypeName.BIGINT, np.int32: SqlTypeName.INTEGER, pd.Int32Dtype(): SqlTypeName.INTEGER, np.int16: SqlTypeName.SMALLINT, pd.Int16Dtype(): SqlTypeName.SMALLINT, np.int8: SqlTypeName.TINYINT, pd.Int8Dtype(): SqlTypeName.TINYINT, np.uint64: SqlTypeName.BIGINT, pd.UInt64Dtype(): SqlTypeName.BIGINT, np.uint32: SqlTypeName.INTEGER, pd.UInt32Dtype(): SqlTypeName.INTEGER, np.uint16: SqlTypeName.SMALLINT, pd.UInt16Dtype(): SqlTypeName.SMALLINT, np.uint8: SqlTypeName.TINYINT, pd.UInt8Dtype(): SqlTypeName.TINYINT, np.bool8: SqlTypeName.BOOLEAN, pd.BooleanDtype(): SqlTypeName.BOOLEAN, np.object_: SqlTypeName.VARCHAR, pd.StringDtype(): SqlTypeName.VARCHAR, np.datetime64: SqlTypeName.TIMESTAMP, } # Default mapping between SQL types and python types # for values _SQL_TO_PYTHON_SCALARS = { "DOUBLE": np.float64, "FLOAT": np.float32, "DECIMAL": np.float32, "BIGINT": np.int64, "INTEGER": np.int32,
np.longlong: pa.int64(), np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), np.bool_: pa.int8(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), np.uint8: pa.uint8(), np.datetime64: pa.date64(), np.object_: pa.string(), np.str_: pa.string(), } cudf_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), np.dtype("uint64"): pd.UInt64Dtype(), np.dtype("int8"): pd.Int8Dtype(), np.dtype("int16"): pd.Int16Dtype(), np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(), np.dtype("object"): pd.StringDtype(), } pyarrow_dtypes_to_pandas_dtypes = { pa.uint8(): pd.UInt8Dtype(), pa.uint16(): pd.UInt16Dtype(), pa.uint32(): pd.UInt32Dtype(),
class UINT8(UINT16):
    """Semantic representation of a :class:`pandas.UInt8Dtype`."""

    # pandas extension dtype instance that this class stands for
    type = pd.UInt8Dtype()
    # width of the integer in bits
    bit_width: int = 8
"""Semantic representation of a :class:`pandas.UInt32Dtype`.""" type = pd.UInt32Dtype() bit_width: int = 32 @Engine.register_dtype(equivalents=[pd.UInt16Dtype, pd.UInt16Dtype()]) @immutable class UINT16(UINT32): """Semantic representation of a :class:`pandas.UInt16Dtype`.""" type = pd.UInt16Dtype() bit_width: int = 16 @Engine.register_dtype(equivalents=[pd.UInt8Dtype, pd.UInt8Dtype()]) @immutable class UINT8(UINT16): """Semantic representation of a :class:`pandas.UInt8Dtype`.""" type = pd.UInt8Dtype() bit_width: int = 8 # ############################################################################### # # float # ############################################################################### _register_numpy_numbers( builtin_name="float", pandera_name="Float",
np.longlong: pa.int64(), np.int32: pa.int32(), np.int16: pa.int16(), np.int8: pa.int8(), np.bool_: pa.int8(), np.uint64: pa.uint64(), np.uint32: pa.uint32(), np.uint16: pa.uint16(), np.uint8: pa.uint8(), np.datetime64: pa.date64(), np.object_: pa.string(), np.str_: pa.string(), } cudf_dtypes_to_pandas_dtypes = { np.dtype("uint8"): pd.UInt8Dtype(), np.dtype("uint16"): pd.UInt16Dtype(), np.dtype("uint32"): pd.UInt32Dtype(), np.dtype("uint64"): pd.UInt64Dtype(), np.dtype("int8"): pd.Int8Dtype(), np.dtype("int16"): pd.Int16Dtype(), np.dtype("int32"): pd.Int32Dtype(), np.dtype("int64"): pd.Int64Dtype(), np.dtype("bool_"): pd.BooleanDtype(), np.dtype("object"): pd.StringDtype(), } SIGNED_INTEGER_TYPES = {"int8", "int16", "int32", "int64"} UNSIGNED_TYPES = {"uint8", "uint16", "uint32", "uint64"} INTEGER_TYPES = SIGNED_INTEGER_TYPES | UNSIGNED_TYPES FLOAT_TYPES = {"float32", "float64"}
parquet_thrift.ConvertedType.INT_8: np.dtype("int8"), parquet_thrift.ConvertedType.INT_16: np.dtype("int16"), parquet_thrift.ConvertedType.INT_32: np.dtype('int32'), parquet_thrift.ConvertedType.INT_64: np.dtype('int64'), parquet_thrift.ConvertedType.TIME_MILLIS: np.dtype('<m8[ns]'), parquet_thrift.ConvertedType.DATE: np.dtype('<M8[ns]'), parquet_thrift.ConvertedType.TIMESTAMP_MILLIS: np.dtype('<M8[ns]'), parquet_thrift.ConvertedType.TIME_MICROS: np.dtype('<m8[ns]'), parquet_thrift.ConvertedType.TIMESTAMP_MICROS: np.dtype('<M8[ns]') } nullable = { np.dtype('int8'): pd.Int8Dtype(), np.dtype('int16'): pd.Int16Dtype(), np.dtype('int32'): pd.Int32Dtype(), np.dtype('int64'): pd.Int64Dtype(), np.dtype('uint8'): pd.UInt8Dtype(), np.dtype('uint16'): pd.UInt16Dtype(), np.dtype('uint32'): pd.UInt32Dtype(), np.dtype('uint64'): pd.UInt64Dtype(), np.dtype('bool'): pd.BooleanDtype() } pandas_nullable = { "Int8": pd.Int8Dtype(), "Int16": pd.Int16Dtype(), "Int32": pd.Int32Dtype(), "Int64": pd.Int64Dtype(), "UInt8": pd.UInt8Dtype(), "UInt16": pd.UInt16Dtype(), "UInt32": pd.UInt32Dtype(), "UInt64": pd.UInt64Dtype(), "boolean": pd.BooleanDtype()
"int64": "Int64", "int32": "Int32", "int16": "Int16", "int8": "Int8", "datetime64[D]": "Date", "datetime64[ns]": "DateTime", } PD2CH = keymap(np.dtype, MAPPING) PD_INT_TYPES = [ pd.Int8Dtype(), pd.Int16Dtype(), pd.Int32Dtype(), pd.Int64Dtype(), pd.UInt8Dtype(), pd.UInt16Dtype(), pd.UInt32Dtype(), pd.UInt64Dtype(), ] for typ in PD_INT_TYPES: PD2CH[typ] = f"Nullable({typ.name})" CH2PD = itemmap(reversed, MAPPING) CH2PD["Null"] = "object" CH2PD["Nothing"] = "object" NULLABLE_COLS = [ "Float64", "Float32",
([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]], {}, {},), ], ) def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs): pyarrow_data = pa.array(data, **pyarrow_kwargs) cudf_from_pyarrow = as_column(pyarrow_data) expected = as_column(data, **cudf_kwargs) assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected)) @pytest.mark.parametrize( "pd_dtype,expect_dtype", [ # TODO: Nullable float is coming (pd.StringDtype(), np.dtype("O")), (pd.UInt8Dtype(), np.dtype("uint8")), (pd.UInt16Dtype(), np.dtype("uint16")), (pd.UInt32Dtype(), np.dtype("uint32")), (pd.UInt64Dtype(), np.dtype("uint64")), (pd.Int8Dtype(), np.dtype("int8")), (pd.Int16Dtype(), np.dtype("int16")), (pd.Int32Dtype(), np.dtype("int32")), (pd.Int64Dtype(), np.dtype("int64")), (pd.BooleanDtype(), np.dtype("bool")), ], ) def test_build_df_from_nullable_pandas_dtype(pd_dtype, expect_dtype): if pd_dtype == pd.StringDtype(): data = ["a", pd.NA, "c", pd.NA, "e"] elif pd_dtype == pd.BooleanDtype(): data = [True, pd.NA, False, pd.NA, True]